Hackage: Implement incremental mode

Use http api lastUpload argument in search query to retrieve new or
updated origins since last run

Related to T4597
This commit is contained in:
Franck Bret 2022-10-12 10:08:38 +02:00
parent 6ad61aec23
commit 065b3f81a1
5 changed files with 170 additions and 23 deletions

View file

@ -20,7 +20,7 @@ Origins retrieving strategy
---------------------------
To get a list of all package names we make a POST call to
`https://hackage.haskell.org/packages/search` endpoint with some params given as
``https://hackage.haskell.org/packages/search`` endpoint with some params given as
json data.
Default params::
@ -35,6 +35,10 @@ Default params::
The page size is 50. The lister will make as many http api calls as needed to get
all results.
For incremental mode we expand the search query with ``lastUpload`` greater than
``state.last_listing_date``; the api will then return all new or updated package
names since the last run.
Page listing
------------
@ -60,7 +64,7 @@ Origins from page
-----------------
The lister yields 50 origin urls per page.
Each ListedOrigin has a `last_update` date set.
Each ListedOrigin has a ``last_update`` date set.
Running tests
-------------

View file

@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional
@ -11,7 +13,7 @@ import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -19,7 +21,15 @@ logger = logging.getLogger(__name__)
HackageListerPage = List[Dict[str, Any]]
class HackageLister(StatelessLister[HackageListerPage]):
@dataclass
class HackageListerState:
    """Store lister state for incremental mode operations"""

    # Date of the last successful listing run; None until the first run completes.
    last_listing_date: Optional[datetime] = None
class HackageLister(Lister[HackageListerState, HackageListerPage]):
"""List Hackage (The Haskell Package Repository) origins."""
LISTER_NAME = "hackage"
@ -45,6 +55,20 @@ class HackageLister(StatelessLister[HackageListerPage]):
# Ensure to set this with same value as the http api search endpoint use
# (50 as of august 2022)
self.page_size: int = 50
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
    """Build the lister state from its scheduler-serialized dict form.

    Args:
        d: state dict as stored by the scheduler; ``last_listing_date``,
            when present, is an iso8601 string.

    Returns:
        A ``HackageListerState`` with ``last_listing_date`` parsed back to
        an aware ``datetime``.
    """
    last_listing_date = d.get("last_listing_date")
    if last_listing_date is not None:
        # Work on a copy so the caller's dict is not mutated in place.
        d = {**d, "last_listing_date": iso8601.parse_date(last_listing_date)}
    return HackageListerState(**d)
def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
    """Serialize the lister state to a json-friendly dict for the scheduler."""
    last_listing_date = state.last_listing_date
    return {
        "last_listing_date": (
            last_listing_date.isoformat() if last_listing_date is not None else None
        )
    }
def get_pages(self) -> Iterator[HackageListerPage]:
    """Yield an iterator which returns 'page'

    Results are paginated.
    """
    # Base search query: match every package, deprecated or not.
    sq = "(deprecated:any)"
    if self.state.last_listing_date:
        last_str = (
            self.state.last_listing_date.astimezone(tz=timezone.utc)
            .date()
            .isoformat()
        )
        # Incremental mode search query: only packages uploaded or updated
        # since the last run (day granularity, so same-day origins may be
        # listed again — harmless, listing is idempotent).
        sq += "(lastUpload >= %s)" % last_str
    params = {
        "page": 0,
        "sortColumn": "default",
        "sortDirection": "ascending",
        "searchQuery": sq,
    }
    data = self.http_request(
        url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
        method="POST",
        json=params,
    ).json()
    # An empty "pageContents" means no new or updated origins: yield nothing.
    if data.get("pageContents"):
        nb_entries: int = data["numberOfResults"]
        # Ceiling division: a partial trailing page still counts as a page.
        (nb_pages, remainder) = divmod(nb_entries, self.page_size)
        if remainder:
            nb_pages += 1
        # First page
        yield data["pageContents"]
        # Next pages
        for page in range(1, nb_pages):
            params["page"] = page
            data = self.http_request(
                url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
                method="POST",
                json=params,
            ).json()
            yield data["pageContents"]
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
@ -92,9 +131,14 @@ class HackageLister(StatelessLister[HackageListerPage]):
url = self.PACKAGE_INFO_URL_PATTERN.format(
base_url=self.url, pkgname=pkgname
)
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=last_update,
)
def finalize(self) -> None:
    """Record this run's listing date so the next run lists incrementally."""
    # Flag the state as modified so the scheduler persists it.
    self.updated = True
    self.state.last_listing_date = self.listing_date

View file

@ -0,0 +1 @@
{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]}

View file

@ -0,0 +1 @@
{"numberOfResults":0,"pageContents":[]}

View file

@ -8,25 +8,31 @@ import json
from pathlib import Path
from urllib.parse import unquote, urlparse
from swh.lister.hackage.lister import HackageLister
import iso8601
from swh.lister.hackage.lister import HackageLister, HackageListerState
def json_callback(request, context, datadir, visit=0):
    """Callback for requests_mock that load a json file regarding a page number

    Args:
        request: the intercepted request object (url + json body).
        context: requests_mock response context (unused).
        datadir: base directory holding the json fixtures.
        visit: visit number; when > 0, a ``_visit{n}`` suffixed fixture is
            read instead, so successive runs can return different payloads.

    Returns:
        The parsed json fixture matching the requested url and page number.
    """
    unquoted_url = unquote(request.url)
    url = urlparse(unquoted_url)
    page = request.json()["page"]
    dirname = "%s_%s" % (url.scheme, url.hostname)
    filename = url.path[1:]
    if filename.endswith("/"):
        filename = filename[:-1]
    filename = filename.replace("/", "_")
    # NOTE(review): the fixture-name prefix was garbled to "(unknown)" in this
    # view; `filename` is computed above and otherwise unused, so it is
    # presumably the intended prefix — confirm against the fixture tree.
    filepath = Path(datadir, dirname, f"{filename}_{page}")
    if visit > 0:
        filepath = filepath.parent / f"{filepath.stem}_visit{visit}"
    return json.loads(filepath.read_text())
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
"""Assert a full listing of 3 pages of 50 origins"""
requests_mock.post(
url="https://hackage.haskell.org/packages/search",
@ -74,6 +80,10 @@ def test_hackage_lister(swh_scheduler, requests_mock, datadir):
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
"""Test Pagination
Page size is 50, lister returns 1 page when origins < page size
"""
requests_mock.post(
url="https://fake49.haskell.org/packages/search",
status_code=200,
@ -87,6 +97,10 @@ def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
"""Test Pagination
Page size is 50, lister returns 2 page when origins > page size
"""
requests_mock.post(
url="https://fake51.haskell.org/packages/search",
status_code=200,
@ -98,3 +112,86 @@ def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
assert len(pages) == 2
assert len(pages[0]) == 50
assert len(pages[1]) == 1
def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
    """Test incremental lister

    * First run, full listing, 3 pages, 150 origins
    * Second run, 1 page, 3 new or updated origins
    * Third run, nothing new, 0 page, 0 origins
    """
    mock_url = "https://hackage.haskell.org/packages/search"
    date_run1 = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
    date_run2 = iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
    query_run1 = "(deprecated:any)(lastUpload >= 2022-08-26)"
    query_run2 = "(deprecated:any)(lastUpload >= 2022-09-30)"

    # First run: full listing, 3 pages of 50 origins.
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    # Pin last_listing_date so the search query does not depend on 'now'.
    lister.state.last_listing_date = date_run1
    lister.set_state_in_scheduler()
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=date_run1
    )
    stats = lister.run()
    assert stats.pages == 3
    assert stats.origins == 3 * 50
    # One http request per page.
    assert len(requests_mock.request_history) == 3
    for request in requests_mock.request_history:
        assert request.json()["searchQuery"] == query_run1

    # Second run: one page with 3 new or updated origins.
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir, visit=1),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    # Pin last_listing_date so the search query does not depend on 'now'.
    lister.state.last_listing_date = date_run2
    lister.set_state_in_scheduler()
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=date_run2
    )
    stats = lister.run()
    assert stats.pages == 1
    assert stats.origins == 3
    assert len(requests_mock.request_history) == 3 + 1
    # The first three requests are unchanged from the first run.
    for request in requests_mock.request_history[:3]:
        assert request.json()["searchQuery"] == query_run1
    # The last request carries the second run's lastUpload bound.
    assert requests_mock.last_request.json()["searchQuery"] == query_run2

    # Third run: no update since last run — a single http request returning
    # no results, hence 0 pages and 0 origins.
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir, visit=2),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    stats = lister.run()
    assert stats.pages == 0
    assert stats.origins == 0
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=lister.state.last_listing_date
    )
    assert len(requests_mock.request_history) == 3 + 1 + 1