Hackage: Implement incremental mode
Use the HTTP API ``lastUpload`` argument in the search query to retrieve new or updated origins since the last run. Related to T4597.
This commit is contained in:
parent
6ad61aec23
commit
065b3f81a1
5 changed files with 170 additions and 23 deletions
|
@ -20,7 +20,7 @@ Origins retrieving strategy
|
|||
---------------------------
|
||||
|
||||
To get a list of all package names we make a POST call to
|
||||
`https://hackage.haskell.org/packages/search` endpoint with some params given as
|
||||
``https://hackage.haskell.org/packages/search`` endpoint with some params given as
|
||||
json data.
|
||||
|
||||
Default params::
|
||||
|
@ -35,6 +35,10 @@ Default params::
|
|||
The page size is 50. The lister will make as many HTTP API calls as needed to get
|
||||
all results.
|
||||
|
||||
For incremental mode we expand the search query with ``lastUpload`` greater than or equal to the (UTC) date of
|
||||
``state.last_listing_date``, the api will return all new or updated package names since
|
||||
last run.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
||||
|
@ -60,7 +64,7 @@ Origins from page
|
|||
-----------------
|
||||
|
||||
The lister yields 50 origin URLs per page.
|
||||
Each ListedOrigin has a `last_update` date set.
|
||||
Each ListedOrigin has a ``last_update`` date set.
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
|
||||
|
@ -11,7 +13,7 @@ import iso8601
|
|||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
from ..pattern import CredentialsType, Lister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -19,7 +21,15 @@ logger = logging.getLogger(__name__)
|
|||
HackageListerPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class HackageLister(StatelessLister[HackageListerPage]):
|
||||
@dataclass
class HackageListerState:
    """State kept by the Hackage lister to support incremental listing.

    Attributes:
        last_listing_date: last date when the Hackage lister was executed;
            defaults to ``None``.
    """

    # Last date when Hackage lister was executed
    last_listing_date: Optional[datetime] = None
|
||||
|
||||
|
||||
class HackageLister(Lister[HackageListerState, HackageListerPage]):
|
||||
"""List Hackage (The Haskell Package Repository) origins."""
|
||||
|
||||
LISTER_NAME = "hackage"
|
||||
|
@ -45,6 +55,20 @@ class HackageLister(StatelessLister[HackageListerPage]):
|
|||
# Ensure to set this with same value as the http api search endpoint use
|
||||
# (50 as of august 2022)
|
||||
self.page_size: int = 50
|
||||
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
    """Build a ``HackageListerState`` from its dict representation.

    Args:
        d: mapping of state field names to values. When
            ``last_listing_date`` is present and not ``None`` it is parsed
            with ``iso8601.parse_date``.

    Returns:
        The state built from the converted mapping.
    """
    # Convert on a shallow copy so the caller's dict is never mutated.
    fields = dict(d)
    raw_date = fields.get("last_listing_date")
    if raw_date is not None:
        fields["last_listing_date"] = iso8601.parse_date(raw_date)
    return HackageListerState(**fields)
|
||||
|
||||
def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
    """Turn ``state`` into a plain dict.

    Args:
        state: the lister state to convert.

    Returns:
        A dict with a single ``last_listing_date`` key holding the date in
        ISO format, or ``None`` when the state has no date.
    """
    listed_at = state.last_listing_date
    serialized: Dict[str, Optional[str]] = {
        "last_listing_date": None if listed_at is None else listed_at.isoformat()
    }
    return serialized
|
||||
|
||||
def get_pages(self) -> Iterator[HackageListerPage]:
|
||||
"""Yield an iterator which returns 'page'
|
||||
|
@ -54,11 +78,24 @@ class HackageLister(StatelessLister[HackageListerPage]):
|
|||
|
||||
Results are paginated.
|
||||
"""
|
||||
# Search query
|
||||
sq = "(deprecated:any)"
|
||||
|
||||
if self.state.last_listing_date:
|
||||
last_str = (
|
||||
self.state.last_listing_date.astimezone(tz=timezone.utc)
|
||||
.date()
|
||||
.isoformat()
|
||||
)
|
||||
|
||||
# Incremental mode search query
|
||||
sq += "(lastUpload >= %s)" % last_str
|
||||
|
||||
params = {
|
||||
"page": 0,
|
||||
"sortColumn": "default",
|
||||
"sortDirection": "ascending",
|
||||
"searchQuery": "(deprecated:any)",
|
||||
"searchQuery": sq,
|
||||
}
|
||||
|
||||
data = self.http_request(
|
||||
|
@ -67,20 +104,22 @@ class HackageLister(StatelessLister[HackageListerPage]):
|
|||
json=params,
|
||||
).json()
|
||||
|
||||
nb_entries: int = data["numberOfResults"]
|
||||
(nb_pages, remainder) = divmod(nb_entries, self.page_size)
|
||||
if remainder:
|
||||
nb_pages += 1
|
||||
yield data["pageContents"]
|
||||
|
||||
for page in range(1, nb_pages):
|
||||
params["page"] = page
|
||||
data = self.http_request(
|
||||
url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
|
||||
method="POST",
|
||||
json=params,
|
||||
).json()
|
||||
if data.get("pageContents"):
|
||||
nb_entries: int = data["numberOfResults"]
|
||||
(nb_pages, remainder) = divmod(nb_entries, self.page_size)
|
||||
if remainder:
|
||||
nb_pages += 1
|
||||
# First page
|
||||
yield data["pageContents"]
|
||||
# Next pages
|
||||
for page in range(1, nb_pages):
|
||||
params["page"] = page
|
||||
data = self.http_request(
|
||||
url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
|
||||
method="POST",
|
||||
json=params,
|
||||
).json()
|
||||
yield data["pageContents"]
|
||||
|
||||
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
|
||||
"""Iterate on all pages and yield ListedOrigin instances."""
|
||||
|
@ -92,9 +131,14 @@ class HackageLister(StatelessLister[HackageListerPage]):
|
|||
url = self.PACKAGE_INFO_URL_PATTERN.format(
|
||||
base_url=self.url, pkgname=pkgname
|
||||
)
|
||||
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
visit_type=self.VISIT_TYPE,
|
||||
url=url,
|
||||
last_update=last_update,
|
||||
)
|
||||
|
||||
def finalize(self) -> None:
    """Store this run's listing date in the state and flag it as updated."""
    current_state = self.state
    current_state.last_listing_date = self.listing_date
    self.updated = True
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]}
|
|
@ -0,0 +1 @@
|
|||
{"numberOfResults":0,"pageContents":[]}
|
|
@ -8,25 +8,31 @@ import json
|
|||
from pathlib import Path
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
from swh.lister.hackage.lister import HackageLister
|
||||
import iso8601
|
||||
|
||||
from swh.lister.hackage.lister import HackageLister, HackageListerState
|
||||
|
||||
|
||||
def json_callback(request, context, datadir):
|
||||
def json_callback(request, context, datadir, visit=0):
|
||||
"""Callback for requests_mock that load a json file regarding a page number"""
|
||||
page = request.json()["page"]
|
||||
|
||||
unquoted_url = unquote(request.url)
|
||||
url = urlparse(unquoted_url)
|
||||
page = request.json()["page"]
|
||||
|
||||
dirname = "%s_%s" % (url.scheme, url.hostname)
|
||||
filename = url.path[1:]
|
||||
if filename.endswith("/"):
|
||||
filename = filename[:-1]
|
||||
filename = filename.replace("/", "_")
|
||||
filepath = Path(datadir, dirname, f"{filename}_{page}")
|
||||
|
||||
return json.loads(Path(datadir, dirname, f"{filename}_{page}").read_text())
|
||||
if visit > 0:
|
||||
filepath = filepath.parent / f"{filepath.stem}_visit{visit}"
|
||||
return json.loads(filepath.read_text())
|
||||
|
||||
|
||||
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
|
||||
"""Assert a full listing of 3 pages of 50 origins"""
|
||||
|
||||
requests_mock.post(
|
||||
url="https://hackage.haskell.org/packages/search",
|
||||
|
@ -74,6 +80,10 @@ def test_hackage_lister(swh_scheduler, requests_mock, datadir):
|
|||
|
||||
|
||||
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
|
||||
"""Test Pagination
|
||||
|
||||
Page size is 50, lister returns 1 page when origins < page size
|
||||
"""
|
||||
requests_mock.post(
|
||||
url="https://fake49.haskell.org/packages/search",
|
||||
status_code=200,
|
||||
|
@ -87,6 +97,10 @@ def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
|
|||
|
||||
|
||||
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
|
||||
"""Test Pagination
|
||||
|
||||
Page size is 50, lister returns 2 page when origins > page size
|
||||
"""
|
||||
requests_mock.post(
|
||||
url="https://fake51.haskell.org/packages/search",
|
||||
status_code=200,
|
||||
|
@ -98,3 +112,86 @@ def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
|
|||
assert len(pages) == 2
|
||||
assert len(pages[0]) == 50
|
||||
assert len(pages[1]) == 1
|
||||
|
||||
|
||||
def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
    """Run the lister three times in a row against mocked search results.

    * First run: state date forced to 2022-08-26, 3 pages, 150 origins
    * Second run: state date forced to 2022-09-30, 1 page, 3 origins
    * Third run: no state forced, 0 page, 0 origins
    """
    search_url = "https://hackage.haskell.org/packages/search"
    first_query = "(deprecated:any)(lastUpload >= 2022-08-26)"
    second_query = "(deprecated:any)(lastUpload >= 2022-09-30)"

    def serve_fixtures(**callback_kwargs):
        # (Re)register the POST mock; keyword arguments are forwarded to
        # json_callback to select the fixture files of a given visit.
        requests_mock.post(
            url=search_url,
            status_code=200,
            json=functools.partial(json_callback, datadir=datadir, **callback_kwargs),
        )

    # --- first run ---
    serve_fixtures()
    lister = HackageLister(scheduler=swh_scheduler)
    # Force the stored listing date to something other than 'now'
    first_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
    lister.state.last_listing_date = first_date
    lister.set_state_in_scheduler()
    expected_state = HackageListerState(last_listing_date=first_date)
    assert lister.get_state_from_scheduler() == expected_state

    first = lister.run()
    assert first.pages == 3
    assert first.origins == 3 * 50
    # One http request per page
    assert len(requests_mock.request_history) == 3
    for sent in requests_mock.request_history:
        assert sent.json()["searchQuery"] == first_query

    # --- second run ---
    serve_fixtures(visit=1)
    lister = HackageLister(scheduler=swh_scheduler)
    # Force the stored listing date to something other than 'now'
    second_date = iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
    lister.state.last_listing_date = second_date
    lister.set_state_in_scheduler()
    expected_state = HackageListerState(last_listing_date=second_date)
    assert lister.get_state_from_scheduler() == expected_state

    second = lister.run()
    assert second.pages == 1
    assert second.origins == 3

    assert len(requests_mock.request_history) == 3 + 1
    # The first three requests are the ones recorded during the first run
    for sent in requests_mock.request_history[:3]:
        assert sent.json()["searchQuery"] == first_query
    # The latest request carries the lastUpload date of the second run
    assert requests_mock.last_request.json()["searchQuery"] == second_query

    # --- third run: a single http request that returns no results ---
    serve_fixtures(visit=2)
    lister = HackageLister(scheduler=swh_scheduler)
    third = lister.run()

    assert third.pages == 0
    assert third.origins == 0
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=lister.state.last_listing_date
    )
    assert len(requests_mock.request_history) == 3 + 1 + 1
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue