feat(hex): Use updated_after search query
This commit is contained in:
parent
a452995d95
commit
cfd9a693aa
5 changed files with 88 additions and 32 deletions
|
@ -13,11 +13,14 @@ following Python modules:
|
|||
- `swh.lister.cgit`
|
||||
- `swh.lister.cran`
|
||||
- `swh.lister.debian`
|
||||
- `swh.lister.fedora`
|
||||
- `swh.lister.gitea`
|
||||
- `swh.lister.github`
|
||||
- `swh.lister.gitlab`
|
||||
- `swh.lister.gnu`
|
||||
- `swh.lister.gogs`
|
||||
- `swh.lister.golang`
|
||||
- `swh.lister.hex`
|
||||
- `swh.lister.launchpad`
|
||||
- `swh.lister.maven`
|
||||
- `swh.lister.npm`
|
||||
|
@ -25,9 +28,6 @@ following Python modules:
|
|||
- `swh.lister.phabricator`
|
||||
- `swh.lister.pypi`
|
||||
- `swh.lister.tuleap`
|
||||
- `swh.lister.gogs`
|
||||
- `swh.lister.fedora`
|
||||
- `swh.lister.hex`
|
||||
|
||||
Dependencies
|
||||
------------
|
||||
|
|
6
setup.py
6
setup.py
|
@ -65,12 +65,15 @@ setup(
|
|||
lister.cran=swh.lister.cran:register
|
||||
lister.crates=swh.lister.crates:register
|
||||
lister.debian=swh.lister.debian:register
|
||||
lister.fedora=swh.lister.fedora:register
|
||||
lister.gitea=swh.lister.gitea:register
|
||||
lister.github=swh.lister.github:register
|
||||
lister.gitlab=swh.lister.gitlab:register
|
||||
lister.gnu=swh.lister.gnu:register
|
||||
lister.golang=swh.lister.golang:register
|
||||
lister.gogs=swh.lister.gogs:register
|
||||
lister.hackage=swh.lister.hackage:register
|
||||
lister.hex=swh.lister.hex:register
|
||||
lister.launchpad=swh.lister.launchpad:register
|
||||
lister.nixguix=swh.lister.nixguix:register
|
||||
lister.npm=swh.lister.npm:register
|
||||
|
@ -85,9 +88,6 @@ setup(
|
|||
lister.sourceforge=swh.lister.sourceforge:register
|
||||
lister.tuleap=swh.lister.tuleap:register
|
||||
lister.maven=swh.lister.maven:register
|
||||
lister.gogs=swh.lister.gogs:register
|
||||
lister.fedora=swh.lister.fedora:register
|
||||
lister.hex=swh.lister.hex:register
|
||||
""",
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
# See top-level LICENSE file for more information
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List
|
||||
from urllib.parse import urljoin
|
||||
|
@ -32,10 +33,12 @@ class HexListerState:
|
|||
"""Id of the last page listed on an incremental pass"""
|
||||
last_pkg_name: str = ""
|
||||
"""Name of the last package inserted on an incremental pass"""
|
||||
last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat()
|
||||
"""updated_at value of the last seen package on an incremental pass"""
|
||||
|
||||
|
||||
class HexLister(Lister[HexListerState, HexListerPage]):
|
||||
"""List origins from the "Hex" forge."""
|
||||
"""List origins from Hex.pm"""
|
||||
|
||||
LISTER_NAME = "hex"
|
||||
VISIT_TYPE = "hex"
|
||||
|
@ -73,12 +76,19 @@ class HexLister(Lister[HexListerState, HexListerPage]):
|
|||
url = urljoin(self.url, self.PACKAGES_PATH)
|
||||
|
||||
while page_id is not None:
|
||||
logger.debug(
|
||||
"Fetching URL %s with page_id = %s and updated_after = %s",
|
||||
url,
|
||||
page_id,
|
||||
self.state.last_updated_at,
|
||||
)
|
||||
|
||||
body = self.http_request(
|
||||
url,
|
||||
params={
|
||||
"page": page_id,
|
||||
"sort": "name",
|
||||
}, # sort=name is actually the default
|
||||
"search": f"updated_after:{self.state.last_updated_at}",
|
||||
},
|
||||
).json()
|
||||
|
||||
yield body
|
||||
|
@ -116,15 +126,23 @@ class HexLister(Lister[HexListerState, HexListerPage]):
|
|||
return
|
||||
|
||||
last_pkg_name = page[-1]["name"]
|
||||
|
||||
# incoming page should have alphabetically greater
|
||||
# last package name than the one stored in the state
|
||||
if last_pkg_name > self.state.last_pkg_name:
|
||||
last_updated_at = page[-1]["updated_at"]
|
||||
# TODO: Think more about 2nd condition:
|
||||
if (
|
||||
iso8601.parse_date(last_updated_at)
|
||||
> iso8601.parse_date(self.state.last_updated_at)
|
||||
and last_pkg_name != self.state.last_pkg_name
|
||||
and len(page) > 0
|
||||
):
|
||||
self.state.last_pkg_name = last_pkg_name
|
||||
self.state.last_page_id += 1
|
||||
self.state.last_updated_at = last_updated_at
|
||||
|
||||
def finalize(self) -> None:
|
||||
scheduler_state = self.get_state_from_scheduler()
|
||||
|
||||
if self.state.last_page_id > scheduler_state.last_page_id:
|
||||
# Mark the lister as updated only if it finds any updated repos
|
||||
if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date(
|
||||
scheduler_state.last_updated_at
|
||||
):
|
||||
self.updated = True
|
||||
|
|
|
@ -13,7 +13,7 @@ from .lister import HexLister
|
|||
def list_hex_full(
|
||||
instance: Optional[str] = None,
|
||||
) -> Dict[str, int]:
|
||||
"""Full update of a Hex.pm instance"""
|
||||
"""Full listing of Hex.pm"""
|
||||
lister = HexLister.from_configfile(instance=instance)
|
||||
return lister.run().dict()
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
import pytest
|
||||
from requests import HTTPError
|
||||
|
||||
from swh.lister.hex.lister import HexLister, ListedOrigin
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -25,10 +27,27 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO
|
|||
assert set(lister_urls) == {origin.url for origin in scheduler_origins}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_hexpm_page(requests_mock):
|
||||
def func(
|
||||
page_id: int,
|
||||
updated_after: str,
|
||||
body: Optional[List[dict]],
|
||||
status_code: int = 200,
|
||||
):
|
||||
search_query = quote(f"updated_after:{updated_after}")
|
||||
page_url = f"https://hex.pm/api/packages/?page={page_id}&search={search_query}"
|
||||
requests_mock.get(
|
||||
page_url, json=body, complete_qs=True, status_code=status_code
|
||||
)
|
||||
|
||||
return func
|
||||
|
||||
|
||||
def test_full_lister_hex(
|
||||
swh_scheduler: SchedulerInterface,
|
||||
requests_mock,
|
||||
hexpm_page,
|
||||
mock_hexpm_page,
|
||||
):
|
||||
"""
|
||||
Simulate a full listing of packages for hex (erlang package manager)
|
||||
|
@ -37,10 +56,10 @@ def test_full_lister_hex(
|
|||
p2_origin_urls, p2_json = hexpm_page(2)
|
||||
p3_origin_urls, p3_json = hexpm_page(3)
|
||||
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json)
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json)
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json)
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=4", json=[])
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
|
||||
|
||||
lister = HexLister(swh_scheduler)
|
||||
|
||||
|
@ -60,9 +79,9 @@ def test_full_lister_hex(
|
|||
assert lister.updated
|
||||
|
||||
|
||||
def test_gogs_incremental_lister(
|
||||
def test_hex_incremental_lister(
|
||||
swh_scheduler,
|
||||
requests_mock,
|
||||
mock_hexpm_page,
|
||||
hexpm_page,
|
||||
):
|
||||
lister = HexLister(swh_scheduler)
|
||||
|
@ -71,9 +90,9 @@ def test_gogs_incremental_lister(
|
|||
p1_origin_urls, p1_json = hexpm_page(1)
|
||||
p2_origin_urls, p2_json = hexpm_page(2)
|
||||
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json)
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json)
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=3", json=[])
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", [])
|
||||
|
||||
stats = lister.run()
|
||||
|
||||
|
@ -94,9 +113,9 @@ def test_gogs_incremental_lister(
|
|||
# Second run: P3 isn't empty anymore
|
||||
p3_origin_urls, p3_json = hexpm_page(3)
|
||||
|
||||
requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json)
|
||||
requests_mock.get(
|
||||
"https://hex.pm/api/packages/?page=4", json=[]
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page(
|
||||
4, "2022-09-09T21:00:14.993273Z", []
|
||||
) # TODO: Try with 40x/50x here?
|
||||
|
||||
stats = lister.run()
|
||||
|
@ -125,9 +144,7 @@ def test_gogs_incremental_lister(
|
|||
stats = lister.run()
|
||||
|
||||
assert stats.pages == 1
|
||||
assert (
|
||||
stats.origins == 0
|
||||
) # FIXME: inconsistent with Gogs lister. Either of them could be wrong
|
||||
assert stats.origins == 0 # FIXME: inconsistent with Gogs lister
|
||||
|
||||
lister_state = lister.get_state_from_scheduler()
|
||||
assert (
|
||||
|
@ -139,3 +156,24 @@ def test_gogs_incremental_lister(
|
|||
check_listed_origins(
|
||||
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("http_code", [400, 500])
|
||||
def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_page):
|
||||
"""Test handling of some HTTP errors commonly encountered"""
|
||||
lister = HexLister(swh_scheduler)
|
||||
|
||||
# P1 returns its origins, P2 responds with the HTTP error under test; P3 is mocked but should never be listed
|
||||
p1_origin_urls, p1_json = hexpm_page(1)
|
||||
_, p3_json = hexpm_page(3)
|
||||
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
|
||||
with pytest.raises(HTTPError):
|
||||
lister.run()
|
||||
|
||||
# Only P1 should be listed
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
check_listed_origins(p1_origin_urls, scheduler_origins)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue