feat(hex): Use updated_after search query

This commit is contained in:
KShivendu 2023-02-01 12:38:08 +05:30 committed by Kumar Shivendu
parent a452995d95
commit cfd9a693aa
5 changed files with 88 additions and 32 deletions

View file

@ -13,11 +13,14 @@ following Python modules:
- `swh.lister.cgit`
- `swh.lister.cran`
- `swh.lister.debian`
- `swh.lister.fedora`
- `swh.lister.gitea`
- `swh.lister.github`
- `swh.lister.gitlab`
- `swh.lister.gnu`
- `swh.lister.gogs`
- `swh.lister.golang`
- `swh.lister.hex`
- `swh.lister.launchpad`
- `swh.lister.maven`
- `swh.lister.npm`
@ -25,9 +28,6 @@ following Python modules:
- `swh.lister.phabricator`
- `swh.lister.pypi`
- `swh.lister.tuleap`
- `swh.lister.gogs`
- `swh.lister.fedora`
- `swh.lister.hex`
Dependencies
------------

View file

@ -65,12 +65,15 @@ setup(
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.fedora=swh.lister.fedora:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
lister.golang=swh.lister.golang:register
lister.gogs=swh.lister.gogs:register
lister.hackage=swh.lister.hackage:register
lister.hex=swh.lister.hex:register
lister.launchpad=swh.lister.launchpad:register
lister.nixguix=swh.lister.nixguix:register
lister.npm=swh.lister.npm:register
@ -85,9 +88,6 @@ setup(
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
lister.maven=swh.lister.maven:register
lister.gogs=swh.lister.gogs:register
lister.fedora=swh.lister.fedora:register
lister.hex=swh.lister.hex:register
""",
classifiers=[
"Programming Language :: Python :: 3",

View file

@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
from datetime import datetime
import logging
from typing import Any, Dict, Iterator, List
from urllib.parse import urljoin
@ -32,10 +33,12 @@ class HexListerState:
"""Id of the last page listed on an incremental pass"""
last_pkg_name: str = ""
"""Name of the last package inserted at on an incremental pass"""
last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat()
"""updated_at value of the last seen package on an incremental pass"""
class HexLister(Lister[HexListerState, HexListerPage]):
"""List origins from the "Hex" forge."""
"""List origins from the Hex.pm"""
LISTER_NAME = "hex"
VISIT_TYPE = "hex"
@ -73,12 +76,19 @@ class HexLister(Lister[HexListerState, HexListerPage]):
url = urljoin(self.url, self.PACKAGES_PATH)
while page_id is not None:
logger.debug(
"Fetching URL %s with page_id = %s and updated_after = %s",
url,
page_id,
self.state.last_updated_at,
)
body = self.http_request(
url,
params={
"page": page_id,
"sort": "name",
}, # sort=name is actually the default
"search": f"updated_after:{self.state.last_updated_at}",
},
).json()
yield body
@ -116,15 +126,23 @@ class HexLister(Lister[HexListerState, HexListerPage]):
return
last_pkg_name = page[-1]["name"]
# incoming page should have alphabetically greater
# last package name than the one stored in the state
if last_pkg_name > self.state.last_pkg_name:
last_updated_at = page[-1]["updated_at"]
# TODO: Think more about 2nd condition:
if (
iso8601.parse_date(last_updated_at)
> iso8601.parse_date(self.state.last_updated_at)
and last_pkg_name != self.state.last_pkg_name
and len(page) > 0
):
self.state.last_pkg_name = last_pkg_name
self.state.last_page_id += 1
self.state.last_updated_at = last_updated_at
def finalize(self) -> None:
scheduler_state = self.get_state_from_scheduler()
if self.state.last_page_id > scheduler_state.last_page_id:
# Mark the lister as updated only if it finds any updated repos
if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date(
scheduler_state.last_updated_at
):
self.updated = True

View file

@ -13,7 +13,7 @@ from .lister import HexLister
def list_hex_full(
    instance: Optional[str] = None,
) -> Dict[str, int]:
    """Full listing of Hex.pm.

    Args:
        instance: passed through unchanged as the ``instance`` keyword of
            ``HexLister.from_configfile``.

    Returns:
        Whatever ``lister.run().dict()`` yields for the completed run.
    """
    # Only one docstring is kept: a second consecutive string literal would be
    # a no-op expression and the first (outdated) one would win as __doc__.
    lister = HexLister.from_configfile(instance=instance)
    return lister.run().dict()

View file

@ -1,8 +1,10 @@
import json
from pathlib import Path
from typing import List
from typing import List, Optional
from urllib.parse import quote
import pytest
from requests import HTTPError
from swh.lister.hex.lister import HexLister, ListedOrigin
from swh.scheduler.interface import SchedulerInterface
@ -25,10 +27,27 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO
assert set(lister_urls) == {origin.url for origin in scheduler_origins}
@pytest.fixture
def mock_hexpm_page(requests_mock):
    """Fixture returning a helper that mocks one Hex.pm package-listing page.

    The returned callable registers a GET mock on ``requests_mock`` for the
    ``https://hex.pm/api/packages/`` URL carrying the given ``page`` number and
    an ``updated_after:<timestamp>`` search term.
    """

    def register_page(
        page_id: int,
        updated_after: str,
        body: Optional[List[dict]],
        status_code: int = 200,
    ):
        # The search term is percent-encoded before being placed in the URL.
        encoded_search = quote(f"updated_after:{updated_after}")
        requests_mock.get(
            f"https://hex.pm/api/packages/?page={page_id}&search={encoded_search}",
            json=body,
            # Passed so the mock is tied to this exact query string
            # (see the requests-mock documentation for complete_qs).
            complete_qs=True,
            status_code=status_code,
        )

    return register_page
def test_full_lister_hex(
swh_scheduler: SchedulerInterface,
requests_mock,
hexpm_page,
mock_hexpm_page,
):
"""
Simulate a full listing of packages for hex (erlang package manager)
@ -37,10 +56,10 @@ def test_full_lister_hex(
p2_origin_urls, p2_json = hexpm_page(2)
p3_origin_urls, p3_json = hexpm_page(3)
requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json)
requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json)
requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json)
requests_mock.get("https://hex.pm/api/packages/?page=4", json=[])
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
lister = HexLister(swh_scheduler)
@ -60,9 +79,9 @@ def test_full_lister_hex(
assert lister.updated
def test_gogs_incremental_lister(
def test_hex_incremental_lister(
swh_scheduler,
requests_mock,
mock_hexpm_page,
hexpm_page,
):
lister = HexLister(swh_scheduler)
@ -71,9 +90,9 @@ def test_gogs_incremental_lister(
p1_origin_urls, p1_json = hexpm_page(1)
p2_origin_urls, p2_json = hexpm_page(2)
requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json)
requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json)
requests_mock.get("https://hex.pm/api/packages/?page=3", json=[])
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", [])
stats = lister.run()
@ -94,9 +113,9 @@ def test_gogs_incremental_lister(
# Second run: P3 isn't empty anymore
p3_origin_urls, p3_json = hexpm_page(3)
requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json)
requests_mock.get(
"https://hex.pm/api/packages/?page=4", json=[]
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
mock_hexpm_page(
4, "2022-09-09T21:00:14.993273Z", []
) # TODO: Try with 40x/50x here?
stats = lister.run()
@ -125,9 +144,7 @@ def test_gogs_incremental_lister(
stats = lister.run()
assert stats.pages == 1
assert (
stats.origins == 0
) # FIXME: inconsistent with Gogs lister. Either of them could be wrong
assert stats.origins == 0 # FIXME: inconsistent with Gogs lister
lister_state = lister.get_state_from_scheduler()
assert (
@ -139,3 +156,24 @@ def test_gogs_incremental_lister(
check_listed_origins(
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
)
@pytest.mark.parametrize("http_code", [400, 500])
def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_page):
    """Check that an HTTP error on the second page aborts the listing.

    Page 1 is served normally, page 2 answers with the parametrized error
    status, and page 3 is registered as well. The run must raise ``HTTPError``
    and the scheduler must only hold the origins of page 1.
    """
    lister = HexLister(swh_scheduler)

    # Load the page fixtures: origin URLs are only needed for page 1.
    first_page_urls, first_page_json = hexpm_page(1)
    third_page_json = hexpm_page(3)[1]

    # Register the mocked API pages; page 2 has no body, just the error status.
    mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", first_page_json)
    mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code)
    mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", third_page_json)

    with pytest.raises(HTTPError):
        lister.run()

    # Only the origins from page 1 are expected in the scheduler.
    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    check_listed_origins(first_page_urls, listed)