fix(hex): Use only updated_after for pagination

KShivendu 2023-02-05 15:32:17 +05:30 committed by Kumar Shivendu
parent ac9993a001
commit 9095bbec00
4 changed files with 46 additions and 60 deletions
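In outline, pagination now relies solely on hex.pm's `updated_after` search filter: each request passes the `updated_at` timestamp of the newest package committed so far, and listing stops at the first empty page. A condensed sketch of the loop introduced below (not the full method; `http_request`, `PACKAGES_PATH` and the lister state come from the lister itself):

    def get_pages(self):
        url = urljoin(self.url, self.PACKAGES_PATH)
        while True:
            # No page number: only `updated_after` advances between requests,
            # via the state committed after each yielded page.
            body = self.http_request(
                url,
                params={"search": f"updated_after:{self.state.page_updated_at}"},
            ).json()
            yield body
            if len(body) == 0:
                break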


@@ -5,6 +5,14 @@
 import os

+import pytest
+
 pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"]

 os.environ["LC_ALL"] = "C.UTF-8"
+
+
+@pytest.fixture(autouse=True)
+def tenacity_wait(mocker):
+    # Stops tenacity from blocking lister tests for 50x errors
+    mocker.patch("tenacity.nap.time")
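For context, tenacity sleeps between retries through `tenacity.nap.time`, so patching that module makes retried requests return immediately instead of waiting out the configured back-off. A minimal standalone illustration (the `flaky_request` function and test are hypothetical, not part of this repo):

    import pytest
    import tenacity

    @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(10))
    def flaky_request():
        raise RuntimeError("simulated 50x error")  # hypothetical failing HTTP call

    def test_retries_do_not_sleep(mocker):
        mocker.patch("tenacity.nap.time")  # same trick as the fixture above
        with pytest.raises(tenacity.RetryError):
            flaky_request()  # 3 attempts, but no real 10-second waits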


@@ -4,7 +4,6 @@
 # See top-level LICENSE file for more information

 from dataclasses import asdict, dataclass
-from datetime import datetime
 import logging
 from typing import Any, Dict, Iterator, List
 from urllib.parse import urljoin
@@ -29,12 +28,10 @@ def get_tar_url(pkg_name: str, release_version: str):
 class HexListerState:
     """The HexLister instance state. This is used for incremental listing."""

-    last_page_id: int = 1
-    """Id of the last page listed on an incremental pass"""
-    last_pkg_name: str = ""
-    """Name of the last package inserted at on an incremental pass"""
-    last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat()
-    """updated_at value of the last seen package on an incremental pass"""
+    # Note: Default values are used only when the lister is run for the first time.
+    page_updated_at: str = "0001-01-01T00:00:00.000000Z"  # Min datetime
+    """`updated_at` value of the last seen package in the page."""


 class HexLister(Lister[HexListerState, HexListerPage]):
@@ -69,34 +66,20 @@ class HexLister(Lister[HexListerState, HexListerPage]):
         return asdict(state)

     def get_pages(self) -> Iterator[HexListerPage]:
-        page_id = 1
-        if self.state.last_page_id is not None:
-            page_id = self.state.last_page_id
-
         url = urljoin(self.url, self.PACKAGES_PATH)

-        while page_id is not None:
-            logger.debug(
-                "Fetching URL %s with page_id = %s and updated_after = %s",
-                url,
-                page_id,
-                self.state.last_updated_at,
-            )
-            body = self.http_request(
+        while True:
+            body = self.http_request(  # This also logs the request
                 url,
                 params={
-                    "page": page_id,
-                    "search": f"updated_after:{self.state.last_updated_at}",
+                    "search": f"updated_after:{self.state.page_updated_at}",
                 },
             ).json()

             yield body

-            page_id += 1  # Consider stopping before yielding?
-
             if len(body) == 0:
-                break  # Consider stopping if number of items < 100?
+                break

     def get_origins_from_page(self, page: HexListerPage) -> Iterator[ListedOrigin]:
         """Convert a page of HexLister repositories into a list of ListedOrigins"""
@@ -125,24 +108,24 @@ class HexLister(Lister[HexListerState, HexListerPage]):
         if len(page) == 0:
             return

-        last_pkg_name = page[-1]["name"]
-        last_updated_at = page[-1]["updated_at"]
-        # TODO: Think more about 2nd condition:
+        page_updated_at = page[-1]["updated_at"]
+        """`page_updated_at` is same as `updated_at` of the last package in the page."""

         if (
-            iso8601.parse_date(last_updated_at)
-            > iso8601.parse_date(self.state.last_updated_at)
-            and last_pkg_name != self.state.last_pkg_name
+            iso8601.parse_date(page_updated_at)
+            > iso8601.parse_date(self.state.page_updated_at)
+            and len(page) > 0
         ):
-            self.state.last_pkg_name = last_pkg_name
-            self.state.last_page_id += 1
-            self.state.last_updated_at = last_updated_at
+            # There's one edge case where `updated_at` doesn't change between two
+            # pages. But that seems practically impossible because we have 100
+            # packages per page and the `updated_at` keeps on increasing with time.
+            self.state.page_updated_at = page_updated_at

     def finalize(self) -> None:
         scheduler_state = self.get_state_from_scheduler()

         # Mark the lister as updated only if it finds any updated repos
-        if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date(
-            scheduler_state.last_updated_at
+        if iso8601.parse_date(self.state.page_updated_at) > iso8601.parse_date(
+            scheduler_state.page_updated_at
         ):
-            self.updated = True
+            self.updated = True  # This will update the lister state in the scheduler
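For reference, the check that now drives both commit_page and finalize is a plain timezone-aware datetime comparison via iso8601, which the lister already imports. A tiny standalone illustration, using the first-run default and a timestamp taken from the tests below:

    import iso8601

    stored = iso8601.parse_date("0001-01-01T00:00:00.000000Z")  # first-run default
    newest = iso8601.parse_date("2022-09-09T21:00:14.993273Z")  # last package on a page
    # The state (and the lister's `updated` flag) only moves forward when the
    # page's newest `updated_at` is strictly greater than what is stored.
    assert newest > stored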


@@ -9,7 +9,7 @@ from celery import shared_task
 from .lister import HexLister


-@shared_task(name=__name__ + ".FullHexRelister")
+@shared_task(name=__name__ + ".HexListerTask")
 def list_hex_full(
     instance: Optional[str] = None,
 ) -> Dict[str, int]:


@@ -29,13 +29,12 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO
 @pytest.fixture
 def mock_hexpm_page(requests_mock):
     def func(
-        page_id: int,
         updated_after: str,
         body: Optional[List[dict]],
         status_code: int = 200,
     ):
         search_query = quote(f"updated_after:{updated_after}")
-        page_url = f"https://hex.pm/api/packages/?page={page_id}&search={search_query}"
+        page_url = f"https://hex.pm/api/packages/?search={search_query}"
         requests_mock.get(
             page_url, json=body, complete_qs=True, status_code=status_code
         )
@@ -55,10 +54,10 @@ def test_full_lister_hex(
     p2_origin_urls, p2_json = hexpm_page(2)
     p3_origin_urls, p3_json = hexpm_page(3)

-    mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
-    mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
-    mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
-    mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
+    mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
+    mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json)
+    mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)
+    mock_hexpm_page("2022-09-09T21:00:14.993273Z", [])

     lister = HexLister(swh_scheduler)
@@ -73,8 +72,7 @@ def test_full_lister_hex(
         p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
     )

-    assert lister_state.last_page_id == 4
-    assert lister_state.last_pkg_name == "logger_dev"
+    assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z"
     assert lister.updated
@@ -89,9 +87,9 @@ def test_hex_incremental_lister(
     p1_origin_urls, p1_json = hexpm_page(1)
     p2_origin_urls, p2_json = hexpm_page(2)

-    mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
-    mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
-    mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", [])
+    mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
+    mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json)
+    mock_hexpm_page("2019-03-27T00:32:47.822901Z", [])

     stats = lister.run()
@@ -101,8 +99,7 @@ def test_hex_incremental_lister(
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     lister_state = lister.get_state_from_scheduler()

-    assert lister_state.last_page_id == 3
-    assert lister.state.last_pkg_name == "alchemy_vm"
+    assert lister_state.page_updated_at == "2019-03-27T00:32:47.822901Z"
     assert lister.updated

     check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
@@ -112,8 +109,8 @@ def test_hex_incremental_lister(
     # Second run: P3 isn't empty anymore
     p3_origin_urls, p3_json = hexpm_page(3)

-    mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
-    mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
+    mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)
+    mock_hexpm_page("2022-09-09T21:00:14.993273Z", [])

     stats = lister.run()
@@ -123,8 +120,7 @@ def test_hex_incremental_lister(
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     lister_state = lister.get_state_from_scheduler()

-    assert lister_state.last_page_id == 4
-    assert lister.state.last_pkg_name == "logger_dev"
+    assert lister.state.page_updated_at == "2022-09-09T21:00:14.993273Z"
     assert lister.updated

     check_listed_origins(
@@ -142,8 +138,7 @@ def test_hex_incremental_lister(
     assert stats.origins == 0

     lister_state = lister.get_state_from_scheduler()
-    assert lister_state.last_page_id == 4
-    assert lister.state.last_pkg_name == "logger_dev"
+    assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z"
     assert lister.updated is False  # No new origins so state isn't updated

     check_listed_origins(
@@ -159,9 +154,9 @@ def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_
     p1_origin_urls, p1_json = hexpm_page(1)
     _, p3_json = hexpm_page(3)

-    mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
-    mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code)
-    mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
+    mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
+    mock_hexpm_page("2018-01-30T04:56:03.053561Z", None, http_code)
+    mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)

     with pytest.raises(HTTPError):
         lister.run()