fix(hex): Use only updated_after for pagination
This commit is contained in:
parent
ac9993a001
commit
9095bbec00
4 changed files with 46 additions and 60 deletions
|
@ -5,6 +5,14 @@
|
|||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"]
|
||||
|
||||
os.environ["LC_ALL"] = "C.UTF-8"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def tenacity_wait(mocker):
|
||||
# Stops tenacity from blocking lister tests for 50x errors
|
||||
mocker.patch("tenacity.nap.time")
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
# See top-level LICENSE file for more information
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List
|
||||
from urllib.parse import urljoin
|
||||
|
@ -29,12 +28,10 @@ def get_tar_url(pkg_name: str, release_version: str):
|
|||
class HexListerState:
|
||||
"""The HexLister instance state. This is used for incremental listing."""
|
||||
|
||||
last_page_id: int = 1
|
||||
"""Id of the last page listed on an incremental pass"""
|
||||
last_pkg_name: str = ""
|
||||
"""Name of the last package inserted on an incremental pass"""
|
||||
last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat()
|
||||
"""updated_at value of the last seen package on an incremental pass"""
|
||||
# Note: Default values are used only when the lister is run for the first time.
|
||||
|
||||
page_updated_at: str = "0001-01-01T00:00:00.000000Z" # Min datetime
|
||||
"""`updated_at` value of the last seen package in the page."""
|
||||
|
||||
|
||||
class HexLister(Lister[HexListerState, HexListerPage]):
|
||||
|
@ -69,34 +66,20 @@ class HexLister(Lister[HexListerState, HexListerPage]):
|
|||
return asdict(state)
|
||||
|
||||
def get_pages(self) -> Iterator[HexListerPage]:
|
||||
page_id = 1
|
||||
if self.state.last_page_id is not None:
|
||||
page_id = self.state.last_page_id
|
||||
|
||||
url = urljoin(self.url, self.PACKAGES_PATH)
|
||||
|
||||
while page_id is not None:
|
||||
logger.debug(
|
||||
"Fetching URL %s with page_id = %s and updated_after = %s",
|
||||
url,
|
||||
page_id,
|
||||
self.state.last_updated_at,
|
||||
)
|
||||
|
||||
body = self.http_request(
|
||||
while True:
|
||||
body = self.http_request( # This also logs the request
|
||||
url,
|
||||
params={
|
||||
"page": page_id,
|
||||
"search": f"updated_after:{self.state.last_updated_at}",
|
||||
"search": f"updated_after:{self.state.page_updated_at}",
|
||||
},
|
||||
).json()
|
||||
|
||||
yield body
|
||||
|
||||
page_id += 1 # Consider stopping before yielding?
|
||||
|
||||
if len(body) == 0:
|
||||
break # Consider stopping if number of items < 100?
|
||||
break
|
||||
|
||||
def get_origins_from_page(self, page: HexListerPage) -> Iterator[ListedOrigin]:
|
||||
"""Convert a page of HexLister repositories into a list of ListedOrigins"""
|
||||
|
@ -125,24 +108,24 @@ class HexLister(Lister[HexListerState, HexListerPage]):
|
|||
if len(page) == 0:
|
||||
return
|
||||
|
||||
last_pkg_name = page[-1]["name"]
|
||||
last_updated_at = page[-1]["updated_at"]
|
||||
# TODO: Think more about 2nd condition:
|
||||
page_updated_at = page[-1]["updated_at"]
|
||||
"""`page_updated_at` is the same as `updated_at` of the last package in the page."""
|
||||
|
||||
if (
|
||||
iso8601.parse_date(last_updated_at)
|
||||
> iso8601.parse_date(self.state.last_updated_at)
|
||||
and last_pkg_name != self.state.last_pkg_name
|
||||
iso8601.parse_date(page_updated_at)
|
||||
> iso8601.parse_date(self.state.page_updated_at)
|
||||
and len(page) > 0
|
||||
):
|
||||
self.state.last_pkg_name = last_pkg_name
|
||||
self.state.last_page_id += 1
|
||||
self.state.last_updated_at = last_updated_at
|
||||
# There's one edge case where `updated_at` doesn't change between two pages.
|
||||
# But that seems practically impossible because we have 100 packages
|
||||
# per page and the `updated_at` keeps on increasing with time.
|
||||
self.state.page_updated_at = page_updated_at
|
||||
|
||||
def finalize(self) -> None:
|
||||
scheduler_state = self.get_state_from_scheduler()
|
||||
|
||||
# Mark the lister as updated only if it finds any updated repos
|
||||
if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date(
|
||||
scheduler_state.last_updated_at
|
||||
if iso8601.parse_date(self.state.page_updated_at) > iso8601.parse_date(
|
||||
scheduler_state.page_updated_at
|
||||
):
|
||||
self.updated = True
|
||||
self.updated = True # This will update the lister state in the scheduler
|
||||
|
|
|
@ -9,7 +9,7 @@ from celery import shared_task
|
|||
from .lister import HexLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".FullHexRelister")
|
||||
@shared_task(name=__name__ + ".HexListerTask")
|
||||
def list_hex_full(
|
||||
instance: Optional[str] = None,
|
||||
) -> Dict[str, int]:
|
||||
|
|
|
@ -29,13 +29,12 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO
|
|||
@pytest.fixture
|
||||
def mock_hexpm_page(requests_mock):
|
||||
def func(
|
||||
page_id: int,
|
||||
updated_after: str,
|
||||
body: Optional[List[dict]],
|
||||
status_code: int = 200,
|
||||
):
|
||||
search_query = quote(f"updated_after:{updated_after}")
|
||||
page_url = f"https://hex.pm/api/packages/?page={page_id}&search={search_query}"
|
||||
page_url = f"https://hex.pm/api/packages/?search={search_query}"
|
||||
requests_mock.get(
|
||||
page_url, json=body, complete_qs=True, status_code=status_code
|
||||
)
|
||||
|
@ -55,10 +54,10 @@ def test_full_lister_hex(
|
|||
p2_origin_urls, p2_json = hexpm_page(2)
|
||||
p3_origin_urls, p3_json = hexpm_page(3)
|
||||
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
|
||||
mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
|
||||
mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page("2022-09-09T21:00:14.993273Z", [])
|
||||
|
||||
lister = HexLister(swh_scheduler)
|
||||
|
||||
|
@ -73,8 +72,7 @@ def test_full_lister_hex(
|
|||
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
|
||||
)
|
||||
|
||||
assert lister_state.last_page_id == 4
|
||||
assert lister_state.last_pkg_name == "logger_dev"
|
||||
assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z"
|
||||
assert lister.updated
|
||||
|
||||
|
||||
|
@ -89,9 +87,9 @@ def test_hex_incremental_lister(
|
|||
p1_origin_urls, p1_json = hexpm_page(1)
|
||||
p2_origin_urls, p2_json = hexpm_page(2)
|
||||
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", [])
|
||||
mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
|
||||
mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json)
|
||||
mock_hexpm_page("2019-03-27T00:32:47.822901Z", [])
|
||||
|
||||
stats = lister.run()
|
||||
|
||||
|
@ -101,8 +99,7 @@ def test_hex_incremental_lister(
|
|||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
|
||||
lister_state = lister.get_state_from_scheduler()
|
||||
assert lister_state.last_page_id == 3
|
||||
assert lister.state.last_pkg_name == "alchemy_vm"
|
||||
assert lister_state.page_updated_at == "2019-03-27T00:32:47.822901Z"
|
||||
assert lister.updated
|
||||
|
||||
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
|
||||
|
@ -112,8 +109,8 @@ def test_hex_incremental_lister(
|
|||
# Second run: P3 isn't empty anymore
|
||||
p3_origin_urls, p3_json = hexpm_page(3)
|
||||
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", [])
|
||||
mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page("2022-09-09T21:00:14.993273Z", [])
|
||||
|
||||
stats = lister.run()
|
||||
|
||||
|
@ -123,8 +120,7 @@ def test_hex_incremental_lister(
|
|||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
|
||||
lister_state = lister.get_state_from_scheduler()
|
||||
assert lister_state.last_page_id == 4
|
||||
assert lister.state.last_pkg_name == "logger_dev"
|
||||
assert lister.state.page_updated_at == "2022-09-09T21:00:14.993273Z"
|
||||
assert lister.updated
|
||||
|
||||
check_listed_origins(
|
||||
|
@ -142,8 +138,7 @@ def test_hex_incremental_lister(
|
|||
assert stats.origins == 0
|
||||
|
||||
lister_state = lister.get_state_from_scheduler()
|
||||
assert lister_state.last_page_id == 4
|
||||
assert lister.state.last_pkg_name == "logger_dev"
|
||||
assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z"
|
||||
assert lister.updated is False # No new origins so state isn't updated
|
||||
|
||||
check_listed_origins(
|
||||
|
@ -159,9 +154,9 @@ def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_
|
|||
p1_origin_urls, p1_json = hexpm_page(1)
|
||||
_, p3_json = hexpm_page(3)
|
||||
|
||||
mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json)
|
||||
mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code)
|
||||
mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json)
|
||||
mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json)
|
||||
mock_hexpm_page("2018-01-30T04:56:03.053561Z", None, http_code)
|
||||
mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json)
|
||||
|
||||
with pytest.raises(HTTPError):
|
||||
lister.run()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue