From 9095bbec007dafd38c05136b775ed9100693b61b Mon Sep 17 00:00:00 2001 From: KShivendu Date: Sun, 5 Feb 2023 15:32:17 +0530 Subject: [PATCH] fix(hex): Use only updated_after for pagination --- conftest.py | 8 ++++ swh/lister/hex/lister.py | 57 ++++++++++------------------- swh/lister/hex/tasks.py | 2 +- swh/lister/hex/tests/test_lister.py | 39 +++++++++----------- 4 files changed, 46 insertions(+), 60 deletions(-) diff --git a/conftest.py b/conftest.py index 00eb31a..1dc2e70 100644 --- a/conftest.py +++ b/conftest.py @@ -5,6 +5,14 @@ import os +import pytest + pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"] os.environ["LC_ALL"] = "C.UTF-8" + + +@pytest.fixture(autouse=True) +def tenacity_wait(mocker): + # Stops tenacity from blocking lister tests for 50x errors + mocker.patch("tenacity.nap.time") diff --git a/swh/lister/hex/lister.py b/swh/lister/hex/lister.py index e064252..c97f4b8 100644 --- a/swh/lister/hex/lister.py +++ b/swh/lister/hex/lister.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information from dataclasses import asdict, dataclass -from datetime import datetime import logging from typing import Any, Dict, Iterator, List from urllib.parse import urljoin @@ -29,12 +28,10 @@ def get_tar_url(pkg_name: str, release_version: str): class HexListerState: """The HexLister instance state. This is used for incremental listing.""" - last_page_id: int = 1 - """Id of the last page listed on an incremental pass""" - last_pkg_name: str = "" - """Name of the last package inserted at on an incremental pass""" - last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat() - """updated_at value of the last seen package on an incremental pass""" + # Note: Default values are used only when the lister is run for the first time. + + page_updated_at: str = "0001-01-01T00:00:00.000000Z" # Min datetime + """`updated_at` value of the last seen package in the page.""" class HexLister(Lister[HexListerState, HexListerPage]): @@ -69,34 +66,20 @@ class HexLister(Lister[HexListerState, HexListerPage]): return asdict(state) def get_pages(self) -> Iterator[HexListerPage]: - page_id = 1 - if self.state.last_page_id is not None: - page_id = self.state.last_page_id - url = urljoin(self.url, self.PACKAGES_PATH) - while page_id is not None: - logger.debug( - "Fetching URL %s with page_id = %s and updated_after = %s", - url, - page_id, - self.state.last_updated_at, - ) - - body = self.http_request( + while True: + body = self.http_request( # This also logs the request url, params={ - "page": page_id, - "search": f"updated_after:{self.state.last_updated_at}", + "search": f"updated_after:{self.state.page_updated_at}", }, ).json() yield body - page_id += 1 # Consider stopping before yielding? - if len(body) == 0: - break # Consider stopping if number of items < 100? + break def get_origins_from_page(self, page: HexListerPage) -> Iterator[ListedOrigin]: """Convert a page of HexLister repositories into a list of ListedOrigins""" @@ -125,24 +108,24 @@ class HexLister(Lister[HexListerState, HexListerPage]): if len(page) == 0: return - last_pkg_name = page[-1]["name"] - last_updated_at = page[-1]["updated_at"] - # TODO: Think more about 2nd condition: + page_updated_at = page[-1]["updated_at"] + """`page_updated_at` is same as `updated_at` of the last package in the page.""" + if ( - iso8601.parse_date(last_updated_at) - > iso8601.parse_date(self.state.last_updated_at) - and last_pkg_name != self.state.last_pkg_name + iso8601.parse_date(page_updated_at) + > iso8601.parse_date(self.state.page_updated_at) and len(page) > 0 ): - self.state.last_pkg_name = last_pkg_name - self.state.last_page_id += 1 - self.state.last_updated_at = last_updated_at + # There's one edge case where `updated_at` don't change between two pages. + # But that seems practically impossible because we have 100 packages + # per page and the `updated_at` keeps on increasing with time. + self.state.page_updated_at = page_updated_at def finalize(self) -> None: scheduler_state = self.get_state_from_scheduler() # Mark the lister as updated only if it finds any updated repos - if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date( - scheduler_state.last_updated_at + if iso8601.parse_date(self.state.page_updated_at) > iso8601.parse_date( + scheduler_state.page_updated_at ): - self.updated = True + self.updated = True # This will update the lister state in the scheduler diff --git a/swh/lister/hex/tasks.py b/swh/lister/hex/tasks.py index 012bc8f..9e73ebd 100644 --- a/swh/lister/hex/tasks.py +++ b/swh/lister/hex/tasks.py @@ -9,7 +9,7 @@ from celery import shared_task from .lister import HexLister -@shared_task(name=__name__ + ".FullHexRelister") +@shared_task(name=__name__ + ".HexListerTask") def list_hex_full( instance: Optional[str] = None, ) -> Dict[str, int]: diff --git a/swh/lister/hex/tests/test_lister.py b/swh/lister/hex/tests/test_lister.py index 69bf9ea..513fdf8 100644 --- a/swh/lister/hex/tests/test_lister.py +++ b/swh/lister/hex/tests/test_lister.py @@ -29,13 +29,12 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO @pytest.fixture def mock_hexpm_page(requests_mock): def func( - page_id: int, updated_after: str, body: Optional[List[dict]], status_code: int = 200, ): search_query = quote(f"updated_after:{updated_after}") - page_url = f"https://hex.pm/api/packages/?page={page_id}&search={search_query}" + page_url = f"https://hex.pm/api/packages/?search={search_query}" requests_mock.get( page_url, json=body, complete_qs=True, status_code=status_code ) @@ -55,10 +54,10 @@ def test_full_lister_hex( p2_origin_urls, p2_json = hexpm_page(2) p3_origin_urls, p3_json = hexpm_page(3) - mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) - mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json) - mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) - mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", []) + mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json) + mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json) + mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json) + mock_hexpm_page("2022-09-09T21:00:14.993273Z", []) lister = HexLister(swh_scheduler) @@ -73,8 +72,7 @@ def test_full_lister_hex( p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins ) - assert lister_state.last_page_id == 4 - assert lister_state.last_pkg_name == "logger_dev" + assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z" assert lister.updated @@ -89,9 +87,9 @@ def test_hex_incremental_lister( p1_origin_urls, p1_json = hexpm_page(1) p2_origin_urls, p2_json = hexpm_page(2) - mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) - mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json) - mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", []) + mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json) + mock_hexpm_page("2018-01-30T04:56:03.053561Z", p2_json) + mock_hexpm_page("2019-03-27T00:32:47.822901Z", []) stats = lister.run() @@ -101,8 +99,7 @@ def test_hex_incremental_lister( scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results lister_state = lister.get_state_from_scheduler() - assert lister_state.last_page_id == 3 - assert lister.state.last_pkg_name == "alchemy_vm" + assert lister_state.page_updated_at == "2019-03-27T00:32:47.822901Z" assert lister.updated check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) @@ -112,8 +109,8 @@ def test_hex_incremental_lister( # Second run: P3 isn't empty anymore p3_origin_urls, p3_json = hexpm_page(3) - mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) - mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", []) + mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json) + mock_hexpm_page("2022-09-09T21:00:14.993273Z", []) stats = lister.run() @@ -123,8 +120,7 @@ def test_hex_incremental_lister( scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results lister_state = lister.get_state_from_scheduler() - assert lister_state.last_page_id == 4 - assert lister.state.last_pkg_name == "logger_dev" + assert lister.state.page_updated_at == "2022-09-09T21:00:14.993273Z" assert lister.updated check_listed_origins( @@ -142,8 +138,7 @@ def test_hex_incremental_lister( assert stats.origins == 0 lister_state = lister.get_state_from_scheduler() - assert lister_state.last_page_id == 4 - assert lister.state.last_pkg_name == "logger_dev" + assert lister_state.page_updated_at == "2022-09-09T21:00:14.993273Z" assert lister.updated is False # No new origins so state isn't updated check_listed_origins( @@ -159,9 +154,9 @@ def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_ p1_origin_urls, p1_json = hexpm_page(1) _, p3_json = hexpm_page(3) - mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) - mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code) - mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) + mock_hexpm_page("0001-01-01T00:00:00.000000Z", p1_json) + mock_hexpm_page("2018-01-30T04:56:03.053561Z", None, http_code) + mock_hexpm_page("2019-03-27T00:32:47.822901Z", p3_json) with pytest.raises(HTTPError): lister.run()