gitlab: Implement keyset-based pagination listing
The previous pagination implementation has a hard-coded limit server side [1] [1] ``` {"error":"Offset pagination has a maximum allowed offset of 50000 for requests that return objects of type Project. Remaining records can be retrieved using keyset pagination."} ``` Related to T2994
This commit is contained in:
parent
22eeb0956e
commit
97254a19f2
2 changed files with 40 additions and 28 deletions
|
@ -7,7 +7,7 @@ from dataclasses import asdict, dataclass
|
|||
import logging
|
||||
import random
|
||||
from typing import Any, Dict, Iterator, Optional, Tuple
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from urllib.parse import parse_qs, urlencode, urlparse
|
||||
|
||||
import iso8601
|
||||
import requests
|
||||
|
@ -61,15 +61,18 @@ def _if_rate_limited(retry_state) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _parse_page_id(url: Optional[str]) -> Optional[int]:
|
||||
"""Given an url, extract a return the 'page' query parameter associated value or None.
|
||||
def _parse_id_after(url: Optional[str]) -> Optional[int]:
|
||||
"""Given an url, extract a return the 'id_after' query parameter associated value
|
||||
or None.
|
||||
|
||||
This is the the repository id used for pagination purposes.
|
||||
|
||||
"""
|
||||
if not url:
|
||||
return None
|
||||
# link: https://${project-api}/?...&page=2x...
|
||||
# link: https://${project-api}/?...&id_after=2x...
|
||||
query_data = parse_qs(urlparse(url).query)
|
||||
page = query_data.get("page")
|
||||
page = query_data.get("id_after")
|
||||
if page and len(page) > 0:
|
||||
return int(page[0])
|
||||
return None
|
||||
|
@ -150,15 +153,22 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
|
||||
return PageResult(repositories, next_page)
|
||||
|
||||
def page_url(self, page_id: int) -> str:
|
||||
return f"{self.url}projects?page={page_id}&order_by=id&sort=asc&per_page=20"
|
||||
def page_url(self, id_after: Optional[int] = None) -> str:
|
||||
parameters = {
|
||||
"pagination": "keyset",
|
||||
"order_by": "id",
|
||||
"sort": "asc",
|
||||
}
|
||||
if id_after is not None:
|
||||
parameters["id_after"] = str(id_after)
|
||||
return f"{self.url}projects?{urlencode(parameters)}"
|
||||
|
||||
def get_pages(self) -> Iterator[PageResult]:
|
||||
next_page: Optional[str]
|
||||
if self.incremental and self.state and self.state.last_seen_next_link:
|
||||
next_page = self.state.last_seen_next_link
|
||||
else:
|
||||
next_page = self.page_url(1)
|
||||
next_page = self.page_url()
|
||||
|
||||
while next_page:
|
||||
self.last_page = next_page
|
||||
|
@ -194,12 +204,12 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
next_page = self.last_page
|
||||
|
||||
if next_page:
|
||||
page_id = _parse_page_id(next_page)
|
||||
id_after = _parse_id_after(next_page)
|
||||
previous_next_page = self.state.last_seen_next_link
|
||||
previous_page_id = _parse_page_id(previous_next_page)
|
||||
previous_id_after = _parse_id_after(previous_next_page)
|
||||
|
||||
if previous_next_page is None or (
|
||||
previous_page_id and page_id and previous_page_id < page_id
|
||||
previous_id_after and id_after and previous_id_after < id_after
|
||||
):
|
||||
self.state.last_seen_next_link = next_page
|
||||
|
||||
|
@ -212,13 +222,15 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
next_page = self.state.last_seen_next_link
|
||||
if self.incremental and next_page:
|
||||
# link: https://${project-api}/?...&page=2x...
|
||||
next_page_id = _parse_page_id(next_page)
|
||||
next_id_after = _parse_id_after(next_page)
|
||||
scheduler_state = self.get_state_from_scheduler()
|
||||
previous_next_page_id = _parse_page_id(scheduler_state.last_seen_next_link)
|
||||
previous_next_id_after = _parse_id_after(
|
||||
scheduler_state.last_seen_next_link
|
||||
)
|
||||
|
||||
if (not previous_next_page_id and next_page_id) or (
|
||||
previous_next_page_id
|
||||
and next_page_id
|
||||
and previous_next_page_id < next_page_id
|
||||
if (not previous_next_id_after and next_id_after) or (
|
||||
previous_next_id_after
|
||||
and next_id_after
|
||||
and previous_next_id_after < next_id_after
|
||||
):
|
||||
self.updated = True
|
||||
|
|
|
@ -12,7 +12,7 @@ import pytest
|
|||
from requests.status_codes import codes
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.gitlab.lister import GitLabLister, _parse_page_id
|
||||
from swh.lister.gitlab.lister import GitLabLister, _parse_id_after
|
||||
from swh.lister.pattern import ListerStats
|
||||
from swh.lister.tests.test_utils import assert_sleep_calls
|
||||
from swh.lister.utils import WAIT_EXP_BASE
|
||||
|
@ -38,7 +38,7 @@ def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
|
|||
response = gitlab_page_response(datadir, instance, 1)
|
||||
|
||||
requests_mock.get(
|
||||
lister.page_url(1), [{"json": response}], additional_matcher=_match_request,
|
||||
lister.page_url(), [{"json": response}], additional_matcher=_match_request,
|
||||
)
|
||||
|
||||
listed_result = lister.run()
|
||||
|
@ -56,9 +56,9 @@ def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
|
|||
assert listed_origin.last_update is not None
|
||||
|
||||
|
||||
def gitlab_page_response(datadir, instance: str, page_id: int) -> List[Dict]:
|
||||
def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]:
|
||||
"""Return list of repositories (out of test dataset)"""
|
||||
datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_id}.json")
|
||||
datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json")
|
||||
return json.loads(datapath.read_text()) if datapath.exists else []
|
||||
|
||||
|
||||
|
@ -73,7 +73,7 @@ def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir):
|
|||
response2 = gitlab_page_response(datadir, instance, 2)
|
||||
|
||||
requests_mock.get(
|
||||
lister.page_url(1),
|
||||
lister.page_url(),
|
||||
[{"json": response1, "headers": {"Link": f"<{lister.page_url(2)}>; rel=next"}}],
|
||||
additional_matcher=_match_request,
|
||||
)
|
||||
|
@ -106,7 +106,7 @@ def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
|
|||
url = api_url(instance)
|
||||
lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True)
|
||||
|
||||
url_page1 = lister.page_url(1)
|
||||
url_page1 = lister.page_url()
|
||||
response1 = gitlab_page_response(datadir, instance, 1)
|
||||
url_page2 = lister.page_url(2)
|
||||
response2 = gitlab_page_response(datadir, instance, 2)
|
||||
|
@ -168,7 +168,7 @@ def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker)
|
|||
url = api_url(instance)
|
||||
lister = GitLabLister(swh_scheduler, url=url, instance=instance)
|
||||
|
||||
url_page1 = lister.page_url(1)
|
||||
url_page1 = lister.page_url()
|
||||
response1 = gitlab_page_response(datadir, instance, 1)
|
||||
url_page2 = lister.page_url(2)
|
||||
response2 = gitlab_page_response(datadir, instance, 2)
|
||||
|
@ -221,9 +221,9 @@ def test_lister_gitlab_credentials(swh_scheduler):
|
|||
[
|
||||
(None, None),
|
||||
("http://dummy/?query=1", None),
|
||||
("http://dummy/?foo=bar&page=1&some=result", 1),
|
||||
("http://dummy/?foo=bar&page=&some=result", None),
|
||||
("http://dummy/?foo=bar&id_after=1&some=result", 1),
|
||||
("http://dummy/?foo=bar&id_after=&some=result", None),
|
||||
],
|
||||
)
|
||||
def test__parse_page_id(url, expected_result):
|
||||
assert _parse_page_id(url) == expected_result
|
||||
def test__parse_id_after(url, expected_result):
|
||||
assert _parse_id_after(url) == expected_result
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue