gitlab: Implement keyset-based pagination listing

The previous pagination implementation has a hard-coded limit server side [1]

[1]

```
{"error":"Offset pagination has a maximum allowed offset of 50000 for requests that return objects of type Project. Remaining records can be retrieved using keyset pagination."}
```

Related to T2994
This commit is contained in:
Antoine R. Dumont (@ardumont) 2021-01-26 15:39:46 +01:00
parent 22eeb0956e
commit 97254a19f2
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
2 changed files with 40 additions and 28 deletions

View file

@ -7,7 +7,7 @@ from dataclasses import asdict, dataclass
import logging
import random
from typing import Any, Dict, Iterator, Optional, Tuple
from urllib.parse import parse_qs, urlparse
from urllib.parse import parse_qs, urlencode, urlparse
import iso8601
import requests
@ -61,15 +61,18 @@ def _if_rate_limited(retry_state) -> bool:
return False
def _parse_page_id(url: Optional[str]) -> Optional[int]:
"""Given an url, extract a return the 'page' query parameter associated value or None.
def _parse_id_after(url: Optional[str]) -> Optional[int]:
"""Given an url, extract a return the 'id_after' query parameter associated value
or None.
This is the the repository id used for pagination purposes.
"""
if not url:
return None
# link: https://${project-api}/?...&page=2x...
# link: https://${project-api}/?...&id_after=2x...
query_data = parse_qs(urlparse(url).query)
page = query_data.get("page")
page = query_data.get("id_after")
if page and len(page) > 0:
return int(page[0])
return None
@ -150,15 +153,22 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
return PageResult(repositories, next_page)
def page_url(self, page_id: int) -> str:
return f"{self.url}projects?page={page_id}&order_by=id&sort=asc&per_page=20"
def page_url(self, id_after: Optional[int] = None) -> str:
parameters = {
"pagination": "keyset",
"order_by": "id",
"sort": "asc",
}
if id_after is not None:
parameters["id_after"] = str(id_after)
return f"{self.url}projects?{urlencode(parameters)}"
def get_pages(self) -> Iterator[PageResult]:
next_page: Optional[str]
if self.incremental and self.state and self.state.last_seen_next_link:
next_page = self.state.last_seen_next_link
else:
next_page = self.page_url(1)
next_page = self.page_url()
while next_page:
self.last_page = next_page
@ -194,12 +204,12 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
next_page = self.last_page
if next_page:
page_id = _parse_page_id(next_page)
id_after = _parse_id_after(next_page)
previous_next_page = self.state.last_seen_next_link
previous_page_id = _parse_page_id(previous_next_page)
previous_id_after = _parse_id_after(previous_next_page)
if previous_next_page is None or (
previous_page_id and page_id and previous_page_id < page_id
previous_id_after and id_after and previous_id_after < id_after
):
self.state.last_seen_next_link = next_page
@ -212,13 +222,15 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
next_page = self.state.last_seen_next_link
if self.incremental and next_page:
# link: https://${project-api}/?...&page=2x...
next_page_id = _parse_page_id(next_page)
next_id_after = _parse_id_after(next_page)
scheduler_state = self.get_state_from_scheduler()
previous_next_page_id = _parse_page_id(scheduler_state.last_seen_next_link)
previous_next_id_after = _parse_id_after(
scheduler_state.last_seen_next_link
)
if (not previous_next_page_id and next_page_id) or (
previous_next_page_id
and next_page_id
and previous_next_page_id < next_page_id
if (not previous_next_id_after and next_id_after) or (
previous_next_id_after
and next_id_after
and previous_next_id_after < next_id_after
):
self.updated = True

View file

@ -12,7 +12,7 @@ import pytest
from requests.status_codes import codes
from swh.lister import USER_AGENT
from swh.lister.gitlab.lister import GitLabLister, _parse_page_id
from swh.lister.gitlab.lister import GitLabLister, _parse_id_after
from swh.lister.pattern import ListerStats
from swh.lister.tests.test_utils import assert_sleep_calls
from swh.lister.utils import WAIT_EXP_BASE
@ -38,7 +38,7 @@ def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
response = gitlab_page_response(datadir, instance, 1)
requests_mock.get(
lister.page_url(1), [{"json": response}], additional_matcher=_match_request,
lister.page_url(), [{"json": response}], additional_matcher=_match_request,
)
listed_result = lister.run()
@ -56,9 +56,9 @@ def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
assert listed_origin.last_update is not None
def gitlab_page_response(datadir, instance: str, page_id: int) -> List[Dict]:
def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]:
"""Return list of repositories (out of test dataset)"""
datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_id}.json")
datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json")
return json.loads(datapath.read_text()) if datapath.exists else []
@ -73,7 +73,7 @@ def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir):
response2 = gitlab_page_response(datadir, instance, 2)
requests_mock.get(
lister.page_url(1),
lister.page_url(),
[{"json": response1, "headers": {"Link": f"<{lister.page_url(2)}>; rel=next"}}],
additional_matcher=_match_request,
)
@ -106,7 +106,7 @@ def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir):
url = api_url(instance)
lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True)
url_page1 = lister.page_url(1)
url_page1 = lister.page_url()
response1 = gitlab_page_response(datadir, instance, 1)
url_page2 = lister.page_url(2)
response2 = gitlab_page_response(datadir, instance, 2)
@ -168,7 +168,7 @@ def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker)
url = api_url(instance)
lister = GitLabLister(swh_scheduler, url=url, instance=instance)
url_page1 = lister.page_url(1)
url_page1 = lister.page_url()
response1 = gitlab_page_response(datadir, instance, 1)
url_page2 = lister.page_url(2)
response2 = gitlab_page_response(datadir, instance, 2)
@ -221,9 +221,9 @@ def test_lister_gitlab_credentials(swh_scheduler):
[
(None, None),
("http://dummy/?query=1", None),
("http://dummy/?foo=bar&page=1&some=result", 1),
("http://dummy/?foo=bar&page=&some=result", None),
("http://dummy/?foo=bar&id_after=1&some=result", 1),
("http://dummy/?foo=bar&id_after=&some=result", None),
],
)
def test__parse_page_id(url, expected_result):
assert _parse_page_id(url) == expected_result
def test__parse_id_after(url, expected_result):
assert _parse_id_after(url) == expected_result