feat: Make the Gogs lister incremental

This commit is contained in:
KShivendu 2022-08-09 11:13:32 +05:30
parent cee6bcb514
commit 6a53a6ad06
4 changed files with 471 additions and 77 deletions

View file

@ -3,10 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import logging
import random
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
from urllib.parse import parse_qs, urljoin, urlparse
import iso8601
import requests
@ -17,15 +18,36 @@ from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
# Aliasing page results returned by `GogsLister.get_pages` method
GogsListerPage = List[Dict[str, Any]]
Repo = Dict[str, Any]
class GogsLister(StatelessLister[GogsListerPage]):
@dataclass
class GogsListerPage:
repos: Optional[List[Repo]] = None
next_link: Optional[str] = None
@dataclass
class GogsListerState:
last_seen_next_link: Optional[str] = None
"""Last link header (could be already visited) during an incremental pass."""
last_seen_repo_id: Optional[int] = None
"""Last repo id seen during an incremental pass."""
def _parse_page_id(url: Optional[str]) -> int:
"""Parse the page id from a Gogs page url."""
if url is None:
return 0
return int(parse_qs(urlparse(url).query)["page"][0])
class GogsLister(Lister[GogsListerState, GogsListerPage]):
"""List origins from the Gogs
@ -61,7 +83,6 @@ class GogsLister(StatelessLister[GogsListerPage]):
self.query_params = {
"limit": page_size,
"page": 1,
}
self.api_token = api_token
@ -88,6 +109,12 @@ class GogsLister(StatelessLister[GogsListerPage]):
}
)
def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState:
return GogsListerState(**d)
def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
@ -107,38 +134,70 @@ class GogsLister(StatelessLister[GogsListerPage]):
return response
@classmethod
def results_simplified(cls, body: Dict[str, GogsListerPage]) -> GogsListerPage:
def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]:
fields_filter = ["id", "clone_url", "updated_at"]
return [{k: r[k] for k in fields_filter} for r in body["data"]]
def get_pages(self) -> Iterator[GogsListerPage]:
page_id = 1
if self.state.last_seen_next_link is not None:
page_id = _parse_page_id(self.state.last_seen_next_link)
# base with trailing slash, path without leading slash for urljoin
url = urljoin(self.url, self.REPO_LIST_PATH)
response = self.page_request(url, self.query_params)
next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
response = self.page_request(next_link, {**self.query_params, "page": page_id})
while True:
page_results = self.results_simplified(response.json())
yield page_results
while next_link is not None:
repos = self.extract_repos(response.json())
assert len(response.links) > 0, "API changed: no Link header found"
if "next" in response.links:
url = response.links["next"]["url"]
next_link = response.links["next"]["url"]
else:
break
next_link = None # Happens for the last page
response = self.page_request(url, {})
yield GogsListerPage(repos=repos, next_link=next_link)
if next_link is not None:
response = self.page_request(next_link, {})
def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]:
"""Convert a page of Gogs repositories into a list of ListedOrigins"""
assert self.lister_obj.id is not None
assert page.repos is not None
for repo in page:
last_update = iso8601.parse_date(repo["updated_at"])
for r in page.repos:
last_update = iso8601.parse_date(r["updated_at"])
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=repo["clone_url"],
url=r["clone_url"],
last_update=last_update,
)
def commit_page(self, page: GogsListerPage) -> None:
last_seen_next_link = page.next_link
page_id = _parse_page_id(last_seen_next_link)
state_page_id = _parse_page_id(self.state.last_seen_next_link)
if page_id > state_page_id:
self.state.last_seen_next_link = last_seen_next_link
if (page.repos is not None) and len(page.repos) > 0:
self.state.last_seen_repo_id = page.repos[-1]["id"]
def finalize(self) -> None:
scheduler_state = self.get_state_from_scheduler()
state_page_id = _parse_page_id(self.state.last_seen_next_link)
scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link)
state_last_repo_id = self.state.last_seen_repo_id or 0
scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0
if (state_page_id >= scheduler_page_id) and (
state_last_repo_id > scheduler_last_repo_id
):
self.updated = True # Marked updated only if it finds new repos

View file

@ -0,0 +1,168 @@
{
"data": [
{
"id": 340,
"owner": {
"id": 585,
"username": "zork",
"login": "zork",
"full_name": "",
"email": "f905334@trbvm.com",
"avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
},
"name": "beyond-the-titanic",
"full_name": "zork/beyond-the-titanic",
"description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
"private": false,
"fork": false,
"parent": null,
"empty": false,
"mirror": false,
"size": 1436672,
"html_url": "https://try.gogs.io/zork/beyond-the-titanic",
"ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git",
"clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git",
"website": "",
"stars_count": 0,
"forks_count": 1,
"watchers_count": 1,
"open_issues_count": 0,
"default_branch": "master",
"created_at": "2015-03-03T22:51:12Z",
"updated_at": "2022-03-26T07:28:38Z"
},
{
"id": 350,
"owner": {
"id": 599,
"username": "perekre",
"login": "perekre",
"full_name": "",
"email": "perekre@nincsmail.com",
"avatar_url": "https://secure.gravatar.com/avatar/0e2666adf16f8a958a56141a2d94565c?d=identicon"
},
"name": "beyond-the-titanic",
"full_name": "perekre/beyond-the-titanic",
"description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
"private": false,
"fork": true,
"parent": {
"id": 340,
"owner": {
"id": 585,
"username": "zork",
"login": "zork",
"full_name": "",
"email": "f905334@trbvm.com",
"avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
},
"name": "beyond-the-titanic",
"full_name": "zork/beyond-the-titanic",
"description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
"private": false,
"fork": false,
"parent": null,
"empty": false,
"mirror": false,
"size": 1436672,
"html_url": "https://try.gogs.io/zork/beyond-the-titanic",
"ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git",
"clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git",
"website": "",
"stars_count": 0,
"forks_count": 1,
"watchers_count": 1,
"open_issues_count": 0,
"default_branch": "master",
"created_at": "2015-03-03T22:51:12Z",
"updated_at": "2022-03-26T07:28:38Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
}
},
"empty": false,
"mirror": false,
"size": 1437696,
"html_url": "https://try.gogs.io/perekre/beyond-the-titanic",
"ssh_url": "git@try.gogs.io:perekre/beyond-the-titanic.git",
"clone_url": "https://try.gogs.io/perekre/beyond-the-titanic.git",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"default_branch": "master",
"created_at": "2015-03-04T10:40:46Z",
"updated_at": "2022-03-26T07:28:38Z"
},
{
"id": 369,
"owner": {
"id": 108,
"username": "yinheli",
"login": "yinheli",
"full_name": "",
"email": "me@yinheli.com",
"avatar_url": "https://secure.gravatar.com/avatar/dedb067ecae8155b87428ac7920dd0ae?d=identicon"
},
"name": "digits",
"full_name": "yinheli/digits",
"description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.",
"private": false,
"fork": true,
"parent": {
"id": 339,
"owner": {
"id": 585,
"username": "zork",
"login": "zork",
"full_name": "",
"email": "f905334@trbvm.com",
"avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
},
"name": "digits",
"full_name": "zork/digits",
"description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.",
"private": false,
"fork": false,
"parent": null,
"empty": false,
"mirror": false,
"size": 18432,
"html_url": "https://try.gogs.io/zork/digits",
"ssh_url": "git@try.gogs.io:zork/digits.git",
"clone_url": "https://try.gogs.io/zork/digits.git",
"website": "",
"stars_count": 0,
"forks_count": 1,
"watchers_count": 1,
"open_issues_count": 0,
"default_branch": "master",
"created_at": "2015-03-03T22:47:56Z",
"updated_at": "2022-03-26T07:28:38Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
}
},
"empty": false,
"mirror": false,
"size": 18432,
"html_url": "https://try.gogs.io/yinheli/digits",
"ssh_url": "git@try.gogs.io:yinheli/digits.git",
"clone_url": "https://try.gogs.io/yinheli/digits.git",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"default_branch": "master",
"created_at": "2015-03-06T01:31:17Z",
"updated_at": "2022-03-26T07:28:38Z"
}
],
"ok": true
}

View file

@ -0,0 +1 @@
{"data":[{"id":380,"owner":{"id":653,"username":"gdr","login":"gdr","full_name":"","email":"gdr@gdr.name","avatar_url":"https://secure.gravatar.com/avatar/237e2bf0a3687301ed4ef3c65e56c672?d=identicon"},"name":"ttrss-af_nofacebook","full_name":"gdr/ttrss-af_nofacebook","description":"Tiny Tiny RSS plugin for removing Facebook's l.php links","private":false,"fork":false,"parent":null,"empty":false,"mirror":true,"size":4096,"html_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook","ssh_url":"git@try.gogs.io:gdr/ttrss-af_nofacebook.git","clone_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-06T11:39:30Z","updated_at":"2022-03-26T07:28:38Z"},{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z"},{"id":403,"owner":{"id":687,"username":"mirdhyn","login":"mirdhyn","full_name":"","email":"mirdhyn@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/047818f3fffe0df833958ea40a25fd5c?d=identicon"},"name":"gosgp","full_name":"mirdhyn/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":true,"parent":{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z","permissions":{"admin":false,"push":false,"pull":true}},"empty":false,"mirror":false,"size":48128,"html_url":"https://try.gogs.io/mirdhyn/gosgp","ssh_url":"git@try.gogs.io:mirdhyn/gosgp.git","clone_url":"https://try.gogs.io/mirdhyn/gosgp.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-09T21:34:21Z","updated_at":"2022-03-26T07:28:38Z"}],"ok":true}

View file

@ -11,46 +11,84 @@ from unittest.mock import Mock
import pytest
from requests import HTTPError
from swh.lister.gogs.lister import GogsLister
from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id
from swh.scheduler.model import ListedOrigin
TRY_GOGS_URL = "https://try.gogs.io/api/v1/"
def try_gogs_page(n: int):
return TRY_GOGS_URL + f"repos/search?page={n}&limit=3"
return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3"
P1 = try_gogs_page(1)
P2 = try_gogs_page(2)
P3 = try_gogs_page(3)
P4 = try_gogs_page(4)
@pytest.fixture
def trygogs_p1(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text()
headers = {
"Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2))
}
page_result = GogsLister.results_simplified(json.loads(text))
origin_urls = [r["clone_url"] for r in page_result]
headers = {"Link": f'<{P2}>; rel="next"'}
page_result = GogsListerPage(
repos=GogsLister.extract_repos(json.loads(text)), next_link=P2
)
origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p2(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text()
headers = {
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
}
page_result = GogsLister.results_simplified(json.loads(text))
origin_urls = [r["clone_url"] for r in page_result]
headers = {"Link": f'<{P3}>; rel="next",<{P1}>; rel="prev"'}
page_result = GogsListerPage(
repos=GogsLister.extract_repos(json.loads(text)), next_link=P3
)
origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_empty_page():
def trygogs_p3(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
headers = {"Link": f'<{P4}>; rel="next",<{P2}>; rel="prev"'}
page_result = GogsListerPage(
repos=GogsLister.extract_repos(json.loads(text)), next_link=P3
)
origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p4(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page4").read_text()
headers = {"Link": f'<{P3}>; rel="prev"'}
page_result = GogsListerPage(
repos=GogsLister.extract_repos(json.loads(text)), next_link=P3
)
origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p3_last(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
page_result = GogsListerPage(
repos=GogsLister.extract_repos(json.loads(text)), next_link=None
)
origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p3_empty():
origins_urls = []
page_result = {"data": [], "ok": True}
headers = {
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
}
text = json.dumps(page_result)
body = {"data": [], "ok": True}
headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None)
text = json.dumps(body)
return text, headers, page_result, origins_urls
@ -69,7 +107,7 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO
def test_gogs_full_listing(
swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page
swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3_last
):
kwargs = dict(
url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
@ -80,84 +118,212 @@ def test_gogs_full_listing(
p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
p3_text, p3_headers, _, _ = trygogs_empty_page
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last
requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers)
requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers)
requests_mock.get(P1, text=p1_text, headers=p1_headers)
requests_mock.get(P2, text=p2_text, headers=p2_headers)
requests_mock.get(P3, text=p3_text, headers=p3_headers)
stats = lister.run()
assert stats.pages == 2
assert stats.origins == 6
assert stats.pages == 3
assert stats.origins == 9
calls = [mocker.call(p1_result), mocker.call(p2_result)]
lister.get_origins_from_page.assert_has_calls(calls)
calls = map(mocker.call, [p1_result, p2_result, p3_result])
lister.get_origins_from_page.assert_has_calls(list(calls))
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
check_listed_origins(
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
)
assert lister.get_state_from_scheduler() is None
assert (
lister.get_state_from_scheduler().last_seen_next_link == P3
) # P3 didn't provide any next link so it remains the last_seen_next_link
def test_gogs_auth_instance(
swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page
swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty
):
"""Covers token authentication, token from credentials,
instance inference from URL."""
api_token = "secret"
instance = "try.gogs.io"
creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
instance = "try_gogs"
kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
lister = GogsLister(scheduler=swh_scheduler, **kwargs1)
# Test lister initialization without api_token or credentials:
with pytest.raises(ValueError, match="No credentials or API token provided"):
kwargs1 = dict(url=TRY_GOGS_URL, instance=instance)
GogsLister(scheduler=swh_scheduler, **kwargs1)
# test API token
assert "Authorization" in lister.session.headers
# Test lister initialization using api_token:
kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
lister = GogsLister(scheduler=swh_scheduler, **kwargs2)
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
with pytest.raises(ValueError, match="No credentials or API token provided"):
kwargs2 = dict(url=TRY_GOGS_URL, instance=instance)
GogsLister(scheduler=swh_scheduler, **kwargs2)
# Test lister initialization with credentials and run it:
creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3)
lister = GogsLister(scheduler=swh_scheduler, **kwargs3)
# test API token from credentials
assert "Authorization" in lister.session.headers
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
# test instance inference from URL
assert lister.instance
assert "gogs" in lister.instance
assert lister.instance == "try_gogs"
# setup requests mocking
p1_text, p1_headers, _, _ = trygogs_p1
p2_text, p2_headers, _, _ = trygogs_empty_page
p2_text, p2_headers, _, _ = trygogs_p2
p3_text, p3_headers, _, _ = trygogs_p3_empty
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
requests_mock.get(base_url, text=p1_text, headers=p1_headers)
requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
# now check the lister runs without error
requests_mock.get(P1, text=p1_text, headers=p1_headers)
requests_mock.get(P2, text=p2_text, headers=p2_headers)
requests_mock.get(P3, text=p3_text, headers=p3_headers)
# lister should run without any error and extract the origins
stats = lister.run()
assert stats.pages == 2
assert stats.origins == 3
assert stats.pages == 3
assert stats.origins == 6
@pytest.mark.parametrize("http_code", [400, 500, 502])
def test_gogs_list_http_error(swh_scheduler, requests_mock, http_code):
def test_gogs_list_http_error(
swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last
):
"""Test handling of some HTTP errors commonly encountered"""
lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret")
p1_text, p1_headers, _, p1_origin_urls = trygogs_p1
p3_text, p3_headers, _, _ = trygogs_p3_last
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
requests_mock.get(base_url, status_code=http_code)
requests_mock.get(
base_url,
[
{"text": p1_text, "headers": p1_headers, "status_code": 200},
{"status_code": http_code},
{"text": p3_text, "headers": p3_headers, "status_code": 200},
],
)
with pytest.raises(HTTPError):
lister.run()
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 0
check_listed_origins(
p1_origin_urls, scheduler_origins
) # Only the first page is listed
def test_gogs_incremental_lister(
swh_scheduler,
requests_mock,
mocker,
trygogs_p1,
trygogs_p2,
trygogs_p3,
trygogs_p3_last,
trygogs_p3_empty,
trygogs_p4,
):
kwargs = dict(
url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
)
lister = GogsLister(scheduler=swh_scheduler, **kwargs)
lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page")
# First listing attempt: P1 and P2 return 3 origins each
# while P3 (current last page) is empty.
p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_empty
requests_mock.get(P1, text=p1_text, headers=p1_headers)
requests_mock.get(P2, text=p2_text, headers=p2_headers)
requests_mock.get(P3, text=p3_text, headers=p3_headers)
attempt1_stats = lister.run()
assert attempt1_stats.pages == 3
assert attempt1_stats.origins == 6
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
lister_state = lister.get_state_from_scheduler()
assert lister_state.last_seen_next_link == P3
assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"]
assert lister.updated
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
lister.updated = False # Reset the flag
# Second listing attempt: P3 isn't empty anymore.
# The lister should restart from last state and hence revisit P3.
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last
requests_mock.get(P3, text=p3_text, headers=p3_headers)
lister.session.get = mocker.spy(lister.session, "get")
attempt2_stats = lister.run()
assert attempt2_stats.pages == 1
assert attempt2_stats.origins == 3
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
page_id = _parse_page_id(lister_state.last_seen_next_link)
query_params = lister.query_params
query_params["page"] = page_id
lister.session.get.assert_called_once_with(
TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
)
# All the 9 origins (3 pages) should be passed on to the scheduler:
check_listed_origins(
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
)
lister_state = lister.get_state_from_scheduler()
assert lister_state.last_seen_next_link == P3
assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"]
assert lister.updated
lister.updated = False # Reset the flag
# Third listing attempt: No new origins
# The lister should revisit last seen page (P3)
attempt3_stats = lister.run()
assert attempt3_stats.pages == 1
assert attempt3_stats.origins == 3
lister_state = lister.get_state_from_scheduler()
assert lister_state.last_seen_next_link == P3
assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"]
assert lister.updated is False # No new origins so state isn't updated.
# Fourth listing attempt: Page 4 is introduced and returns 3 new origins
# The lister should revisit last seen page (P3) as well as P4.
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 # new P3 points to P4
p4_text, p4_headers, p4_result, p4_origin_urls = trygogs_p4
requests_mock.get(P3, text=p3_text, headers=p3_headers)
requests_mock.get(P4, text=p4_text, headers=p4_headers)
attempt4_stats = lister.run()
assert attempt4_stats.pages == 2
assert attempt4_stats.origins == 6
lister_state = lister.get_state_from_scheduler()
assert lister_state.last_seen_next_link == P4
assert lister_state.last_seen_repo_id == p4_result.repos[-1]["id"]
assert lister.updated
# All the 12 origins (4 pages) should be passed on to the scheduler:
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
check_listed_origins(
p1_origin_urls + p2_origin_urls + p3_origin_urls + p4_origin_urls,
scheduler_origins,
)