bitbucket: Skip buggy page when listing

Some URLs of the repositories endpoint of the Bitbucket REST API 2.0
can return an error 500. In that case, skip the buggy repositories
page and fetch the next one, so that listing continues instead of
ending prematurely.

Related to #4239
This commit is contained in:
Antoine Lambert 2023-03-09 14:26:29 +01:00
parent 7da7fa57d0
commit 5d0f35aa69
3 changed files with 55 additions and 6 deletions

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# Copyright (C) 2017-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -11,6 +11,7 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib import parse
import iso8601
from requests import HTTPError
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -118,9 +119,22 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
while True:
self.url_params["after"] = last_repo_cdate
body = self.http_request(self.url, params=self.url_params).json()
yield body["values"]
try:
body = self.http_request(self.url, params=self.url_params).json()
yield body["values"]
except HTTPError as e:
if e.response.status_code == 500:
logger.warning(
"URL %s is buggy (error 500), skip it and get next page.",
e.response.url,
)
body = self.http_request(
self.url,
params={
"pagelen": self.url_params["pagelen"],
"fields": "next",
},
).json()
next_page_url = body.get("next")
if next_page_url is not None:

View file

@ -161,5 +161,5 @@
}
}
],
"next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00&fields=next%2Cvalues.links.clone.href%2Cvalues.slug%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on"
"next": "https://api.bitbucket.org/2.0/repositories?pagelen=10&fields=next%2Cvalues.links.clone.href%2Cvalues.scm%2Cvalues.updated_on%2Cvalues.created_on&after=2011-09-03T12%3A33%3A16.028393%2B00%3A00"
}

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# Copyright (C) 2017-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -10,6 +10,7 @@ import os
import pytest
from swh.lister.bitbucket.lister import BitbucketLister
from swh.lister.utils import MAX_NUMBER_ATTEMPTS
@pytest.fixture
@ -178,3 +179,37 @@ def test_bitbucket_full_lister(
)
_check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins)
def test_bitbucket_lister_buggy_page(
    swh_scheduler,
    requests_mock,
    mocker,
    bb_api_repositories_page1,
    bb_api_repositories_page2,
):
    """Check that listing continues past a page answering with error 500.

    The mocked API serves the first repositories page, then answers with
    error 500 ``MAX_NUMBER_ATTEMPTS`` times, then returns a body holding only
    a ``next`` link, and finally serves the second repositories page. The
    lister is expected to report both valid pages and all their origins.
    """
    second_page_url = bb_api_repositories_page1["next"]

    # Consecutive failing answers served right after the first valid page.
    server_errors = [{"json": None, "status_code": 500}] * MAX_NUMBER_ATTEMPTS
    mocked_responses = [
        {"json": bb_api_repositories_page1, "status_code": 200},
        *server_errors,
        # Answer to the recovery request: only the link to the next page.
        {"json": {"next": second_page_url}, "status_code": 200},
        {"json": bb_api_repositories_page2, "status_code": 200},
    ]
    requests_mock.get(BitbucketLister.API_URL, mocked_responses)

    lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)
    # Presumably keeps the retry logic from really waiting between attempts.
    mocker.patch.object(lister.http_request.retry, "sleep")

    stats = lister.run()

    assert stats.pages == 2
    assert stats.origins == 20

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed_origins) == 20

    # Request history layout: index 0 is the first page, the next
    # MAX_NUMBER_ATTEMPTS entries are the failing requests, then comes the
    # recovery request, so the second page request sits right after it.
    second_page_request = requests_mock.request_history[MAX_NUMBER_ATTEMPTS + 2]
    assert second_page_request.url == second_page_url