cgit: Enable retrying of throttled HTTP requests

Related to T3645
This commit is contained in:
Antoine Lambert 2021-10-22 15:15:05 +02:00
parent 20232cc36e
commit 24bc671679
2 changed files with 31 additions and 0 deletions

View file

@ -11,9 +11,11 @@ from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -74,6 +76,7 @@ class CGitLister(StatelessLister[Repositories]):
)
self.base_git_url = base_git_url
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)

View file

@ -3,6 +3,7 @@
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
import os
from typing import List
import pytest
@ -229,3 +230,30 @@ def test_lister_cgit_with_base_git_url(
assert (
listed_origin.url.startswith(url) is False
), f"url should be mapped to {base_git_url}"
def test_lister_cgit_get_pages_with_pages_and_retry(
    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
    """Check that page listing survives throttled (HTTP 429) responses.

    The ``?ofs=50`` page is mocked to answer 429 twice before returning the
    recorded page content with a 200 status; the lister is still expected to
    yield every page and every repository.
    """
    url = "https://git.tizen/cgit/"

    # Load the recorded page up front; its bytes are served as the third
    # (successful) answer below.
    recorded_page = os.path.join(datadir, "https_git.tizen/cgit,ofs=50")
    with open(recorded_page, "rb") as page:
        page_content = page.read()

    # requests_mock serves the entries of a response list one per request,
    # in order: two throttled answers, then the real page.
    responses = [
        {"content": None, "status_code": 429},
        {"content": None, "status_code": 429},
        {"content": page_content, "status_code": 200},
    ]
    requests_mock.get(f"{url}?ofs=50", responses)

    lister_cgit = CGitLister(swh_scheduler, url=url)

    # Replace the retry object's ``sleep`` with a mock -- presumably so the
    # test does not actually wait between attempts.
    mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep")

    repos: List[List[str]] = list(lister_cgit.get_pages())
    flattened_repos = [repo for page_repos in repos for repo in page_repos]

    # 3 pages are expected, holding 16 repositories in total
    assert len(repos) == 3
    assert len(flattened_repos) == 16