Arch: Add throttling retry for scrapping and resources download

This commit is contained in:
Franck Bret 2022-08-30 09:50:29 +02:00
parent b7b11887a0
commit 0acf5b0f4f

View file

@ -12,11 +12,14 @@ from urllib.parse import unquote, urljoin
from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@ -122,6 +125,29 @@ class ArchLister(StatelessLister[ArchListerPage]):
)
self.flavours = flavours
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": USER_AGENT,
}
)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def scrap_package_versions(
self, name: str, repo: str, base_url: str
@ -153,7 +179,8 @@ class ArchLister(StatelessLister[ArchListerPage]):
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
pkgname=name, base_url=base_url
)
soup = BeautifulSoup(requests.get(url).text, "html.parser")
response = self.request_get(url=url, params={})
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
# drop the first line (used to go to up directory)
@ -236,7 +263,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
Returns:
a directory Path where the archive has been extracted to.
"""
res = requests.get(url)
res = self.request_get(url=url, params={})
destination_path.parent.mkdir(parents=True, exist_ok=True)
destination_path.write_bytes(res.content)