Refactor and deduplicate HTTP requests code in listers

Numerous listers were using the same page_request method, or an
equivalent, in their implementations, so deduplicate that code by
adding an http_request method to the base lister class
swh.lister.pattern.Lister.

That method simply wraps a call to requests.Session.request, logs some
useful info for debugging and error reporting, and raises an HTTPError
if a request ends up in error.

All listers using that new method now benefit from request retries
when an HTTP error occurs, thanks to the use of the http_retry
decorator.
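
For context, a minimal sketch of what such an http_request method can look
like on the base class, modelled on the per-lister request_get helpers
removed in the diff below (the exact signature and session setup in
swh.lister.pattern.Lister may differ):

import logging

import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import http_retry

logger = logging.getLogger(__name__)


class Lister:
    # Excerpt limited to the new helper; the real base class carries more state.

    def __init__(self) -> None:
        self.session = requests.Session()

    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def http_request(self, url: str, method: str = "GET", **kwargs) -> requests.Response:
        # Log the outgoing request to ease debugging of listing runs.
        logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
        response = self.session.request(method, url, **kwargs)
        if response.status_code != 200:
            # Keep the response body in the logs for error reporting.
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # Raise requests.HTTPError on 4xx/5xx responses; combined with the
        # http_retry decorator above, transient errors are retried automatically.
        response.raise_for_status()
        return response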
Antoine Lambert 2022-09-21 19:53:22 +02:00
parent 9c55acd286
commit db6ce12e9e
28 changed files with 174 additions and 449 deletions

@@ -13,15 +13,11 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote, urljoin
from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@@ -125,29 +121,6 @@ class ArchLister(StatelessLister[ArchListerPage]):
)
self.flavours = flavours
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": USER_AGENT,
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def scrap_package_versions(
self, name: str, repo: str, base_url: str
@@ -179,7 +152,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
pkgname=name, base_url=base_url
)
response = self.request_get(url=url, params={})
response = self.http_request(url)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
@@ -263,7 +236,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
Returns:
a directory Path where the archive has been extracted to.
"""
res = self.request_get(url=url, params={})
res = self.http_request(url)
destination_path.parent.mkdir(parents=True, exist_ok=True)
destination_path.write_bytes(res.content)
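
For illustration, a hedged usage sketch of a call site that still needs query
parameters after this change, assuming http_request forwards the usual
requests keyword arguments the way the removed helpers did (the method and
parameter names below are illustrative, not part of the commit):

def get_page(self, base_url: str, page: int) -> dict:
    # Retries and HTTP error handling are delegated to http_request /
    # http_retry in the base class, so the lister only parses the payload.
    response = self.http_request(base_url, params={"page": page})
    return response.json()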