Refactor and deduplicate HTTP requests code in listers

Numerous listers were using the same page_request method, or an
equivalent, in their implementations, so deduplicate that code by
adding an http_request method to the base lister class
swh.lister.pattern.Lister.

That method simply wraps a call to requests.Session.request, logs some
useful info for debugging and error reporting, and raises an HTTPError
if a request ends up in error.

All listers using that new method now benefit from request retries
when an HTTP error occurs, thanks to the use of the http_retry
decorator.
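
For context, a minimal sketch of what such an http_request method can look
like on the base class, modelled on the per-lister request_get helpers
removed in the diff below (the exact signature and session setup in
swh.lister.pattern.Lister may differ):

import logging

import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import http_retry

logger = logging.getLogger(__name__)


class Lister:
    # Excerpt limited to the new helper; the real base class carries more state.

    def __init__(self) -> None:
        self.session = requests.Session()

    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def http_request(self, url: str, method: str = "GET", **kwargs) -> requests.Response:
        # Log the outgoing request to ease debugging of listing runs.
        logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
        response = self.session.request(method, url, **kwargs)
        if response.status_code != 200:
            # Keep the response body in the logs for error reporting.
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # Raise requests.HTTPError on 4xx/5xx responses; combined with the
        # http_retry decorator above, transient errors are retried automatically.
        response.raise_for_status()
        return response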
Antoine Lambert 2022-09-21 19:53:22 +02:00
parent 9c55acd286
commit db6ce12e9e
28 changed files with 174 additions and 449 deletions

@@ -13,15 +13,11 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote, urljoin
from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@@ -125,29 +121,6 @@ class ArchLister(StatelessLister[ArchListerPage]):
)
self.flavours = flavours
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": USER_AGENT,
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def scrap_package_versions(
self, name: str, repo: str, base_url: str
@@ -179,7 +152,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
pkgname=name, base_url=base_url
)
response = self.request_get(url=url, params={})
response = self.http_request(url)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
@@ -263,7 +236,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
Returns:
a directory Path where the archive has been extracted to.
"""
res = self.request_get(url=url, params={})
res = self.http_request(url)
destination_path.parent.mkdir(parents=True, exist_ok=True)
destination_path.write_bytes(res.content)
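
For illustration, a hedged usage sketch of a call site that still needs query
parameters after this change, assuming http_request forwards the usual
requests keyword arguments the way the removed helpers did (the method and
parameter names below are illustrative, not part of the commit):

def get_page(self, base_url: str, page: int) -> dict:
    # Retries and HTTP error handling are delegated to http_request /
    # http_retry in the base class, so the lister only parses the payload.
    response = self.http_request(base_url, params={"page": page})
    return response.json()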