Arch: Add throttling retry for scrapping and resources download

2022-08-30 09:50:29 +02:00 · 2022-08-30 09:50:29 +02:00 · 0acf5b0f4f
commit 0acf5b0f4f
parent b7b11887a0
1 changed files with 29 additions and 2 deletions
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@ -12,11 +12,14 @@ from urllib.parse import unquote, urljoin

 from bs4 import BeautifulSoup
 import requests
+from tenacity.before_sleep import before_sleep_log

+from swh.lister.utils import throttling_retry
 from swh.model.hashutil import hash_to_hex
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin

+from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister

 logger = logging.getLogger(__name__)
@ -122,6 +125,29 @@ class ArchLister(StatelessLister[ArchListerPage]):
        )

        self.flavours = flavours
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
+
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response

    def scrap_package_versions(
        self, name: str, repo: str, base_url: str
@ -153,7 +179,8 @@ class ArchLister(StatelessLister[ArchListerPage]):
        url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
            pkgname=name, base_url=base_url
        )
-        soup = BeautifulSoup(requests.get(url).text, "html.parser")
+        response = self.request_get(url=url, params={})
+        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)

        # drop the first line (used to go to up directory)
@ -236,7 +263,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
        Returns:
            a directory Path where the archive has been extracted to.
        """
-        res = requests.get(url)
+        res = self.request_get(url=url, params={})
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(res.content)