Refactor and deduplicate HTTP requests code in listers

Numerous listers were using the same page_request method, or an equivalent, in their implementation. Deduplicate that code by adding an http_request method to the base lister class, swh.lister.pattern.Lister. That method simply wraps a call to requests.Session.request and logs some useful information for debugging and error reporting; it also raises an HTTPError if a request ends up with an error. All listers using the new method now benefit from automatic request retries when an HTTP error occurs, thanks to the use of the http_retry decorator.
2022-09-21 19:53:22 +02:00 · 2022-09-21 19:53:22 +02:00 · db6ce12e9e
commit db6ce12e9e
parent 9c55acd286
28 changed files with 174 additions and 449 deletions
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@ -13,10 +13,8 @@ from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 import lxml
 import requests
-from tenacity.before_sleep import before_sleep_log

 from swh.core.github.utils import GitHubSession
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin

@ -93,13 +91,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
            instance=instance,
        )

-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
+        self.session.headers.update({"Accept": "application/json"})

        self.jar_origins: Dict[str, ListedOrigin] = {}
        self.github_session = GitHubSession(
@ -112,23 +104,6 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        return asdict(state)

-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
-
    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to
        identify all pom files and src archives.
@ -155,10 +130,11 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
        # Download the main text index file.
        logger.info("Downloading computed index from %s.", self.INDEX_URL)
        assert self.INDEX_URL is not None
-        response = requests.get(self.INDEX_URL, stream=True)
-        if response.status_code != 200:
+        try:
+            response = self.http_request(self.INDEX_URL, stream=True)
+        except requests.HTTPError:
            logger.error("Index %s not found, stopping", self.INDEX_URL)
-            response.raise_for_status()
+            raise

        # Prepare regexes to parse index exports.

@ -250,9 +226,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
        # Now fetch pom files and scan them for scm info.

        logger.info("Fetching poms..")
-        for pom in out_pom:
+        for pom_url in out_pom:
            try:
-                response = self.page_request(pom, {})
+                response = self.http_request(pom_url)
                parsed_pom = BeautifulSoup(response.content, "xml")
                project = parsed_pom.find("project")
                if project is None:
@ -263,22 +239,24 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
                    if connection is not None:
                        artifact_metadata_d = {
                            "type": "scm",
-                            "doc": out_pom[pom],
+                            "doc": out_pom[pom_url],
                            "url": connection.text,
                        }
-                        logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
+                        logger.debug(
+                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
+                        )
                        yield artifact_metadata_d
                    else:
-                        logger.debug("No scm.connection in pom %s", pom)
+                        logger.debug("No scm.connection in pom %s", pom_url)
                else:
-                    logger.debug("No scm in pom %s", pom)
+                    logger.debug("No scm in pom %s", pom_url)
            except requests.HTTPError:
                logger.warning(
                    "POM info page could not be fetched, skipping project '%s'",
-                    pom,
+                    pom_url,
                )
            except lxml.etree.Error as error:
-                logger.info("Could not parse POM %s XML: %s.", pom, error)
+                logger.info("Could not parse POM %s XML: %s.", pom_url, error)

    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
        """Retrieve scm origin out of the page information. Only called when type of the
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@ -127,7 +127,7 @@ def network_requests_mock(

@pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(MavenLister.page_request.retry, "sleep")
+    mocker.patch.object(MavenLister.http_request.retry, "sleep")


 def test_maven_full_listing(swh_scheduler):