Refactor and deduplicate HTTP requests code in listers
Numerous listers were using the same page_request method, or an equivalent, in their implementations, so deduplicate that code by adding an http_request method to the base lister class swh.lister.pattern.Lister. That method simply wraps a call to requests.Session.request and logs some useful information for debugging and error reporting; an HTTPError is also raised if a request ends in an error. All listers using this new method now benefit from request retries when an HTTP error occurs, thanks to the use of the http_retry decorator.
This commit is contained in:
parent
9c55acd286
commit
db6ce12e9e
28 changed files with 174 additions and 449 deletions
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# Copyright (C) 2019-2022 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -9,13 +9,9 @@ from typing import Any, Dict, Iterator, List, Optional
|
|||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.pattern import CredentialsType, StatelessLister
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -73,17 +69,12 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
credentials=credentials,
|
||||
)
|
||||
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{"Accept": "application/html", "User-Agent": USER_AGENT}
|
||||
)
|
||||
self.session.headers.update({"Accept": "application/html"})
|
||||
self.base_git_url = base_git_url
|
||||
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
|
||||
def _get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
|
||||
response = self.session.get(url)
|
||||
response.raise_for_status()
|
||||
response = self.http_request(url)
|
||||
return BeautifulSoup(response.text, features="html.parser")
|
||||
|
||||
def get_pages(self) -> Iterator[Repositories]:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# Copyright (C) 2019-2022 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -258,7 +258,7 @@ def test_lister_cgit_get_pages_with_pages_and_retry(
|
|||
|
||||
lister_cgit = CGitLister(swh_scheduler, url=url)
|
||||
|
||||
mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep")
|
||||
mocker.patch.object(lister_cgit.http_request.retry, "sleep")
|
||||
|
||||
repos: List[List[str]] = list(lister_cgit.get_pages())
|
||||
flattened_repos = sum(repos, [])
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue