Arch: Add throttling retry for scraping and resource downloads
parent b7b11887a0
commit 0acf5b0f4f

1 changed file with 29 additions and 2 deletions
@@ -12,11 +12,14 @@ from urllib.parse import unquote, urljoin
 
 from bs4 import BeautifulSoup
 import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
 from swh.model.hashutil import hash_to_hex
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
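For context, throttling_retry is the rate-limit-aware retry decorator shipped in swh.lister.utils, built on tenacity. A minimal sketch of the idea follows; the predicate name, the exp_base=10 backoff, and the 5-attempt bound are assumptions for illustration, not taken from this diff:

# Sketch of a throttling-aware retry decorator built on tenacity.
# The real swh.lister.utils.throttling_retry may differ in details;
# is_throttling_exception, exp_base=10 and the attempt bound are assumed.
from requests.exceptions import HTTPError
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential


def is_throttling_exception(e: BaseException) -> bool:
    # HTTP 429 "Too Many Requests" signals server-side rate limiting
    return (
        isinstance(e, HTTPError)
        and e.response is not None
        and e.response.status_code == 429
    )


def throttling_retry(**retry_args):
    # Retry only on throttling errors, backing off exponentially between
    # tries; extra arguments (e.g. before_sleep=...) are forwarded to
    # tenacity.retry.
    return retry(
        retry=retry_if_exception(is_throttling_exception),
        wait=wait_exponential(exp_base=10),
        stop=stop_after_attempt(5),
        **retry_args,
    )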
@ -122,6 +125,29 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
)
|
||||
|
||||
self.flavours = flavours
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{
|
||||
"User-Agent": USER_AGENT,
|
||||
}
|
||||
)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.info("Fetching URL %s with params %s", url, params)
|
||||
|
||||
response = self.session.get(url, params=params)
|
||||
if response.status_code != 200:
|
||||
logger.warning(
|
||||
"Unexpected HTTP status code %s on %s: %s",
|
||||
response.status_code,
|
||||
response.url,
|
||||
response.content,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
return response
|
||||
|
||||
def scrap_package_versions(
|
||||
self, name: str, repo: str, base_url: str
|
||||
|
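The key interaction here: response.raise_for_status() turns an HTTP 429 into a requests HTTPError, which the throttling_retry decorator recognizes as a throttling condition and retries after an exponential backoff, logging each sleep at WARNING level via before_sleep_log. Non-throttling failures (404, 500, ...) are not retried and raise immediately. A hypothetical call; the scheduler and the URL are placeholders:

# Hypothetical usage; `scheduler` and the URL are illustrative only.
lister = ArchLister(scheduler=scheduler)
response = lister.request_get(
    url="https://archive.archlinux.org/packages/z/zstd", params={}
)
# On a 429 the call above sleeps and retries transparently; any other
# HTTP error surfaces as requests.exceptions.HTTPError on the first try.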
@ -153,7 +179,8 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
|
||||
pkgname=name, base_url=base_url
|
||||
)
|
||||
soup = BeautifulSoup(requests.get(url).text, "html.parser")
|
||||
response = self.request_get(url=url, params={})
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
links = soup.find_all("a", href=True)
|
||||
|
||||
# drop the first line (used to go to up directory)
|
||||
|
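With the helper in place, version-page scraping gets the same retry and logging behavior as every other request. The surrounding logic, visible in the context lines, parses the directory listing and skips the first anchor, which points back to the parent directory. A condensed sketch of that flow, reusing the lister from the previous example and a made-up URL:

# Condensed sketch of the version-scraping flow; URL is illustrative
# and `lister` is the ArchLister instance from the earlier sketch.
from bs4 import BeautifulSoup

response = lister.request_get(
    url="https://archive.archlinux.org/packages/z/zstd", params={}
)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
# the first link is "../" (back to the parent directory), so skip it
version_links = links[1:]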
@ -236,7 +263,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
Returns:
|
||||
a directory Path where the archive has been extracted to.
|
||||
"""
|
||||
res = requests.get(url)
|
||||
res = self.request_get(url=url, params={})
|
||||
destination_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
destination_path.write_bytes(res.content)
|
||||
|
||||
|
|
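One way to exercise the new retry path without waiting out real backoff sleeps is to stub tenacity's sleep on the decorated method and feed requests_mock a 429 followed by a 200. A hypothetical pytest sketch; the test name, fixtures, and URL are assumed, and requests_mock plus pytest-mock are assumed available:

# Hypothetical pytest sketch, not part of this commit.
import requests_mock


def test_request_get_retries_on_throttling(mocker):
    lister = ArchLister(scheduler=mocker.MagicMock())  # scheduler stubbed
    # tenacity exposes its retry controller on the wrapped function;
    # patching its sleep avoids waiting out the exponential backoff.
    mocker.patch.object(lister.request_get.retry, "sleep")
    with requests_mock.Mocker() as m:
        m.get(
            "https://example.org/throttled",
            [{"status_code": 429}, {"status_code": 200, "text": "ok"}],
        )
        response = lister.request_get(url="https://example.org/throttled", params={})
    assert response.status_code == 200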