Refactor and deduplicate HTTP requests code in listers

Numerous listers were using the same page_request method or equivalent
in their implementation so prefer to deduplicate that code by adding
an http_request method in base lister class: swh.lister.pattern.Lister.

That method simply wraps a call to requests.Session.request and logs
some useful info for debugging and error reporting, also an HTTPError
will be raised if a request ends up with an error.

All listers using that new method now benefit of requests retry when
an HTTP error occurs thanks to the use of the http_retry decorator.
This commit is contained in:
Antoine Lambert 2022-09-21 19:53:22 +02:00
parent 9c55acd286
commit db6ce12e9e
28 changed files with 174 additions and 449 deletions

View file

@ -1,4 +1,4 @@
# Copyright (C) 2020-2021 The Software Heritage developers
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -6,14 +6,23 @@
from __future__ import annotations
from dataclasses import dataclass
import logging
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
from urllib.parse import urlparse
import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.config import load_from_envvar
from swh.core.utils import grouper
from swh.scheduler import get_scheduler, model
from swh.scheduler.interface import SchedulerInterface
from . import USER_AGENT
from .utils import http_retry
logger = logging.getLogger(__name__)
@dataclass
class ListerStats:
@ -113,6 +122,27 @@ class Lister(Generic[StateType, PageType]):
self.state = self.get_state_from_scheduler()
self.updated = False
self.session = requests.Session()
# Declare the USER_AGENT is more sysadm-friendly for the forge we list
self.session.headers.update({"User-Agent": USER_AGENT})
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
response = self.session.request(method, url, **kwargs)
if response.status_code not in (200, 304):
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def run(self) -> ListerStats:
"""Run the lister.