Refactor and deduplicate HTTP request code in listers
Numerous listers implemented the same page_request method, or an equivalent, so deduplicate that code by adding an http_request method to the base lister class swh.lister.pattern.Lister. That method simply wraps a call to requests.Session.request and logs useful information for debugging and error reporting; an HTTPError is raised if a request ends in error. All listers using the new method now benefit from request retries on HTTP errors, thanks to the http_retry decorator.
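For illustration, a minimal self-contained sketch of the deduplicated helper, mirroring the http_request method added to swh.lister.pattern.Lister in the diff below (assumes swh.lister and its dependencies are installed; the User-Agent value is a placeholder, not the real USER_AGENT constant):

import logging

import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import http_retry

logger = logging.getLogger(__name__)

session = requests.Session()
session.headers.update({"User-Agent": "swh-lister-sketch/1.0"})  # placeholder UA


@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(url: str, method: str = "GET", **kwargs) -> requests.Response:
    # Log the fetched URL and params to ease debugging
    logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
    response = session.request(method, url, **kwargs)
    if response.status_code not in (200, 304):
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            response.status_code,
            response.url,
            response.content,
        )
        # Raising HTTPError here lets the http_retry decorator retry the call
        response.raise_for_status()
    return response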
parent 9c55acd286
commit db6ce12e9e

28 changed files with 174 additions and 449 deletions
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,14 +8,9 @@ import logging
 from typing import Any, Dict, Iterator, List
 from urllib.parse import urljoin
 
-import requests
-from tenacity.before_sleep import before_sleep_log
-
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -65,11 +60,7 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
             instance=instance,
         )
 
-        self.session = requests.Session()
-        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
     def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
         return NewForgeListerState(**d)
@@ -77,32 +68,6 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
     def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
         return asdict(state)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url, params) -> requests.Response:
-        # Do the network resource request under a retrying decorator
-        # to handle rate limiting and transient errors up to a limit.
-        # `http_retry` by default use the `requests` library to check
-        # only for rate-limit and a base-10 exponential waiting strategy.
-        # This can be customized by passed waiting, retrying and logging strategies
-        # as functions. See the `tenacity` library documentation.
-
-        # Log listed URL to ease debugging
-        logger.debug("Fetching URL %s with params %s", url, params)
-        response = self.session.get(url, params=params)
-
-        if response.status_code != 200:
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            # The lister must fail on blocking errors
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[NewForgeListerPage]:
         # The algorithm depends on the service, but should request data reliably,
         # following pagination if relevant and yielding pages in a streaming fashion.
@@ -120,7 +85,7 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
 
         while current is not None:
             # Parametrize the request for incremental listing
-            body = self.page_request(url, {"current": current}).json()
+            body = self.http_request(url, params={"current": current}).json()
 
             # Simplify the page if possible to only the necessary elements
             # and yield it

@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,8 +7,6 @@ import logging
 
 import pkg_resources
 
-from swh.lister import pattern
-
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +51,9 @@ def get_lister(lister_name, db_url=None, **conf):
 
     registry_entry = LISTERS[lister_name].load()()
     lister_cls = registry_entry["lister"]
+
+    from swh.lister import pattern
+
     if issubclass(lister_cls, pattern.Lister):
         return lister_cls.from_config(**conf)
     else:

@@ -13,15 +13,11 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import unquote, urljoin
 
 from bs4 import BeautifulSoup
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.model.hashutil import hash_to_hex
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -125,29 +121,6 @@ class ArchLister(StatelessLister[ArchListerPage]):
         )
 
         self.flavours = flavours
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "User-Agent": USER_AGENT,
-            }
-        )
-
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.debug("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
 
     def scrap_package_versions(
         self, name: str, repo: str, base_url: str
@@ -179,7 +152,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
         url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
             pkgname=name, base_url=base_url
         )
-        response = self.request_get(url=url, params={})
+        response = self.http_request(url)
         soup = BeautifulSoup(response.text, "html.parser")
         links = soup.find_all("a", href=True)
 
@@ -263,7 +236,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
         Returns:
             a directory Path where the archive has been extracted to.
         """
-        res = self.request_get(url=url, params={})
+        res = self.http_request(url)
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(res.content)

@@ -7,8 +7,6 @@ import datetime
 import logging
 from typing import Any, Dict, Iterator, List, Optional
 
-import requests
-
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -65,7 +63,7 @@ class AurLister(StatelessLister[AurListerPage]):
            a directory Path where the archive has been downloaded to.
         """
         url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
-        return requests.get(url).json()
+        return self.http_request(url).json()
 
     def get_pages(self) -> Iterator[AurListerPage]:
         """Yield an iterator which returns 'page'

@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,14 +11,10 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib import parse
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -77,10 +73,7 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
             ),
         }
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if len(self.credentials) > 0:
             cred = random.choice(self.credentials)
@@ -107,25 +100,6 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
         if username is not None and password is not None:
             self.session.auth = (username, password)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
-    def page_request(self, last_repo_cdate: str) -> requests.Response:
-
-        self.url_params["after"] = last_repo_cdate
-        logger.debug("Fetching URL %s with params %s", self.url, self.url_params)
-
-        response = self.session.get(self.url, params=self.url_params)
-
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
 
         last_repo_cdate: str = "1970-01-01"
@@ -137,7 +111,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
             last_repo_cdate = self.state.last_repo_cdate.isoformat()
 
         while True:
-            body = self.page_request(last_repo_cdate).json()
+            self.url_params["after"] = last_repo_cdate
+            body = self.http_request(self.url, params=self.url_params).json()
 
             yield body["values"]

@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -69,14 +69,16 @@ def test_bitbucket_incremental_lister(
     assert last_repo_cdate == bb_api_repositories_page2["values"][-1]["created_on"]
 
     # Second listing, restarting from last state
-    lister.session.get = mocker.spy(lister.session, "get")
+    lister.session.request = mocker.spy(lister.session, "request")
 
     lister.run()
 
     url_params = lister.url_params
     url_params["after"] = last_repo_cdate
 
-    lister.session.get.assert_called_once_with(lister.API_URL, params=url_params)
+    lister.session.request.assert_called_once_with(
+        "GET", lister.API_URL, params=url_params
+    )
 
     all_origins = (
         bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"]
@@ -106,7 +108,7 @@ def test_bitbucket_lister_rate_limit_hit(
 
     lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)
 
-    mocker.patch.object(lister.page_request.retry, "sleep")
+    mocker.patch.object(lister.http_request.retry, "sleep")
 
     stats = lister.run()

@@ -2,17 +2,13 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Dict, Iterator, List, Optional
 
-import requests
-from tenacity.before_sleep import before_sleep_log
-
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -41,30 +37,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
             instance=self.INSTANCE,
             url=self.API_URL,
         )
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
-
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
+        self.session.headers.update({"Accept": "application/json"})
 
     def get_pages(self) -> Iterator[BowerListerPage]:
         """Yield an iterator which returns 'page'
@@ -75,7 +48,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
 
         There is only one page that list all origins urls.
         """
-        response = self.page_request(url=self.url, params={})
+        response = self.http_request(self.url)
         yield response.json()
 
     def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]:

@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -9,13 +9,9 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup
-import requests
 from requests.exceptions import HTTPError
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, StatelessLister
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -73,17 +69,12 @@ class CGitLister(StatelessLister[Repositories]):
             credentials=credentials,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/html", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/html"})
         self.base_git_url = base_git_url
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
     def _get_and_parse(self, url: str) -> BeautifulSoup:
         """Get the given url and parse the retrieved HTML using BeautifulSoup"""
-        response = self.session.get(url)
-        response.raise_for_status()
+        response = self.http_request(url)
         return BeautifulSoup(response.text, features="html.parser")
 
     def get_pages(self) -> Iterator[Repositories]:

@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -258,7 +258,7 @@ def test_lister_cgit_get_pages_with_pages_and_retry(
 
     lister_cgit = CGitLister(swh_scheduler, url=url)
 
-    mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep")
+    mocker.patch.object(lister_cgit.http_request.retry, "sleep")
 
     repos: List[List[str]] = list(lister_cgit.get_pages())
     flattened_repos = sum(repos, [])

@@ -1,9 +1,8 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-
 import bz2
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -17,12 +16,11 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple
 from urllib.parse import urljoin
 
 from debian.deb822 import Sources
-import requests
+from requests.exceptions import HTTPError
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -95,9 +93,6 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
         self.suites = suites
         self.components = components
 
-        self.session = requests.Session()
-        self.session.headers.update({"User-Agent": USER_AGENT})
-
         # will hold all listed origins info
         self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {}
         # will contain origin urls that have already been listed
@@ -132,9 +127,11 @@
     def page_request(self, suite: Suite, component: Component) -> DebianPageType:
         """Return parsed package Sources file for a given debian suite and component."""
         for url, compression in self.debian_index_urls(suite, component):
-            response = requests.get(url, stream=True)
-            logging.debug("Fetched URL: %s, status code: %s", url, response.status_code)
-            if response.status_code == 200:
+            try:
+                response = self.http_request(url, stream=True)
+            except HTTPError:
+                pass
+            else:
                 last_modified = response.headers.get("Last-Modified")
                 self.last_sources_update = (
                     parsedate_to_datetime(last_modified) if last_modified else None

@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -10,12 +10,10 @@ from typing import Any, Dict, Iterator, Optional, Tuple
 from urllib.parse import parse_qs, urlencode, urlparse
 
 import iso8601
-import requests
 from requests.exceptions import HTTPError
 from requests.status_codes import codes
 from tenacity.before_sleep import before_sleep_log
 
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, Lister
 from swh.lister.utils import http_retry, is_retryable_exception
 from swh.scheduler.model import ListedOrigin
@@ -118,10 +116,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
         self.last_page: Optional[str] = None
         self.per_page = 100
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if len(self.credentials) > 0:
             cred = random.choice(self.credentials)

@@ -2,6 +2,7 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from dataclasses import asdict, dataclass
 import logging
+import random
@@ -9,14 +10,11 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple
 from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
+from requests.exceptions import HTTPError
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -96,13 +94,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
             # Raises an error on Gogs, or a warning on Gitea
             self.on_anonymous_mode()
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if self.api_token:
             self.session.headers["Authorization"] = f"token {self.api_token}"
@@ -116,34 +108,27 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
     def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
         return asdict(state)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
     def page_request(
         self, url: str, params: Dict[str, Any]
     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
 
         logger.debug("Fetching URL %s with params %s", url, params)
 
-        response = self.session.get(url, params=params)
-
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            if (
-                response.status_code == 500
-            ):  # Temporary hack for skipping fatal repos (T4423)
-                url_parts = urlparse(url)
-                query: Dict[str, Any] = dict(parse_qsl(url_parts.query))
-                query.update({"page": _parse_page_id(url) + 1})
-                next_page_link = url_parts._replace(query=urlencode(query)).geturl()
-                body: Dict[str, Any] = {"data": []}
-                links = {"next": {"url": next_page_link}}
-                return body, links
-            else:
-                response.raise_for_status()
+        try:
+            response = self.http_request(url, params=params)
+        except HTTPError as http_error:
+            if (
+                http_error.response.status_code == 500
+            ):  # Temporary hack for skipping fatal repos (T4423)
+                url_parts = urlparse(url)
+                query: Dict[str, Any] = dict(parse_qsl(url_parts.query))
+                query.update({"page": _parse_page_id(url) + 1})
+                next_page_link = url_parts._replace(query=urlencode(query)).geturl()
+                body: Dict[str, Any] = {"data": []}
+                links = {"next": {"url": next_page_link}}
+                return body, links
+            else:
+                raise
 
         return response.json(), response.links
 
@@ -159,6 +144,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
 
         # base with trailing slash, path without leading slash for urljoin
         next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
+        assert next_link is not None
 
         body, links = self.page_request(
             next_link, {**self.query_params, "page": page_id}

@@ -264,7 +264,7 @@ def test_gogs_incremental_lister(
     p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last
     requests_mock.get(P3, text=p3_text, headers=p3_headers)
 
-    lister.session.get = mocker.spy(lister.session, "get")
+    lister.session.request = mocker.spy(lister.session, "request")
 
     attempt2_stats = lister.run()
 
@@ -277,8 +277,8 @@ def test_gogs_incremental_lister(
     query_params = lister.query_params
     query_params["page"] = page_id
 
-    lister.session.get.assert_called_once_with(
-        TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
+    lister.session.request.assert_called_once_with(
+        "GET", TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
     )
 
     # All the 9 origins (3 pages) should be passed on to the scheduler:

@@ -10,14 +10,10 @@ import logging
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 import iso8601
-import requests
-from tenacity import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -59,10 +55,7 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
             credentials=credentials,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
         self.incremental = incremental
 
     def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType:
@@ -87,24 +80,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
         ):
             self.updated = True
 
-    @http_retry(
-        before_sleep=before_sleep_log(logger, logging.WARNING),
-    )
     def api_request(self, url: str) -> List[str]:
-        logger.debug("Fetching URL %s", url)
-
-        response = self.session.get(url)
-
-        if response.status_code not in (200, 304):
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s for URL %s",
-                response.status_code,
-                response.url,
-            )
-
-            response.raise_for_status()
-
+        response = self.http_request(url)
         return response.text.split()
 
     def get_single_page(

@@ -98,11 +98,12 @@ def _generate_responses(datadir, requests_mock):
 
 
 def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
-    # first listing, should return one origin per package
-    lister = GolangLister(scheduler=swh_scheduler)
-
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(GolangLister.http_request.retry, "sleep")
+
+    # first listing, should return one origin per package
+    lister = GolangLister(scheduler=swh_scheduler)
 
     _generate_responses(datadir, requests_mock)
 
@@ -131,7 +132,7 @@ def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
 
     # doing it all again (without incremental) should give us the same result
     lister = GolangLister(scheduler=swh_scheduler)
-    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
 
     _generate_responses(datadir, requests_mock)
     stats = lister.run()

@@ -13,10 +13,8 @@ from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 import lxml
 import requests
-from tenacity.before_sleep import before_sleep_log
 
 from swh.core.github.utils import GitHubSession
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -93,13 +91,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
             instance=instance,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         self.jar_origins: Dict[str, ListedOrigin] = {}
         self.github_session = GitHubSession(
@@ -112,23 +104,6 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
     def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
         return asdict(state)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[RepoPage]:
         """Retrieve and parse exported maven indexes to
         identify all pom files and src archives.
@@ -155,10 +130,11 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
         # Download the main text index file.
         logger.info("Downloading computed index from %s.", self.INDEX_URL)
         assert self.INDEX_URL is not None
-        response = requests.get(self.INDEX_URL, stream=True)
-        if response.status_code != 200:
+        try:
+            response = self.http_request(self.INDEX_URL, stream=True)
+        except requests.HTTPError:
             logger.error("Index %s not found, stopping", self.INDEX_URL)
-            response.raise_for_status()
+            raise
 
         # Prepare regexes to parse index exports.
 
@@ -250,9 +226,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
         # Now fetch pom files and scan them for scm info.
 
         logger.info("Fetching poms..")
-        for pom in out_pom:
+        for pom_url in out_pom:
             try:
-                response = self.page_request(pom, {})
+                response = self.http_request(pom_url)
                 parsed_pom = BeautifulSoup(response.content, "xml")
                 project = parsed_pom.find("project")
                 if project is None:
@@ -263,22 +239,24 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
                     if connection is not None:
                         artifact_metadata_d = {
                             "type": "scm",
-                            "doc": out_pom[pom],
+                            "doc": out_pom[pom_url],
                             "url": connection.text,
                         }
-                        logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
+                        logger.debug(
+                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
+                        )
                         yield artifact_metadata_d
                     else:
-                        logger.debug("No scm.connection in pom %s", pom)
+                        logger.debug("No scm.connection in pom %s", pom_url)
                 else:
-                    logger.debug("No scm in pom %s", pom)
+                    logger.debug("No scm in pom %s", pom_url)
             except requests.HTTPError:
                 logger.warning(
                     "POM info page could not be fetched, skipping project '%s'",
-                    pom,
+                    pom_url,
                 )
             except lxml.etree.Error as error:
-                logger.info("Could not parse POM %s XML: %s.", pom, error)
+                logger.info("Could not parse POM %s XML: %s.", pom_url, error)
 
     def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
         """Retrieve scm origin out of the page information. Only called when type of the

@@ -127,7 +127,7 @@ def network_requests_mock(
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(MavenLister.page_request.retry, "sleep")
+    mocker.patch.object(MavenLister.http_request.retry, "sleep")
 
 
 def test_maven_full_listing(swh_scheduler):

@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 the Software Heritage developers
+# Copyright (C) 2018-2022 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
@@ -7,12 +7,8 @@ import logging
 from typing import Any, Dict, Iterator, List, Optional
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, Lister
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -75,10 +71,7 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
             self.page_size += 1
         self.incremental = incremental
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
     def state_from_dict(self, d: Dict[str, Any]) -> NpmListerState:
         return NpmListerState(**d)
@@ -95,21 +88,6 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
             params["startkey"] = last_package_id
         return params
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, last_package_id: str) -> requests.Response:
-        params = self.request_params(last_package_id)
-        logger.debug("Fetching URL %s with params %s", self.url, params)
-        response = self.session.get(self.url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-        return response
-
     def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
         last_package_id: str = "0" if self.incremental else '""'
         if (
@@ -121,7 +99,9 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
 
         while True:
 
-            response = self.page_request(last_package_id)
+            response = self.http_request(
+                self.url, params=self.request_params(last_package_id)
+            )
 
             data = response.json()
             page = data["results"] if self.incremental else data["rows"]

@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -37,7 +37,7 @@ def npm_incremental_listing_page2(datadir):
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(NpmLister.page_request.retry, "sleep")
+    mocker.patch.object(NpmLister.http_request.retry, "sleep")
 
 
 def _check_listed_npm_packages(lister, packages, scheduler_origins):
@@ -78,19 +78,21 @@ def test_npm_lister_full(
         additional_matcher=_match_request,
     )
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
     stats = lister.run()
     assert stats.pages == 2
     assert stats.origins == page_size * stats.pages
 
-    spy_get.assert_has_calls(
+    spy_request.assert_has_calls(
         [
             mocker.call(
+                "GET",
                 lister.API_FULL_LISTING_URL,
                 params=_url_params(page_size + 1, startkey='""'),
             ),
             mocker.call(
+                "GET",
                 lister.API_FULL_LISTING_URL,
                 params=_url_params(
                     page_size + 1,
@@ -132,7 +134,7 @@ def test_npm_lister_incremental(
         additional_matcher=_match_request,
    )
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
     assert lister.get_state_from_scheduler() == NpmListerState()
 
@@ -142,13 +144,15 @@ def test_npm_lister_incremental(
 
    last_seq = npm_incremental_listing_page2["results"][-1]["seq"]
 
-    spy_get.assert_has_calls(
+    spy_request.assert_has_calls(
        [
            mocker.call(
+                "GET",
                lister.API_INCREMENTAL_LISTING_URL,
                params=_url_params(page_size, since="0"),
            ),
            mocker.call(
+                "GET",
                lister.API_INCREMENTAL_LISTING_URL,
                params=_url_params(
                    page_size,
@@ -156,6 +160,7 @@ def test_npm_lister_incremental(
                ),
            ),
            mocker.call(
+                "GET",
                lister.API_INCREMENTAL_LISTING_URL,
                params=_url_params(page_size, since=str(last_seq)),
            ),
@@ -189,11 +194,12 @@ def test_npm_lister_incremental_restart(
 
    requests_mock.get(lister.API_INCREMENTAL_LISTING_URL, json={"results": []})
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
    lister.run()
 
-    spy_get.assert_called_with(
+    spy_request.assert_called_with(
+        "GET",
        lister.API_INCREMENTAL_LISTING_URL,
        params=_url_params(page_size, since=str(last_seq)),
    )

@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -14,7 +14,6 @@ import requests
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -62,10 +61,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
             credentials=credentials,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
         self.listing_date = datetime.now().astimezone(tz=timezone.utc)
 
     def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
@@ -82,20 +78,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
         return d
 
     def api_request(self, url: str) -> Any:
-        logger.debug("Fetching URL %s", url)
-
-        response = self.session.get(url)
-
-        if response.status_code not in (200, 304):
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-
-            response.raise_for_status()
-
+        response = self.http_request(url)
         # response is empty when status code is 304
         return response.json() if response.status_code == 200 else {}
 
@@ -134,7 +117,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
                 # missing package metadata in response
                 continue
             versions_info = metadata["packages"][package_name].values()
-        except requests.exceptions.HTTPError:
+        except requests.HTTPError:
             # error when getting package metadata (usually 404 when a
             # package has been removed), skip it and process next package
             continue

@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,14 +6,23 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+import logging
 from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
 from urllib.parse import urlparse
 
+import requests
+from tenacity.before_sleep import before_sleep_log
+
 from swh.core.config import load_from_envvar
 from swh.core.utils import grouper
 from swh.scheduler import get_scheduler, model
 from swh.scheduler.interface import SchedulerInterface
 
+from . import USER_AGENT
+from .utils import http_retry
+
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ListerStats:
@@ -113,6 +122,27 @@ class Lister(Generic[StateType, PageType]):
         self.state = self.get_state_from_scheduler()
         self.updated = False
 
+        self.session = requests.Session()
+        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
+        self.session.headers.update({"User-Agent": USER_AGENT})
+
+    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
+
+        logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
+
+        response = self.session.request(method, url, **kwargs)
+        if response.status_code not in (200, 304):
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+            response.raise_for_status()
+
+        return response
+
     def run(self) -> ListerStats:
         """Run the lister.
 

@@ -1,15 +1,13 @@
-# Copyright (C) 2019-2021 the Software Heritage developers
+# Copyright (C) 2019-2022 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
 import logging
 import random
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin
 
-import requests
-
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, StatelessLister
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -47,10 +45,7 @@ class PhabricatorLister(StatelessLister[PageType]):
             scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if api_token is not None:
             self.api_token = api_token
@@ -91,22 +86,7 @@ class PhabricatorLister(StatelessLister[PageType]):
         after: Optional[str] = None
         while True:
             params = self.get_request_params(after)
-            logger.debug(
-                "Retrieving results on URI %s with parameters %s",
-                self.url,
-                self.filter_params(params),
-            )
-            response = self.session.post(self.url, data=params)
-
-            if response.status_code != 200:
-                logger.warning(
-                    "Unexpected HTTP status code %s on %s: %s",
-                    response.status_code,
-                    response.url,
-                    response.content,
-                )
-
-                response.raise_for_status()
+            response = self.http_request(self.url, method="POST", data=params)
 
             response_data = response.json()
 

@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -27,6 +27,11 @@ def phabricator_repositories_page2(datadir):
     )
 
 
+@pytest.fixture(autouse=True)
+def retry_sleep_mock(mocker):
+    mocker.patch.object(PhabricatorLister.http_request.retry, "sleep")
+
+
 def test_get_repo_url(phabricator_repositories_page1):
     repos = phabricator_repositories_page1["result"]["data"]
     for repo in repos:

@@ -2,15 +2,13 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Iterator, List, Optional
 
 import iso8601
-import requests
 from requests.exceptions import HTTPError
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -52,7 +50,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
             instance=self.INSTANCE,
             url=self.BASE_URL,
         )
-        self.session = requests.Session()
+
         self.session.headers.update(
             {
                 "Accept": "application/json",
@@ -60,23 +58,6 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
             }
         )
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.debug("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[PubDevListerPage]:
         """Yield an iterator which returns 'page'
 
@@ -88,8 +69,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
 
         There is only one page that list all origins url based on "{base_url}packages/{pkgname}"
         """
-        response = self.page_request(
-            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        response = self.http_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)
         )
         yield response.json()["packages"]
 
@@ -102,7 +83,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
             base_url=self.url, pkgname=pkgname
         )
         try:
-            response = self.page_request(url=package_info_url, params={})
+            response = self.http_request(url=package_info_url)
         except HTTPError:
             logger.warning(
                 "Failed to fetch metadata for package %s, skipping it from listing.",

@@ -15,14 +15,11 @@ from bs4 import BeautifulSoup
 import iso8601
 import lxml
 import requests
-from tenacity.before_sleep import before_sleep_log
 
 from swh.core.api.classes import stream_results
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -127,11 +124,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         # Will hold the currently saved "last modified" dates to compare against our
         # requests.
         self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
-        self.session = requests.Session()
-        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+
+        self.session.headers.update({"Accept": "application/json"})
         self.incremental = incremental
 
     def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
@@ -208,26 +202,6 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         self._project_last_modified = listed_origins
         return listed_origins
 
-    @http_retry(
-        before_sleep=before_sleep_log(logger, logging.WARNING),
-    )
-    def page_request(self, url, params) -> requests.Response:
-        # Log listed URL to ease debugging
-        logger.debug("Fetching URL %s with params %s", url, params)
-        response = self.session.get(url, params=params)
-
-        if response.status_code != 200:
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s for URL %s",
-                response.status_code,
-                response.url,
-            )
-            # The lister must fail on blocking errors
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[SourceForgeListerPage]:
         """
         SourceForge has a main XML sitemap that lists its sharded sitemaps for all
@@ -240,7 +214,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         Lastly we use the information of which VCS are used to build the predictable
         clone URL for any given VCS.
         """
-        sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text
+        sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text
         tree = ElementTree.fromstring(sitemap_contents)
 
         for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
@@ -259,7 +233,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
                 continue
 
             self.state.subsitemap_last_modified[sub_url] = last_modified
-            subsitemap_contents = self.page_request(sub_url, {}).text
+            subsitemap_contents = self.http_request(sub_url).text
             subtree = ElementTree.fromstring(subsitemap_contents)
 
             yield from self._get_pages_from_subsitemap(subtree)
@@ -351,9 +325,9 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
             logger.debug(msg, namespace, project)
 
             try:
-                res = self.page_request(endpoint, {}).json()
+                res = self.http_request(endpoint).json()
             except requests.HTTPError:
-                # We've already logged in `page_request`
+                # We've already logged in `http_request`
                 return []
 
             tools = res.get("tools")
@@ -373,7 +347,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         # and multiple origin URLs can be produced for a same project.
         cvs_info_url = f"http://{project}.cvs.sourceforge.net"
         try:
-            response = self.page_request(cvs_info_url, params={})
+            response = self.http_request(cvs_info_url)
         except requests.HTTPError:
             logger.warning(
                 "CVS info page could not be fetched, skipping project '%s'",
@@ -413,7 +387,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         # and a lot of them are 404 now.
         url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"
         try:
-            response = self.page_request(url, params={})
+            response = self.http_request(url)
             if "To get this branch, use:" not in response.text:
                 # If a bzr project has multiple branches, we need to extract their
                 # names from the repository landing page and create one listed origin

@@ -1,7 +1,8 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
+import functools
 import json
@@ -368,7 +369,7 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(
         MAIN_SITEMAP_URL,
@@ -438,7 +439,7 @@ def test_sourceforge_lister_http_error(
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)
 
@@ -458,7 +459,7 @@ def test_sourceforge_lister_project_error(
 ):
     lister = SourceForgeLister(scheduler=swh_scheduler)
     # Exponential retries take a long time, so stub time.sleep
-    mocker.patch.object(lister.page_request.retry, "sleep")
+    mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(
         MAIN_SITEMAP_URL,

@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,14 +8,10 @@ from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -57,30 +53,7 @@ class TuleapLister(StatelessLister[RepoPage]):
             instance=instance,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
-
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-        response.raise_for_status()
-
-        return response
+        self.session.headers.update({"Accept": "application/json"})
 
     @classmethod
     def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage:
@@ -97,14 +70,14 @@ class TuleapLister(StatelessLister[RepoPage]):
         return rep
 
     def _get_repositories(self, url_repo) -> List[Dict[str, Any]]:
-        ret = self.page_request(url_repo, {})
+        ret = self.http_request(url_repo)
         reps_list = ret.json()["repositories"]
         limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"])
         offset = int(ret.headers["X-PAGINATION-LIMIT"])
         size = int(ret.headers["X-PAGINATION-SIZE"])
         while offset < size:
             url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit)
-            ret = self.page_request(url_offset, {}).json()
+            ret = self.http_request(url_offset).json()
             reps_list = reps_list + ret["repositories"]
             offset += limit
         return reps_list
@@ -115,7 +88,7 @@ class TuleapLister(StatelessLister[RepoPage]):
         url_projects = url_api + "/projects/"
 
         # Get the list of projects.
-        response = self.page_request(url_projects, {})
+        response = self.http_request(url_projects)
         projects_list = response.json()
         limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
         offset = int(response.headers["X-PAGINATION-LIMIT"])
@@ -124,7 +97,7 @@ class TuleapLister(StatelessLister[RepoPage]):
             url_offset = (
                 url_projects + "?offset=" + str(offset) + "&limit=" + str(limit)
             )
-            ret = self.page_request(url_offset, {}).json()
+            ret = self.http_request(url_offset).json()
             projects_list = projects_list + ret
             offset += limit
 

@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -94,7 +94,7 @@ def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[st
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(TuleapLister.page_request.retry, "sleep")
+    mocker.patch.object(TuleapLister.http_request.retry, "sleep")
 
 
 def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):