cgit: Don't stop the listing when a repository page is not available

Related to T2988
This commit is contained in:
Vincent SELLIER 2021-01-27 12:22:47 +01:00
parent 91fcde8341
commit f6f9f1ca28
No known key found for this signature in database
GPG key ID: 3F13C434EADAD17D
2 changed files with 30 additions and 2 deletions

View file

@ -10,6 +10,7 @@ from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
from swh.lister.pattern import StatelessLister
@ -76,6 +77,7 @@ class CGitLister(StatelessLister[Repositories]):
next_page: Optional[str] = self.url
while next_page:
bs_idx = self._get_and_parse(next_page)
page_results = []
for tr in bs_idx.find("div", {"class": "content"}).find_all(
@ -113,7 +115,7 @@ class CGitLister(StatelessLister[Repositories]):
for repository in repositories:
origin_url = self._get_origin_from_repository_url(repository["url"])
if not origin_url:
if origin_url is None:
continue
yield ListedOrigin(
@ -125,7 +127,15 @@ class CGitLister(StatelessLister[Repositories]):
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
"""Extract the git url from the repository page"""
bs = self._get_and_parse(repository_url)
try:
bs = self._get_and_parse(repository_url)
except HTTPError as e:
logger.warning(
"Unexpected HTTP status code %s on %s",
e.response.status_code,
e.response.url,
)
return None
# origin urls are listed on the repository page
# TODO check if forcing https is better or not ?

View file

@ -7,6 +7,7 @@ from typing import List
import pytest
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.lister import __version__
from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date
from swh.lister.pattern import ListerStats
@ -142,3 +143,20 @@ def test_lister_cgit_date_parsing(date_str, expected_date):
repository = {"url": "url", "last_updated_date": date_str}
assert _parse_last_updated_date(repository) == expected_date
# Mock-datadir fixture variant built with one repository page of the tizen
# cgit instance listed in ``ignore_urls``.
# NOTE(review): presumably a request to an ignored URL fails with an HTTP
# error instead of being served from the datadir -- confirm against swh.core.
requests_mock_datadir_missing_url = requests_mock_datadir_factory(
    ignore_urls=[
        "https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12/",
    ],
)
def test_lister_cgit_get_origin_from_repo_failing(
    requests_mock_datadir_missing_url, swh_scheduler
):
    """Check that listing completes when one repository page is ignored.

    Runs the lister against ``https://git.tizen/cgit/`` with the
    ``requests_mock_datadir_missing_url`` fixture active and expects the run
    to report 3 pages and 15 origins.
    """
    lister = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    # A single comparison on the returned stats covers both counters.
    assert lister.run() == ListerStats(pages=3, origins=15)