sourceforge: don't abort on error for project

It's suboptimal, to say the least, to stop the entire lister process
if a single project page is somehow broken (404, most likely). This
change logs the issue as a warning and carries on. It also includes
some minor logging changes and comment touch-ups.
This commit is contained in:
Raphaël Gomès 2021-05-11 10:03:04 +02:00
parent 2ff549e125
commit 8f3bbacd5e
2 changed files with 64 additions and 5 deletions

View file

@ -195,10 +195,9 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
if response.status_code != 200:
# Log response content to ease debugging
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
"Unexpected HTTP status code %s for URL %s",
response.status_code,
response.url,
response.content,
)
# The lister must fail on blocking errors
response.raise_for_status()
@ -294,7 +293,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
else:
logger.debug("Project '%s' does not have any VCS", project)
else:
# Should always match, let's log it
# Should almost always match, let's log it
# The only ones that don't match are mostly specialized one-off URLs.
msg = "Project URL '%s' does not match expected pattern"
logger.warning(msg, project_url)
@ -324,11 +324,15 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
msg = "New project during an incremental run: %s/%s"
logger.debug(msg, namespace, project)
res = self.page_request(endpoint, {}).json()
try:
res = self.page_request(endpoint, {}).json()
except requests.HTTPError:
# We've already logged in `page_request`
return []
tools = res.get("tools")
if tools is None:
# This probably never happens
# This rarely happens, on very old URLs
logger.warning("Project '%s' does not have any tools", endpoint)
return []

View file

@ -338,3 +338,58 @@ def test_sourceforge_lister_http_error(swh_scheduler, requests_mock, status_code
with pytest.raises(HTTPError):
lister.run()
@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404])
def test_sourceforge_lister_project_error(
    datadir, swh_scheduler, requests_mock, status_code,
):
    """A failing project endpoint is skipped without aborting the listing.

    Every REST request for the ``mramm`` project answers with ``status_code``;
    the run must still complete and record the origins of the other projects.
    """
    lister = SourceForgeLister(scheduler=swh_scheduler)

    # Serve the main sitemap and both sub-sitemaps from the test data directory.
    sitemap_fixtures = (
        (MAIN_SITEMAP_URL, get_main_sitemap),
        ("https://sourceforge.net/allura_sitemap/sitemap-0.xml", get_subsitemap_0),
        ("https://sourceforge.net/allura_sitemap/sitemap-1.xml", get_subsitemap_1),
    )
    for sitemap_url, load_sitemap in sitemap_fixtures:
        requests_mock.get(
            sitemap_url,
            text=load_sitemap(datadir),
            additional_matcher=_check_request_headers,
        )

    # Request mocks precedence is LIFO: the generic REST matcher is registered
    # first so that the more specific failing matcher below takes priority.
    any_project_endpoint = re.compile("https://sourceforge.net/rest/.*")
    requests_mock.get(
        any_project_endpoint,
        json=functools.partial(get_project_json, datadir),
        additional_matcher=_check_request_headers,
    )

    # Make all `mramm` requests fail.
    # `mramm` is in subsitemap 0, which ensures we keep listing after an error.
    mramm_endpoint = re.compile("https://sourceforge.net/rest/p/mramm")
    requests_mock.get(mramm_endpoint, status_code=status_code)

    stats = lister.run()

    # Expected origins:
    # - os3dmodels (2 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo).
    # adobe and backapps itself have no repos.
    # mramm must *not* be listed.
    assert stats.pages == 3
    assert stats.origins == 6

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    listed = {
        origin.url: (origin.visit_type, str(origin.last_update.date()))
        for origin in listed_origins
    }

    # Ensure no `mramm` origins are listed, but all others are.
    expected = {
        "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
        "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
        "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
        "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
        "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
        "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
    }
    assert listed == expected