maven: Continue listing if unable to retrieve pom information

This aligns the behavior with other listers (e.g. sourceforge), which continue
listing when some information cannot be retrieved.

Related to T3874
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-04-13 17:45:34 +02:00
parent e4b27a1e98
commit 7c8428d01c
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
2 changed files with 31 additions and 24 deletions

View file

@ -258,9 +258,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
logger.info("Fetching poms..")
for pom in out_pom:
text = self.page_request(pom, {})
try:
project = xmltodict.parse(text.content.decode())
response = self.page_request(pom, {})
project = xmltodict.parse(response.content.decode())
if "scm" in project["project"]:
if "connection" in project["project"]["scm"]:
scm = project["project"]["scm"]["connection"]
@ -278,6 +278,11 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
logger.debug("No scm.connection in pom %s", pom)
else:
logger.debug("No scm in pom %s", pom)
except requests.HTTPError:
logger.warning(
"POM info page could not be fetched, skipping project '%s'",
pom,
)
except xmltodict.expat.ExpatError as error:
logger.info("Could not parse POM %s XML: %s. Next.", pom, error)

View file

@ -291,35 +291,37 @@ def test_maven_incremental_listing(
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error(
def test_maven_list_http_error_on_index_read(
swh_scheduler, requests_mock, mocker, maven_index, http_code
):
"""Test handling of some common HTTP errors:
- 400: Bad request.
- 404: Resource not found.
- 500: Internal server error.
- 502: Bad gateway or proxy error.
"""
"""should stop listing if the lister fails to retrieve the main index url."""
lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
requests_mock.get(INDEX_URL, status_code=http_code)
with pytest.raises(requests.HTTPError): # listing cannot continue, so stop
lister.run()
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 0
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_artifacts(
swh_scheduler, requests_mock, mocker, maven_index, http_code, maven_pom_2
):
"""should continue listing when failing to retrieve artifacts."""
# Test failure of artefacts retrieval.
requests_mock.get(INDEX_URL, text=maven_index)
requests_mock.get(URL_POM_1, status_code=http_code)
requests_mock.get(URL_POM_2, text=maven_pom_2)
lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
# Test failure of index retrieval.
requests_mock.get(INDEX_URL, status_code=http_code)
with pytest.raises(requests.HTTPError):
lister.run()
# Test failure of artefacts retrieval.
requests_mock.get(INDEX_URL, text=maven_index)
requests_mock.get(URL_POM_1, status_code=http_code)
with pytest.raises(requests.HTTPError):
lister.run()
# on artifacts though, the HTTP error is caught (a warning is logged) and listing continues
lister.run()
# If the maven_index step succeeded but not the get_pom step,
# then we get only the 2 maven-jar origins (and not the 2 additional
# src origins).
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 2
assert len(scheduler_origins) == 3