maven: Prevent UnicodeDecodeError when processing pom file

Pass the raw bytes of pom file content in xmltodict.parse and let
it do the string decoding based on the encoding declared in pom file.

If the string decoding failed due to an invalid declared encoding,
xml.parsers.expat.ExpatError will be raised and will be caught by
the lister, ignoring the pom file and continuing listing.

Related to T3874
This commit is contained in:
Antoine Lambert 2022-05-02 12:03:56 +02:00
parent 0222a8f5c4
commit 3f6c7edc24
2 changed files with 38 additions and 23 deletions

View file

@ -246,7 +246,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
for pom in out_pom:
try:
response = self.page_request(pom, {})
project = xmltodict.parse(response.content.decode())
project = xmltodict.parse(response.content)
project_d = project.get("project", {})
scm_d = project_d.get("scm")
if scm_d is not None:

View file

@ -52,48 +52,48 @@ LIST_SRC_DATA = (
@pytest.fixture
def maven_index_full(datadir) -> str:
return Path(datadir, "http_indexes", "export_full.fld").read_text()
def maven_index_full(datadir) -> bytes:
return Path(datadir, "http_indexes", "export_full.fld").read_bytes()
@pytest.fixture
def maven_index_incr_first(datadir) -> str:
return Path(datadir, "http_indexes", "export_incr_first.fld").read_text()
def maven_index_incr_first(datadir) -> bytes:
return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes()
@pytest.fixture
def maven_pom_1(datadir) -> str:
return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text()
def maven_pom_1(datadir) -> bytes:
return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes()
@pytest.fixture
def maven_index_null_mtime(datadir) -> str:
return Path(datadir, "http_indexes", "export_null_mtime.fld").read_text()
def maven_index_null_mtime(datadir) -> bytes:
return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes()
@pytest.fixture
def maven_pom_1_malformed(datadir) -> str:
return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_text()
def maven_pom_1_malformed(datadir) -> bytes:
return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes()
@pytest.fixture
def maven_pom_2(datadir) -> str:
return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text()
def maven_pom_2(datadir) -> bytes:
return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes()
@pytest.fixture
def maven_pom_3(datadir) -> str:
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text()
def maven_pom_3(datadir) -> bytes:
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
@pytest.fixture(autouse=True)
def network_requests_mock(
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
):
requests_mock.get(INDEX_URL, text=maven_index_full)
requests_mock.get(URL_POM_1, text=maven_pom_1)
requests_mock.get(URL_POM_2, text=maven_pom_2)
requests_mock.get(URL_POM_3, text=maven_pom_3)
requests_mock.get(INDEX_URL, content=maven_index_full)
requests_mock.get(URL_POM_1, content=maven_pom_1)
requests_mock.get(URL_POM_2, content=maven_pom_2)
requests_mock.get(URL_POM_3, content=maven_pom_3)
def test_maven_full_listing(swh_scheduler):
@ -151,7 +151,7 @@ def test_maven_full_listing_malformed(
)
# Set up test.
requests_mock.get(URL_POM_1, text=maven_pom_1_malformed)
requests_mock.get(URL_POM_1, content=maven_pom_1_malformed)
# Then run the lister.
stats = lister.run()
@ -197,7 +197,7 @@ def test_maven_incremental_listing(
)
# Set up test.
requests_mock.get(INDEX_URL, text=maven_index_incr_first)
requests_mock.get(INDEX_URL, content=maven_index_incr_first)
# Then run the lister.
stats = lister.run()
@ -235,7 +235,7 @@ def test_maven_incremental_listing(
assert scheduler_state.last_seen_pom == 1
# Set up test.
requests_mock.get(INDEX_URL, text=maven_index_full)
requests_mock.get(INDEX_URL, content=maven_index_full)
# Then run the lister.
stats = lister.run()
@ -299,7 +299,7 @@ def test_maven_list_http_error_artifacts(
def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
requests_mock.get(INDEX_URL, text=maven_index_null_mtime)
requests_mock.get(INDEX_URL, content=maven_index_null_mtime)
# Run the lister.
lister = MavenLister(
@ -317,3 +317,18 @@ def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 1
assert scheduler_origins[0].last_update is None
def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1):
"""should continue listing when failing to decode pom file."""
# Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one
requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32"))
lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
lister.run()
# If the maven_index_full step succeeded but not the pom parsing step,
# then we get only one maven-jar origin and one git origin.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 3