maven: Create one origin per package instead of one per package version

Previously, the maven lister created one origin for each source
archive (jar, zip) it discovered during the listing process.

This is not the way Software Heritage decided to archive sources
coming from package managers. Instead, one origin should be created
per package, and all of its versions should be found as releases in
the snapshot produced by the package loader.

So modify the maven lister to create one origin per package,
grouping all of its versions.
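
Roughly, the grouping amounts to the following (a simplified sketch of the
new lister.py code shown in the diff below; the page dict keys and attribute
names are taken from that code):

    # Sketch: derive one origin URL per (groupId, artifactId) and accumulate
    # every discovered version as an extra artifact on that origin.
    path = "/".join(page["gid"].split("."))            # "al.aldi" -> "al/aldi"
    origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}")
    artifact = {k: v for k, v in page.items() if k != "doc"}
    if origin_url not in self.jar_origins:
        # First version seen for this package: create the ListedOrigin.
        self.jar_origins[origin_url] = ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type=page["type"],
            extra_loader_arguments={"artifacts": [artifact]},
        )
    else:
        # Package already seen: only record the additional version.
        artifacts = self.jar_origins[origin_url].extra_loader_arguments["artifacts"]
        if artifact not in artifacts:
            artifacts.append(artifact)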

This change also modifies how incremental listing is handled:
ListedOrigin instances are now yielded only if new versions of a
package have been discovered since the last listing.
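
The yield condition then becomes, roughly (again a sketch of the code in the
diff below):

    # Only (re-)yield the grouped origin when the current index document is
    # newer than the last one recorded by the previous incremental run.
    if not self.incremental or (self.state and page["doc"] > self.state.last_seen_doc):
        # The scheduler deduplicates ListedOrigin instances sharing the same
        # URL and keeps the latest one, so yielding updated copies is safe.
        yield jar_origin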

Tests have been updated to reflect these changes.

Related to T3874
Antoine Lambert 2022-04-27 17:54:53 +02:00
parent 985b71e80c
commit 22bcd9deb2
5 changed files with 162 additions and 252 deletions

View file

@@ -97,6 +97,8 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
}
)
self.jar_origins: Dict[str, ListedOrigin] = {}
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
return MavenListerState(**d)
@@ -185,22 +187,10 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
m_doc = re_doc.match(line)
if m_doc is not None:
doc_id = int(m_doc.group("doc"))
if (
self.incremental
and self.state
and self.state.last_seen_doc
and self.state.last_seen_doc >= doc_id
):
# jar_src["doc"] contains the id of the current document, whatever
# its type (scm or jar).
jar_src["doc"] = None
else:
jar_src["doc"] = doc_id
# jar_src["doc"] contains the id of the current document, whatever
# its type (scm or jar).
jar_src["doc"] = doc_id
else:
# If incremental mode, we don't record any line that is
# before our last recorded doc id.
if self.incremental and jar_src["doc"] is None:
continue
m_val = re_val.match(line)
if m_val is not None:
(gid, aid, version, classifier, ext) = m_val.groups()
@@ -315,35 +305,61 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
)
yield origin
else:
# Origin is a source archive:
# Origin gathers all the source archives of a package:
last_update_dt = None
last_update_iso = ""
last_update_seconds = str(page["time"])[:-3]
try:
last_update_dt = datetime.fromtimestamp(int(last_update_seconds))
last_update_dt_tz = last_update_dt.astimezone(timezone.utc)
last_update_dt = last_update_dt.astimezone(timezone.utc)
except OverflowError:
logger.warning("- Failed to convert datetime %s.", last_update_seconds)
if last_update_dt:
last_update_iso = last_update_dt_tz.isoformat()
origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=page["url"],
visit_type=page["type"],
last_update=last_update_dt_tz,
extra_loader_arguments={
"artifacts": [
{
"time": last_update_iso,
"gid": page["gid"],
"aid": page["aid"],
"version": page["version"],
"base_url": self.BASE_URL,
}
]
},
)
yield origin
last_update_iso = last_update_dt.isoformat()
# The origin URL targets the page holding the sources for all versions of
# an artifactId (package name) inside a groupId (namespace)
path = "/".join(page["gid"].split("."))
origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}")
artifact = {
**{k: v for k, v in page.items() if k != "doc"},
"time": last_update_iso,
"base_url": self.BASE_URL,
}
if origin_url not in self.jar_origins:
# Create ListedOrigin instance if we did not see that origin yet
jar_origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type=page["type"],
last_update=last_update_dt,
extra_loader_arguments={"artifacts": [artifact]},
)
self.jar_origins[origin_url] = jar_origin
else:
# Update list of source artifacts for that origin otherwise
jar_origin = self.jar_origins[origin_url]
artifacts = jar_origin.extra_loader_arguments["artifacts"]
if artifact not in artifacts:
artifacts.append(artifact)
if (
jar_origin.last_update
and last_update_dt
and last_update_dt > jar_origin.last_update
):
jar_origin.last_update = last_update_dt
if not self.incremental or (
self.state and page["doc"] > self.state.last_seen_doc
):
# Yield the origin with its updated list of source artifacts. Multiple
# ListedOrigin instances for the same origin URL but with different
# artifacts lists may be sent to the scheduler; it deduplicates them
# and upserts the latest one in the database.
yield jar_origin
def commit_page(self, page: RepoPage) -> None:
"""Update currently stored state using the latest listed doc.

View file

@@ -1,113 +0,0 @@
doc 0
field 0
name u
type string
value al.aldi|sprova4j|0.1.0|sources|jar
field 1
name m
type string
value 1626111735737
field 2
name i
type string
value jar|1626109619335|14316|2|2|0|jar
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management
doc 1
field 0
name u
type string
value al.aldi|sprova4j|0.1.0|NA|pom
field 1
name m
type string
value 1626111735764
field 2
name i
type string
value jar|1626109636636|-1|1|0|0|pom
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management
doc 2
field 0
name u
type string
value al.aldi|sprova4j|0.1.1|sources|jar
field 1
name m
type string
value 1626111784883
field 2
name i
type string
value jar|1626111425534|14510|2|2|0|jar
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management
doc 3
field 0
name u
type string
value al.aldi|sprova4j|0.1.1|NA|pom
field 1
name m
type string
value 1626111784915
field 2
name i
type string
value jar|1626111437014|-1|1|0|0|pom
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management
doc 4
field 14
name DESCRIPTOR
type string
value NexusIndex
field 15
name IDXINFO
type string
value 1.0|index
doc 5
field 16
name allGroups
type string
value allGroups
field 17
name allGroupsList
type string
value al.aldi
doc 6
field 18
name rootGroups
type string
value rootGroups
field 19
name rootGroupsList
type string
value al
END
checksum 00000000003321211082

View file

@@ -0,0 +1,42 @@
doc 0
field 0
name u
type string
value al.aldi|sprova4j|0.1.0|sources|jar
field 1
name m
type string
value 1633786348254
field 2
name i
type string
value jar|1626109619335|14316|2|2|0|jar
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management
doc 1
field 0
name u
type string
value al.aldi|sprova4j|0.1.0|NA|pom
field 1
name m
type string
value 1633786348271
field 2
name i
type string
value jar|1626109636636|-1|1|0|0|pom
field 10
name n
type string
value sprova4j
field 11
name d
type string
value Java client for Sprova Test Management

View file

@@ -1,9 +1,8 @@
# Copyright (C) 2021 The Software Heritage developers
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import timezone
from pathlib import Path
import iso8601
@@ -26,10 +25,7 @@ LIST_GIT = (
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
LIST_SRC = (
MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar",
MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar",
)
LIST_SRC = (MVN_URL + "al/aldi/sprova4j",)
LIST_SRC_DATA = (
{
@@ -40,6 +36,7 @@ LIST_SRC_DATA = (
"gid": "al.aldi",
"aid": "sprova4j",
"version": "0.1.0",
"base_url": MVN_URL,
},
{
"type": "maven",
@@ -49,18 +46,19 @@ LIST_SRC_DATA = (
"gid": "al.aldi",
"aid": "sprova4j",
"version": "0.1.1",
"base_url": MVN_URL,
},
)
@pytest.fixture
def maven_index(datadir) -> str:
return Path(datadir, "http_indexes", "export.fld").read_text()
def maven_index_full(datadir) -> str:
return Path(datadir, "http_indexes", "export_full.fld").read_text()
@pytest.fixture
def maven_index_incr(datadir) -> str:
return Path(datadir, "http_indexes", "export_incr.fld").read_text()
def maven_index_incr_first(datadir) -> str:
return Path(datadir, "http_indexes", "export_incr_first.fld").read_text()
@pytest.fixture
@@ -83,17 +81,21 @@ def maven_pom_3(datadir) -> str:
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text()
def test_maven_full_listing(
swh_scheduler,
requests_mock,
mocker,
maven_index,
maven_pom_1,
maven_pom_2,
@pytest.fixture(autouse=True)
def network_requests_mock(
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
):
requests_mock.get(INDEX_URL, text=maven_index_full)
requests_mock.get(URL_POM_1, text=maven_pom_1)
requests_mock.get(URL_POM_2, text=maven_pom_2)
requests_mock.get(URL_POM_3, text=maven_pom_3)
def test_maven_full_listing(swh_scheduler):
"""Covers full listing of multiple pages, checking page results and listed
origins, statelessness."""
# Run the lister.
lister = MavenLister(
scheduler=swh_scheduler,
url=MVN_URL,
@@ -102,43 +104,25 @@ def test_maven_full_listing(
incremental=False,
)
# Set up test.
index_text = maven_index
requests_mock.get(INDEX_URL, text=index_text)
requests_mock.get(URL_POM_1, text=maven_pom_1)
requests_mock.get(URL_POM_2, text=maven_pom_2)
# Then run the lister.
stats = lister.run()
# Start test checks.
assert stats.pages == 4
assert stats.origins == 4
assert stats.pages == 5
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
origin_urls = [origin.url for origin in scheduler_origins]
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC)
# 3 git origins + 1 maven origin with 2 releases (one per jar)
assert len(origin_urls) == 4
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
for origin in scheduler_origins:
if origin.visit_type == "maven":
for src in LIST_SRC_DATA:
if src.get("url") == origin.url:
last_update_src = iso8601.parse_date(src.get("time")).astimezone(
tz=timezone.utc
)
assert last_update_src == origin.last_update
artifact = origin.extra_loader_arguments["artifacts"][0]
assert src.get("time") == artifact["time"]
assert src.get("gid") == artifact["gid"]
assert src.get("aid") == artifact["aid"]
assert src.get("version") == artifact["version"]
assert MVN_URL == artifact["base_url"]
break
else:
raise AssertionError(
"Could not find scheduler origin in referenced origins."
)
last_update_src = iso8601.parse_date(src["time"])
assert last_update_src <= origin.last_update
assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)
scheduler_state = lister.get_state_from_scheduler()
assert scheduler_state is not None
assert scheduler_state.last_seen_doc == -1
@@ -148,10 +132,7 @@ def test_maven_full_listing(
def test_maven_full_listing_malformed(
swh_scheduler,
requests_mock,
mocker,
maven_index,
maven_pom_1_malformed,
maven_pom_2,
):
"""Covers full listing of multiple pages, checking page results with a malformed
scm entry in pom."""
@@ -165,39 +146,28 @@ def test_maven_full_listing_malformed(
)
# Set up test.
index_text = maven_index
requests_mock.get(INDEX_URL, text=index_text)
requests_mock.get(URL_POM_1, text=maven_pom_1_malformed)
requests_mock.get(URL_POM_2, text=maven_pom_2)
# Then run the lister.
stats = lister.run()
# Start test checks.
assert stats.pages == 4
assert stats.origins == 3
assert stats.pages == 5
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
origin_urls = [origin.url for origin in scheduler_origins]
LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",)
assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC)
# 2 git origins + 1 maven origin with 2 releases (one per jar)
assert len(origin_urls) == 3
assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC)
for origin in scheduler_origins:
if origin.visit_type == "maven":
for src in LIST_SRC_DATA:
if src.get("url") == origin.url:
artifact = origin.extra_loader_arguments["artifacts"][0]
assert src.get("time") == artifact["time"]
assert src.get("gid") == artifact["gid"]
assert src.get("aid") == artifact["aid"]
assert src.get("version") == artifact["version"]
assert MVN_URL == artifact["base_url"]
break
else:
raise AssertionError(
"Could not find scheduler origin in referenced origins."
)
last_update_src = iso8601.parse_date(src["time"])
assert last_update_src <= origin.last_update
assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)
scheduler_state = lister.get_state_from_scheduler()
assert scheduler_state is not None
assert scheduler_state.last_seen_doc == -1
@@ -207,12 +177,8 @@ def test_maven_full_listing_malformed(
def test_maven_incremental_listing(
swh_scheduler,
requests_mock,
mocker,
maven_index,
maven_index_incr,
maven_pom_1,
maven_pom_2,
maven_pom_3,
maven_index_full,
maven_index_incr_first,
):
"""Covers full listing of multiple pages, checking page results and listed
origins, with a second updated run for statefulness."""
@@ -226,9 +192,7 @@ def test_maven_incremental_listing(
)
# Set up test.
requests_mock.get(INDEX_URL, text=maven_index)
requests_mock.get(URL_POM_1, text=maven_pom_1)
requests_mock.get(URL_POM_2, text=maven_pom_2)
requests_mock.get(INDEX_URL, text=maven_index_incr_first)
# Then run the lister.
stats = lister.run()
@@ -236,8 +200,20 @@ def test_maven_incremental_listing(
# Start test checks.
assert lister.incremental
assert lister.updated
assert stats.pages == 4
assert stats.origins == 4
assert stats.pages == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
origin_urls = [origin.url for origin in scheduler_origins]
# 1 git origin + 1 maven origin with 1 release (one per jar)
assert len(origin_urls) == 2
assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC)
for origin in scheduler_origins:
if origin.visit_type == "maven":
last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"])
assert last_update_src == origin.last_update
assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]]
# Second execution of the lister, incremental mode
lister = MavenLister(
@@ -250,12 +226,11 @@ def test_maven_incremental_listing(
scheduler_state = lister.get_state_from_scheduler()
assert scheduler_state is not None
assert scheduler_state.last_seen_doc == 3
assert scheduler_state.last_seen_pom == 3
assert scheduler_state.last_seen_doc == 1
assert scheduler_state.last_seen_pom == 1
# Set up test.
requests_mock.get(INDEX_URL, text=maven_index_incr)
requests_mock.get(URL_POM_3, text=maven_pom_3)
requests_mock.get(INDEX_URL, text=maven_index_full)
# Then run the lister.
stats = lister.run()
@@ -263,26 +238,19 @@ def test_maven_incremental_listing(
# Start test checks.
assert lister.incremental
assert lister.updated
assert stats.pages == 1
assert stats.origins == 1
assert stats.pages == 4
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
origin_urls = [origin.url for origin in scheduler_origins]
assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR)
for origin in scheduler_origins:
if origin.visit_type == "maven":
for src in LIST_SRC_DATA:
if src.get("url") == origin.url:
artifact = origin.extra_loader_arguments["artifacts"][0]
assert src.get("time") == artifact["time"]
assert src.get("gid") == artifact["gid"]
assert src.get("aid") == artifact["aid"]
assert src.get("version") == artifact["version"]
break
else:
raise AssertionError
last_update_src = iso8601.parse_date(src["time"])
assert last_update_src <= origin.last_update
assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)
scheduler_state = lister.get_state_from_scheduler()
assert scheduler_state is not None
@@ -291,9 +259,7 @@ def test_maven_incremental_listing(
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_on_index_read(
swh_scheduler, requests_mock, mocker, maven_index, http_code
):
def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code):
"""should stop listing if the lister fails to retrieve the main index url."""
lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
@@ -307,21 +273,20 @@ def test_maven_list_http_error_on_index_read(
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_artifacts(
swh_scheduler, requests_mock, mocker, maven_index, http_code, maven_pom_2
swh_scheduler,
requests_mock,
http_code,
):
"""should continue listing when failing to retrieve artifacts."""
# Test failure of artefacts retrieval.
requests_mock.get(INDEX_URL, text=maven_index)
requests_mock.get(URL_POM_1, status_code=http_code)
requests_mock.get(URL_POM_2, text=maven_pom_2)
lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
# failures on artifact retrieval, however, do not stop the listing
lister.run()
# If the maven_index step succeeded but not the get_pom step,
# then we get only the 2 maven-jar origins (and not the 2 additional
# src origins).
# If the maven_index_full step succeeded but not the get_pom step,
# then we get only one maven-jar origin and one git origin.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 3