From 22bcd9deb221a11a159422cffec44982f299e9ab Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Wed, 27 Apr 2022 17:54:53 +0200 Subject: [PATCH] maven: Create one origin per package instead of one per package version Previously the maven lister was creating an origin for each source archive (jar, zip) it discovered during the listing process. This is not the way Software Heritage decided to archive sources coming from package managers. Instead one origin should be created per package and all its versions should be found as releases in the snapshot produced by the package loader. So modify the maven lister in order to create one origin per package grouping all its versions. This change also modifies the way incremental listing is handled, ListedOrigin instances will be yielded only if we discovered new versions of a package since the last listing. Tests have been updated to reflect these changes. Related to T3874 --- swh/lister/maven/lister.py | 88 +++++---- .../maven/tests/data/http_indexes/export.fld | 113 ------------ .../{export_incr.fld => export_full.fld} | 0 .../data/http_indexes/export_incr_first.fld | 42 +++++ swh/lister/maven/tests/test_lister.py | 171 +++++++----------- 5 files changed, 162 insertions(+), 252 deletions(-) delete mode 100755 swh/lister/maven/tests/data/http_indexes/export.fld rename swh/lister/maven/tests/data/http_indexes/{export_incr.fld => export_full.fld} (100%) mode change 100755 => 100644 create mode 100644 swh/lister/maven/tests/data/http_indexes/export_incr_first.fld diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index dce3fd2..6485f03 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -97,6 +97,8 @@ class MavenLister(Lister[MavenListerState, RepoPage]): } ) + self.jar_origins: Dict[str, ListedOrigin] = {} + def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) @@ -185,22 +187,10 @@ class MavenLister(Lister[MavenListerState, RepoPage]): m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) - if ( - self.incremental - and self.state - and self.state.last_seen_doc - and self.state.last_seen_doc >= doc_id - ): - # jar_src["doc"] contains the id of the current document, whatever - # its type (scm or jar). - jar_src["doc"] = None - else: - jar_src["doc"] = doc_id + # jar_src["doc"] contains the id of the current document, whatever + # its type (scm or jar). + jar_src["doc"] = doc_id else: - # If incremental mode, we don't record any line that is - # before our last recorded doc id. - if self.incremental and jar_src["doc"] is None: - continue m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() @@ -315,35 +305,61 @@ class MavenLister(Lister[MavenListerState, RepoPage]): ) yield origin else: - # Origin is a source archive: + # Origin is gathering source archives: last_update_dt = None last_update_iso = "" last_update_seconds = str(page["time"])[:-3] try: last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) - last_update_dt_tz = last_update_dt.astimezone(timezone.utc) + last_update_dt = last_update_dt.astimezone(timezone.utc) except OverflowError: logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: - last_update_iso = last_update_dt_tz.isoformat() - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type=page["type"], - last_update=last_update_dt_tz, - extra_loader_arguments={ - "artifacts": [ - { - "time": last_update_iso, - "gid": page["gid"], - "aid": page["aid"], - "version": page["version"], - "base_url": self.BASE_URL, - } - ] - }, - ) - yield origin + last_update_iso = last_update_dt.isoformat() + + # Origin URL will target page holding sources for all versions of + # an artifactId (package name) inside a groupId (namespace) + path = "/".join(page["gid"].split(".")) + origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") + + artifact = { + **{k: v for k, v in page.items() if k != "doc"}, + "time": last_update_iso, + "base_url": self.BASE_URL, + } + + if origin_url not in self.jar_origins: + # Create ListedOrigin instance if we did not see that origin yet + jar_origin = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type=page["type"], + last_update=last_update_dt, + extra_loader_arguments={"artifacts": [artifact]}, + ) + self.jar_origins[origin_url] = jar_origin + else: + # Update list of source artifacts for that origin otherwise + jar_origin = self.jar_origins[origin_url] + artifacts = jar_origin.extra_loader_arguments["artifacts"] + if artifact not in artifacts: + artifacts.append(artifact) + + if ( + jar_origin.last_update + and last_update_dt + and last_update_dt > jar_origin.last_update + ): + jar_origin.last_update = last_update_dt + + if not self.incremental or ( + self.state and page["doc"] > self.state.last_seen_doc + ): + # Yield origin with updated source artifacts, multiple instances of + # ListedOrigin for the same origin URL but with different artifacts + # list will be sent to the scheduler but it will deduplicate them and + # take the latest one to upsert in database + yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. diff --git a/swh/lister/maven/tests/data/http_indexes/export.fld b/swh/lister/maven/tests/data/http_indexes/export.fld deleted file mode 100755 index c8e64b0..0000000 --- a/swh/lister/maven/tests/data/http_indexes/export.fld +++ /dev/null @@ -1,113 +0,0 @@ -doc 0 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.0|sources|jar - field 1 - name m - type string - value 1626111735737 - field 2 - name i - type string - value jar|1626109619335|14316|2|2|0|jar - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 1 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.0|NA|pom - field 1 - name m - type string - value 1626111735764 - field 2 - name i - type string - value jar|1626109636636|-1|1|0|0|pom - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 2 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.1|sources|jar - field 1 - name m - type string - value 1626111784883 - field 2 - name i - type string - value jar|1626111425534|14510|2|2|0|jar - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 3 - field 0 - name u - type string - value al.aldi|sprova4j|0.1.1|NA|pom - field 1 - name m - type string - value 1626111784915 - field 2 - name i - type string - value jar|1626111437014|-1|1|0|0|pom - field 10 - name n - type string - value sprova4j - field 11 - name d - type string - value Java client for Sprova Test Management -doc 4 - field 14 - name DESCRIPTOR - type string - value NexusIndex - field 15 - name IDXINFO - type string - value 1.0|index -doc 5 - field 16 - name allGroups - type string - value allGroups - field 17 - name allGroupsList - type string - value al.aldi -doc 6 - field 18 - name rootGroups - type string - value rootGroups - field 19 - name rootGroupsList - type string - value al -END -checksum 00000000003321211082 diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr.fld b/swh/lister/maven/tests/data/http_indexes/export_full.fld old mode 100755 new mode 100644 similarity index 100% rename from swh/lister/maven/tests/data/http_indexes/export_incr.fld rename to swh/lister/maven/tests/data/http_indexes/export_full.fld diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld b/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld new file mode 100644 index 0000000..c943c2f --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr_first.fld @@ -0,0 +1,42 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1633786348254 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1633786348271 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 267da95..32f1a39 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,9 +1,8 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import timezone from pathlib import Path import iso8601 @@ -26,10 +25,7 @@ LIST_GIT = ( LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) -LIST_SRC = ( - MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", - MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", -) +LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { @@ -40,6 +36,7 @@ LIST_SRC_DATA = ( "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", + "base_url": MVN_URL, }, { "type": "maven", @@ -49,18 +46,19 @@ LIST_SRC_DATA = ( "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", + "base_url": MVN_URL, }, ) @pytest.fixture -def maven_index(datadir) -> str: - return Path(datadir, "http_indexes", "export.fld").read_text() +def maven_index_full(datadir) -> str: + return Path(datadir, "http_indexes", "export_full.fld").read_text() @pytest.fixture -def maven_index_incr(datadir) -> str: - return Path(datadir, "http_indexes", "export_incr.fld").read_text() +def maven_index_incr_first(datadir) -> str: + return Path(datadir, "http_indexes", "export_incr_first.fld").read_text() @pytest.fixture @@ -83,17 +81,21 @@ def maven_pom_3(datadir) -> str: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() -def test_maven_full_listing( - swh_scheduler, - requests_mock, - mocker, - maven_index, - maven_pom_1, - maven_pom_2, +@pytest.fixture(autouse=True) +def network_requests_mock( + requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): + requests_mock.get(INDEX_URL, text=maven_index_full) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + requests_mock.get(URL_POM_3, text=maven_pom_3) + + +def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" + # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, @@ -102,43 +104,25 @@ def test_maven_full_listing( incremental=False, ) - # Set up test. - index_text = maven_index - requests_mock.get(INDEX_URL, text=index_text) - requests_mock.get(URL_POM_1, text=maven_pom_1) - requests_mock.get(URL_POM_2, text=maven_pom_2) - - # Then run the lister. stats = lister.run() # Start test checks. - assert stats.pages == 4 - assert stats.origins == 4 + assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] - assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) + + # 3 git origins + 1 maven origin with 2 releases (one per jar) + assert len(origin_urls) == 4 + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - last_update_src = iso8601.parse_date(src.get("time")).astimezone( - tz=timezone.utc - ) - assert last_update_src == origin.last_update - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - assert MVN_URL == artifact["base_url"] - break - else: - raise AssertionError( - "Could not find scheduler origin in referenced origins." - ) + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) + scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 @@ -148,10 +132,7 @@ def test_maven_full_listing( def test_maven_full_listing_malformed( swh_scheduler, requests_mock, - mocker, - maven_index, maven_pom_1_malformed, - maven_pom_2, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" @@ -165,39 +146,28 @@ def test_maven_full_listing_malformed( ) # Set up test. - index_text = maven_index - requests_mock.get(INDEX_URL, text=index_text) requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) - requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. - assert stats.pages == 4 - assert stats.origins == 3 + assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] - LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) - assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + # 2 git origins + 1 maven origin with 2 releases (one per jar) + assert len(origin_urls) == 3 + assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - assert MVN_URL == artifact["base_url"] - break - else: - raise AssertionError( - "Could not find scheduler origin in referenced origins." - ) + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) + scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 @@ -207,12 +177,8 @@ def test_maven_full_listing_malformed( def test_maven_incremental_listing( swh_scheduler, requests_mock, - mocker, - maven_index, - maven_index_incr, - maven_pom_1, - maven_pom_2, - maven_pom_3, + maven_index_full, + maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" @@ -226,9 +192,7 @@ def test_maven_incremental_listing( ) # Set up test. - requests_mock.get(INDEX_URL, text=maven_index) - requests_mock.get(URL_POM_1, text=maven_pom_1) - requests_mock.get(URL_POM_2, text=maven_pom_2) + requests_mock.get(INDEX_URL, text=maven_index_incr_first) # Then run the lister. stats = lister.run() @@ -236,8 +200,20 @@ def test_maven_incremental_listing( # Start test checks. assert lister.incremental assert lister.updated - assert stats.pages == 4 - assert stats.origins == 4 + assert stats.pages == 2 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + origin_urls = [origin.url for origin in scheduler_origins] + + # 1 git origins + 1 maven origin with 1 release (one per jar) + assert len(origin_urls) == 2 + assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) + assert last_update_src == origin.last_update + assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( @@ -250,12 +226,11 @@ def test_maven_incremental_listing( scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None - assert scheduler_state.last_seen_doc == 3 - assert scheduler_state.last_seen_pom == 3 + assert scheduler_state.last_seen_doc == 1 + assert scheduler_state.last_seen_pom == 1 # Set up test. - requests_mock.get(INDEX_URL, text=maven_index_incr) - requests_mock.get(URL_POM_3, text=maven_pom_3) + requests_mock.get(INDEX_URL, text=maven_index_full) # Then run the lister. stats = lister.run() @@ -263,26 +238,19 @@ def test_maven_incremental_listing( # Start test checks. assert lister.incremental assert lister.updated - assert stats.pages == 1 - assert stats.origins == 1 + assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: - if src.get("url") == origin.url: - artifact = origin.extra_loader_arguments["artifacts"][0] - assert src.get("time") == artifact["time"] - assert src.get("gid") == artifact["gid"] - assert src.get("aid") == artifact["aid"] - assert src.get("version") == artifact["version"] - break - else: - raise AssertionError + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None @@ -291,9 +259,7 @@ def test_maven_incremental_listing( @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) -def test_maven_list_http_error_on_index_read( - swh_scheduler, requests_mock, mocker, maven_index, http_code -): +def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) @@ -307,21 +273,20 @@ def test_maven_list_http_error_on_index_read( @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( - swh_scheduler, requests_mock, mocker, maven_index, http_code, maven_pom_2 + swh_scheduler, + requests_mock, + http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. - requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, status_code=http_code) - requests_mock.get(URL_POM_2, text=maven_pom_2) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() - # If the maven_index step succeeded but not the get_pom step, - # then we get only the 2 maven-jar origins (and not the 2 additional - # src origins). + # If the maven_index_full step succeeded but not the get_pom step, + # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 3