From ff80a91f0af81a067ea2c455b7fd2750ff81761b Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Tue, 4 Oct 2022 23:25:20 +0200 Subject: [PATCH] nixguix: Improve git origins detection Without this, some git repositories are detected as file (due to upstream misqualification too). This does some extra effort to detect those to avoid sending noise to loaders. This also refactors some common code to build vcs artifacts to avoid duplication. Related to T3781 --- swh/lister/nixguix/lister.py | 47 ++++++++++++------- .../tests/data/nixpkgs-swh_sources.json | 7 +++ swh/lister/nixguix/tests/test_lister.py | 8 +++- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 0b95922..46ae454 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -293,6 +293,21 @@ class NixGuixLister(StatelessLister[PageResult]): else None ) + def build_artifact( + self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None + ) -> Optional[Tuple[ArtifactType, VCS]]: + """Build a canonicalized vcs artifact when possible.""" + origin = ( + self.github_session.get_canonical_url(artifact_url) + if self.github_session + else artifact_url + ) + if not origin: + return None + return ArtifactType.VCS, VCS( + origin=origin, type=artifact_type, ref=artifact_ref + ) + def get_pages(self) -> Iterator[PageResult]: """Yield one page per "typed" origin referenced in manifest.""" # fetch and parse the manifest... @@ -321,16 +336,12 @@ class NixGuixLister(StatelessLister[PageResult]): if artifact_type in VCS_SUPPORTED: plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]] plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]] - artifact_url = ( - self.github_session.get_canonical_url(plain_url) - if self.github_session - else plain_url + built_artifact = self.build_artifact( + plain_url, artifact_type, plain_ref ) - if not artifact_url: + if not built_artifact: continue - yield ArtifactType.VCS, VCS( - origin=artifact_url, type=artifact_type, ref=plain_ref - ) + yield built_artifact elif artifact_type == "url": # It's either a tarball or a file origin_urls = artifact.get("urls") @@ -354,6 +365,13 @@ class NixGuixLister(StatelessLister[PageResult]): origin, *fallback_urls = urls + if origin.endswith(".git"): + built_artifact = self.build_artifact(origin, "git") + if not built_artifact: + continue + yield built_artifact + continue + integrity = artifact.get("integrity") if integrity is None: logger.warning("Skipping url <%s>: missing integrity field", origin) @@ -367,17 +385,12 @@ class NixGuixLister(StatelessLister[PageResult]): ) urlparsed = urlparse(origin) artifact_type = urlparsed.scheme + if artifact_type in VCS_SUPPORTED: - artifact_url = ( - self.github_session.get_canonical_url(origin) - if self.github_session - else origin - ) - if not artifact_url: + built_artifact = self.build_artifact(origin, artifact_type) + if not built_artifact: continue - yield ArtifactType.VCS, VCS( - origin=artifact_url, type=artifact_type - ) + yield built_artifact else: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json index 082a8f1..b43c077 100644 --- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -57,6 +57,13 @@ "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=", "inferredFetcher": "fetchzip" }, + { + "type": "url", + "urls": [ + "https://github.com/trie/trie.git" + ], + "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=" + }, { "type": "git", "git_url": "https://example.org/pali/0xffff", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index 2f6bf09..e497010 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -169,6 +169,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): url, [{"json": response}], ) + requests_mock.get( + "https://api.github.com/repos/trie/trie", + [{"json": {"html_url": "https://github.com/trie/trie.git"}}], + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin @@ -186,7 +190,9 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): expected_visit_types[artifact_type] += 1 elif artifact_type == "url": url = artifact["urls"][0] - if url.endswith(".c") or url.endswith(".txt"): + if url.endswith(".git"): + expected_visit_types["git"] += 1 + elif url.endswith(".c") or url.endswith(".txt"): expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1