nixguix: Improve git origins detection

Without this, some git repositories are detected as files (partly due to upstream
misqualification). This makes an extra effort to detect those, to avoid sending
noise to loaders.

This also factors the common code that builds VCS artifacts into a helper, to avoid duplication.

Related to T3781
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-10-04 23:25:20 +02:00
parent 2fbd66778f
commit ff80a91f0a
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
3 changed files with 44 additions and 18 deletions

View file

@ -293,6 +293,21 @@ class NixGuixLister(StatelessLister[PageResult]):
else None
)
def build_artifact(
    self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None
) -> Optional[Tuple[ArtifactType, VCS]]:
    """Build a canonicalized vcs artifact when possible.

    Args:
        artifact_url: URL of the vcs origin as read from the manifest.
        artifact_type: Value stored as the ``type`` of the resulting ``VCS``.
        artifact_ref: Optional value stored as the ``ref`` of the resulting ``VCS``.

    Returns:
        A ``(ArtifactType.VCS, VCS)`` pair, or ``None`` when no usable origin
        url could be determined.

    """
    if self.github_session:
        # Let the github session resolve the url to its canonical form.
        canonical_url = self.github_session.get_canonical_url(artifact_url)
    else:
        # No github session configured: keep the url untouched.
        canonical_url = artifact_url

    if canonical_url:
        vcs = VCS(origin=canonical_url, type=artifact_type, ref=artifact_ref)
        return ArtifactType.VCS, vcs

    # Falsy origin (e.g. canonicalization yielded nothing): nothing to build.
    return None
def get_pages(self) -> Iterator[PageResult]:
"""Yield one page per "typed" origin referenced in manifest."""
# fetch and parse the manifest...
@ -321,16 +336,12 @@ class NixGuixLister(StatelessLister[PageResult]):
if artifact_type in VCS_SUPPORTED:
plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
artifact_url = (
self.github_session.get_canonical_url(plain_url)
if self.github_session
else plain_url
built_artifact = self.build_artifact(
plain_url, artifact_type, plain_ref
)
if not artifact_url:
if not built_artifact:
continue
yield ArtifactType.VCS, VCS(
origin=artifact_url, type=artifact_type, ref=plain_ref
)
yield built_artifact
elif artifact_type == "url":
# It's either a tarball or a file
origin_urls = artifact.get("urls")
@ -354,6 +365,13 @@ class NixGuixLister(StatelessLister[PageResult]):
origin, *fallback_urls = urls
if origin.endswith(".git"):
built_artifact = self.build_artifact(origin, "git")
if not built_artifact:
continue
yield built_artifact
continue
integrity = artifact.get("integrity")
if integrity is None:
logger.warning("Skipping url <%s>: missing integrity field", origin)
@ -367,17 +385,12 @@ class NixGuixLister(StatelessLister[PageResult]):
)
urlparsed = urlparse(origin)
artifact_type = urlparsed.scheme
if artifact_type in VCS_SUPPORTED:
artifact_url = (
self.github_session.get_canonical_url(origin)
if self.github_session
else origin
)
if not artifact_url:
built_artifact = self.build_artifact(origin, artifact_type)
if not built_artifact:
continue
yield ArtifactType.VCS, VCS(
origin=artifact_url, type=artifact_type
)
yield built_artifact
else:
logger.warning(
"Skipping url <%s>: undetected remote artifact type", origin

View file

@ -57,6 +57,13 @@
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=",
"inferredFetcher": "fetchzip"
},
{
"type": "url",
"urls": [
"https://github.com/trie/trie.git"
],
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g="
},
{
"type": "git",
"git_url": "https://example.org/pali/0xffff",

View file

@ -169,6 +169,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
url,
[{"json": response}],
)
requests_mock.get(
"https://api.github.com/repos/trie/trie",
[{"json": {"html_url": "https://github.com/trie/trie.git"}}],
)
expected_visit_types = defaultdict(int)
# origin upstream is added as origin
@ -186,7 +190,9 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
expected_visit_types[artifact_type] += 1
elif artifact_type == "url":
url = artifact["urls"][0]
if url.endswith(".c") or url.endswith(".txt"):
if url.endswith(".git"):
expected_visit_types["git"] += 1
elif url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1