nixguix: Improve git origins detection
Without this, some git repositories are detected as file (due to upstream misqualification too). This does some extra effort to detect those to avoid sending noise to loaders. This also refactors some common code to build vcs artifacts to avoid duplication. Related to T3781
This commit is contained in:
parent
2fbd66778f
commit
ff80a91f0a
3 changed files with 44 additions and 18 deletions
|
@ -293,6 +293,21 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
else None
|
||||
)
|
||||
|
||||
def build_artifact(
|
||||
self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None
|
||||
) -> Optional[Tuple[ArtifactType, VCS]]:
|
||||
"""Build a canonicalized vcs artifact when possible."""
|
||||
origin = (
|
||||
self.github_session.get_canonical_url(artifact_url)
|
||||
if self.github_session
|
||||
else artifact_url
|
||||
)
|
||||
if not origin:
|
||||
return None
|
||||
return ArtifactType.VCS, VCS(
|
||||
origin=origin, type=artifact_type, ref=artifact_ref
|
||||
)
|
||||
|
||||
def get_pages(self) -> Iterator[PageResult]:
|
||||
"""Yield one page per "typed" origin referenced in manifest."""
|
||||
# fetch and parse the manifest...
|
||||
|
@ -321,16 +336,12 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
if artifact_type in VCS_SUPPORTED:
|
||||
plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
|
||||
plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
|
||||
artifact_url = (
|
||||
self.github_session.get_canonical_url(plain_url)
|
||||
if self.github_session
|
||||
else plain_url
|
||||
built_artifact = self.build_artifact(
|
||||
plain_url, artifact_type, plain_ref
|
||||
)
|
||||
if not artifact_url:
|
||||
if not built_artifact:
|
||||
continue
|
||||
yield ArtifactType.VCS, VCS(
|
||||
origin=artifact_url, type=artifact_type, ref=plain_ref
|
||||
)
|
||||
yield built_artifact
|
||||
elif artifact_type == "url":
|
||||
# It's either a tarball or a file
|
||||
origin_urls = artifact.get("urls")
|
||||
|
@ -354,6 +365,13 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
|
||||
origin, *fallback_urls = urls
|
||||
|
||||
if origin.endswith(".git"):
|
||||
built_artifact = self.build_artifact(origin, "git")
|
||||
if not built_artifact:
|
||||
continue
|
||||
yield built_artifact
|
||||
continue
|
||||
|
||||
integrity = artifact.get("integrity")
|
||||
if integrity is None:
|
||||
logger.warning("Skipping url <%s>: missing integrity field", origin)
|
||||
|
@ -367,17 +385,12 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
)
|
||||
urlparsed = urlparse(origin)
|
||||
artifact_type = urlparsed.scheme
|
||||
|
||||
if artifact_type in VCS_SUPPORTED:
|
||||
artifact_url = (
|
||||
self.github_session.get_canonical_url(origin)
|
||||
if self.github_session
|
||||
else origin
|
||||
)
|
||||
if not artifact_url:
|
||||
built_artifact = self.build_artifact(origin, artifact_type)
|
||||
if not built_artifact:
|
||||
continue
|
||||
yield ArtifactType.VCS, VCS(
|
||||
origin=artifact_url, type=artifact_type
|
||||
)
|
||||
yield built_artifact
|
||||
else:
|
||||
logger.warning(
|
||||
"Skipping url <%s>: undetected remote artifact type", origin
|
||||
|
|
|
@ -57,6 +57,13 @@
|
|||
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=",
|
||||
"inferredFetcher": "fetchzip"
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://github.com/trie/trie.git"
|
||||
],
|
||||
"integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g="
|
||||
},
|
||||
{
|
||||
"type": "git",
|
||||
"git_url": "https://example.org/pali/0xffff",
|
||||
|
|
|
@ -169,6 +169,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
|
|||
url,
|
||||
[{"json": response}],
|
||||
)
|
||||
requests_mock.get(
|
||||
"https://api.github.com/repos/trie/trie",
|
||||
[{"json": {"html_url": "https://github.com/trie/trie.git"}}],
|
||||
)
|
||||
|
||||
expected_visit_types = defaultdict(int)
|
||||
# origin upstream is added as origin
|
||||
|
@ -186,7 +190,9 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
|
|||
expected_visit_types[artifact_type] += 1
|
||||
elif artifact_type == "url":
|
||||
url = artifact["urls"][0]
|
||||
if url.endswith(".c") or url.endswith(".txt"):
|
||||
if url.endswith(".git"):
|
||||
expected_visit_types["git"] += 1
|
||||
elif url.endswith(".c") or url.endswith(".txt"):
|
||||
expected_visit_types["content"] += 1
|
||||
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
|
||||
expected_visit_types["svn"] += 1
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue