nixguix: Deal with mistyped origins
Some origins are listed as urls while they are not. They are possibly vcs. So this commit tries to detect and and deal with those if possible. If not possible, they are skipped. Related to T3781 Related to P1470
This commit is contained in:
parent
1b4fe51f62
commit
a94b75f366
4 changed files with 107 additions and 18 deletions
|
@ -41,6 +41,13 @@ class ArtifactNatureUndetected(ValueError):
|
|||
pass
|
||||
|
||||
|
||||
class ArtifactNatureMistyped(ValueError):
|
||||
"""Raised when a remote artifact's neither a tarball nor a file. It's probably a
|
||||
misconfiguration in the manifest that badly typed a vcs repository."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class OriginUpstream:
|
||||
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
|
||||
|
@ -73,10 +80,10 @@ class VCS:
|
|||
|
||||
origin: str
|
||||
"""Origin url of the vcs"""
|
||||
ref: Optional[str]
|
||||
"""Reference either a svn commit id, a git commit, ..."""
|
||||
type: str
|
||||
"""Type of (d)vcs, e.g. svn, git, hg, ..."""
|
||||
ref: Optional[str] = None
|
||||
"""Reference either a svn commit id, a git commit, ..."""
|
||||
|
||||
|
||||
class ArtifactType(Enum):
|
||||
|
@ -109,6 +116,8 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
|
|||
Raises:
|
||||
ArtifactNatureUndetected when the artifact's nature cannot be detected out
|
||||
of its url
|
||||
ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
|
||||
the caller to do what's right with it.
|
||||
|
||||
Returns: A tuple (bool, url). The boolean represents whether the url is an archive
|
||||
or not. The second parameter is the actual url once the head request is issued
|
||||
|
@ -123,27 +132,37 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
|
|||
IndexError in case no extension is available
|
||||
|
||||
"""
|
||||
return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
|
||||
urlparsed = urlparse(url)
|
||||
if urlparsed.scheme not in ("http", "https", "ftp"):
|
||||
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
|
||||
return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
|
||||
|
||||
index = random.randrange(len(urls))
|
||||
url = urls[index]
|
||||
|
||||
try:
|
||||
is_tar = _is_tarball(url)
|
||||
return is_tar, urls[0]
|
||||
except IndexError:
|
||||
if request is None:
|
||||
raise ArtifactNatureUndetected(
|
||||
"Cannot determine artifact type from url %s", url
|
||||
f"Cannot determine artifact type from url <{url}>"
|
||||
)
|
||||
logger.warning(
|
||||
"Cannot detect extension for '%s'. Fallback to http head query",
|
||||
"Cannot detect extension for <%s>. Fallback to http head query",
|
||||
url,
|
||||
)
|
||||
response = request.head(url)
|
||||
|
||||
try:
|
||||
response = request.head(url)
|
||||
except requests.exceptions.InvalidSchema:
|
||||
raise ArtifactNatureUndetected(
|
||||
f"Cannot determine artifact type from url <{url}>"
|
||||
)
|
||||
|
||||
if not response.ok or response.status_code == 404:
|
||||
raise ArtifactNatureUndetected(
|
||||
"Cannot determine artifact type from url %s", url
|
||||
f"Cannot determine artifact type from url <{url}>"
|
||||
)
|
||||
location = response.headers.get("Location")
|
||||
if location: # It's not always present
|
||||
|
@ -154,7 +173,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
|
|||
return _is_tarball(location), location
|
||||
except IndexError:
|
||||
logger.warning(
|
||||
"Still cannot detect extension through location '%s'...",
|
||||
"Still cannot detect extension through location <%s>...",
|
||||
url,
|
||||
)
|
||||
|
||||
|
@ -166,7 +185,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
|
|||
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
|
||||
|
||||
raise ArtifactNatureUndetected(
|
||||
"Cannot determine artifact type from url %s", url
|
||||
f"Cannot determine artifact type from url <{url}>"
|
||||
)
|
||||
|
||||
|
||||
|
@ -287,23 +306,48 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
urls = artifact.get("urls")
|
||||
if not urls:
|
||||
# Nothing to fetch
|
||||
logger.warning("Skipping url '%s': empty artifact", artifact)
|
||||
logger.warning("Skipping url <%s>: empty artifact", artifact)
|
||||
continue
|
||||
|
||||
assert urls is not None
|
||||
|
||||
# Deal with misplaced origins
|
||||
|
||||
# FIXME: T3294: Fix missing scheme in urls
|
||||
origin, *fallback_urls = urls
|
||||
|
||||
integrity = artifact.get("integrity")
|
||||
if integrity is None:
|
||||
logger.warning("Skipping url '%s': missing integrity field", origin)
|
||||
logger.warning("Skipping url <%s>: missing integrity field", origin)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_tar, origin = is_tarball(urls, self.session)
|
||||
except ArtifactNatureMistyped:
|
||||
logger.warning(
|
||||
"Mistyped url <%s>: trying to deal with it properly", origin
|
||||
)
|
||||
urlparsed = urlparse(origin)
|
||||
artifact_type = urlparsed.scheme
|
||||
if artifact_type in VCS_SUPPORTED:
|
||||
artifact_url = (
|
||||
self.github_session.get_canonical_url(origin)
|
||||
if self.github_session
|
||||
else origin
|
||||
)
|
||||
if not artifact_url:
|
||||
continue
|
||||
yield ArtifactType.VCS, VCS(
|
||||
origin=artifact_url, type=artifact_type
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"Skipping url <%s>: undetected remote artifact type", origin
|
||||
)
|
||||
continue
|
||||
except ArtifactNatureUndetected:
|
||||
logger.warning(
|
||||
"Skipping url '%s': undetected remote artifact type", origin
|
||||
"Skipping url <%s>: undetected remote artifact type", origin
|
||||
)
|
||||
continue
|
||||
|
||||
|
@ -325,7 +369,7 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"Skipping artifact '%s': unsupported type %s",
|
||||
"Skipping artifact <%s>: unsupported type %s",
|
||||
artifact,
|
||||
artifact_type,
|
||||
)
|
||||
|
|
|
@ -12,6 +12,18 @@
|
|||
{
|
||||
"type": "url",
|
||||
"urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"ftp://ftp.ourproject.org/file-with-no-extension"
|
||||
],
|
||||
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ],
|
||||
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
|
||||
}
|
||||
],
|
||||
"version":"1",
|
||||
|
|
|
@ -22,6 +22,13 @@
|
|||
],
|
||||
"integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
|
||||
],
|
||||
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
|
@ -45,6 +52,11 @@
|
|||
"type": "svn",
|
||||
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
|
||||
"svn_revision": 39057
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
|
||||
"integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
|
||||
}
|
||||
],
|
||||
"version": "1",
|
||||
|
|
|
@ -15,6 +15,7 @@ import requests
|
|||
from swh.lister import TARBALL_EXTENSIONS
|
||||
from swh.lister.nixguix.lister import (
|
||||
POSSIBLE_TARBALL_MIMETYPES,
|
||||
ArtifactNatureMistyped,
|
||||
ArtifactNatureUndetected,
|
||||
NixGuixLister,
|
||||
is_tarball,
|
||||
|
@ -31,19 +32,20 @@ def page_response(datadir, instance: str) -> List[Dict]:
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"urls",
|
||||
"tarballs",
|
||||
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
|
||||
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
|
||||
)
|
||||
def test_is_tarball_simple(urls):
|
||||
def test_is_tarball_simple(tarballs):
|
||||
"""Simple check on tarball should discriminate betwenn tarball and file"""
|
||||
urls = [f"https://example.org/{tarball}" for tarball in tarballs]
|
||||
is_tar, origin = is_tarball(urls)
|
||||
assert is_tar is True
|
||||
assert origin == urls[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"urls",
|
||||
"files",
|
||||
[
|
||||
["abc.lisp"],
|
||||
["one.abc", "two.bcd"],
|
||||
|
@ -52,8 +54,9 @@ def test_is_tarball_simple(urls):
|
|||
["config.nix", "flakes.nix"],
|
||||
],
|
||||
)
|
||||
def test_is_tarball_simple_not_tarball(urls):
|
||||
def test_is_tarball_simple_not_tarball(files):
|
||||
"""Simple check on tarball should discriminate betwenn tarball and file"""
|
||||
urls = [f"http://example.org/{file}" for file in files]
|
||||
is_tar, origin = is_tarball(urls)
|
||||
assert is_tar is False
|
||||
assert origin == urls[0]
|
||||
|
@ -65,7 +68,7 @@ def test_is_tarball_complex_with_no_result(requests_mock):
|
|||
url = "https://example.org/crates/package/download"
|
||||
urls = [url]
|
||||
with pytest.raises(ArtifactNatureUndetected):
|
||||
is_tarball(url) # no request parameter, this cannot fallback, raises
|
||||
is_tarball(urls) # no request parameter, this cannot fallback, raises
|
||||
|
||||
with pytest.raises(ArtifactNatureUndetected):
|
||||
requests_mock.head(
|
||||
|
@ -87,6 +90,16 @@ def test_is_tarball_complex_with_no_result(requests_mock):
|
|||
)
|
||||
is_tarball(urls, requests)
|
||||
|
||||
with pytest.raises(ArtifactNatureMistyped):
|
||||
is_tarball(["foo://example.org/unsupported-scheme"])
|
||||
|
||||
with pytest.raises(ArtifactNatureMistyped):
|
||||
fallback_url = "foo://example.org/unsupported-scheme"
|
||||
requests_mock.head(
|
||||
url, headers={"location": fallback_url} # still no extension, cannot detect
|
||||
)
|
||||
is_tarball(urls, requests)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"fallback_url, expected_result",
|
||||
|
@ -162,6 +175,8 @@ def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
|
|||
url = artifact["urls"][0]
|
||||
if url.endswith(".c") or url.endswith(".txt"):
|
||||
expected_visit_types["content"] += 1
|
||||
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
|
||||
expected_visit_types["svn"] += 1
|
||||
else:
|
||||
expected_visit_types["directory"] += 1
|
||||
|
||||
|
@ -214,6 +229,12 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
|
|||
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
|
||||
status_code=404,
|
||||
)
|
||||
# Will raise for that origin, this will get ignored as we cannot determine anything
|
||||
# from its name
|
||||
requests_mock.head(
|
||||
"ftp://ftp.ourproject.org/file-with-no-extension",
|
||||
exc=requests.exceptions.InvalidSchema,
|
||||
)
|
||||
|
||||
listed_result = lister.run()
|
||||
# only the origin upstream is listed, every other entries are unsupported or incomplete
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue