nixguix: Improve tarball detection

Without this, some tarballs hidden within query parameters are not detected. This does
some extra effort to detect those to avoid sending noise to loaders.

Related to T3781
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-10-04 20:42:20 +02:00
parent 944d4b5b60
commit 2fbd66778f
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
2 changed files with 45 additions and 7 deletions

View file

@ -22,7 +22,7 @@ import logging
from pathlib import Path
import random
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import urlparse
from urllib.parse import parse_qsl, urlparse
import requests
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
@ -43,7 +43,7 @@ class ArtifactNatureUndetected(ValueError):
class ArtifactNatureMistyped(ValueError):
"""Raised when a remote artifact's neither a tarball nor a file.
"""Raised when a remote artifact is neither a tarball nor a file.
Error of this type are' probably a misconfiguration in the manifest generation that
badly typed a vcs repository.
@ -53,6 +53,16 @@ class ArtifactNatureMistyped(ValueError):
pass
class ArtifactWithoutExtension(ValueError):
"""Raised when an artifact nature cannot be determined by its name.
This exception is solely for internal use of the :meth:`is_tarball` method.
"""
pass
class ChecksumsComputation(Enum):
"""The possible artifact types listed out of the manifest."""
@ -140,20 +150,36 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
"""Determine out of an extension whether url is a tarball.
Raises:
IndexError in case no extension is available
ArtifactWithoutExtension in case no extension is available
"""
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
errors = []
query_params = dict(parse_qsl(urlparsed.query))
for path in [query_params.get(key) for key in ["f", "file", "url", "name"]] + [
urlparsed.path
]:
if not path:
continue
try:
file_ = Path(path).suffixes[-1]
break
except IndexError as e:
errors.append(ArtifactWithoutExtension(e))
if errors:
raise errors[-1]
return file_.lstrip(".") in TARBALL_EXTENSIONS
index = random.randrange(len(urls))
url = urls[index]
try:
return _is_tarball(url), urls[0]
except IndexError:
except ArtifactWithoutExtension:
if request is None:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
@ -181,7 +207,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
# FIXME: location is also returned as it's considered the true origin,
# true enough?
return _is_tarball(location), location
except IndexError:
except ArtifactWithoutExtension:
logger.warning(
"Still cannot detect extension through location <%s>...",
url,

View file

@ -38,13 +38,25 @@ def page_response(datadir, instance: str) -> List[Dict]:
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
def test_is_tarball_simple(tarballs):
"""Simple check on tarball should discriminate betwenn tarball and file"""
"""Simple check on tarball should discriminate between tarball and file"""
urls = [f"https://example.org/{tarball}" for tarball in tarballs]
is_tar, origin = is_tarball(urls)
assert is_tar is True
assert origin == urls[0]
@pytest.mark.parametrize(
"query_param",
["file", "f", "url", "name"],
)
def test_is_tarball_not_so_simple(query_param):
"""More involved check on tarball should discriminate between tarball and file"""
url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz"
is_tar, origin = is_tarball([url])
assert is_tar is True
assert origin == url
@pytest.mark.parametrize(
"files",
[