From 2fbd66778f3286c77149796339426aeccd11c1d0 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Tue, 4 Oct 2022 20:42:20 +0200 Subject: [PATCH] nixguix: Improve tarball detection Without this, some tarballs hidden within query parameters are not detected. This does some extra effort to detect those to avoid sending noise to loaders. Related to T3781 --- swh/lister/nixguix/lister.py | 38 +++++++++++++++++++++---- swh/lister/nixguix/tests/test_lister.py | 14 ++++++++- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 2623ef2..0b95922 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -22,7 +22,7 @@ import logging from pathlib import Path import random from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import urlparse +from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError @@ -43,7 +43,7 @@ class ArtifactNatureUndetected(ValueError): class ArtifactNatureMistyped(ValueError): - """Raised when a remote artifact's neither a tarball nor a file. + """Raised when a remote artifact is neither a tarball nor a file. Error of this type are' probably a misconfiguration in the manifest generation that badly typed a vcs repository. @@ -53,6 +53,16 @@ class ArtifactNatureMistyped(ValueError): pass +class ArtifactWithoutExtension(ValueError): + """Raised when an artifact nature cannot be determined by its name. + + This exception is solely for internal use of the :meth:`is_tarball` method. + + """ + + pass + + class ChecksumsComputation(Enum): """The possible artifact types listed out of the manifest.""" @@ -140,20 +150,36 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st """Determine out of an extension whether url is a tarball. Raises: - IndexError in case no extension is available + ArtifactWithoutExtension in case no extension is available """ urlparsed = urlparse(url) if urlparsed.scheme not in ("http", "https", "ftp"): raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") - return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS + + errors = [] + query_params = dict(parse_qsl(urlparsed.query)) + for path in [query_params.get(key) for key in ["f", "file", "url", "name"]] + [ + urlparsed.path + ]: + if not path: + continue + try: + file_ = Path(path).suffixes[-1] + break + except IndexError as e: + errors.append(ArtifactWithoutExtension(e)) + + if errors: + raise errors[-1] + return file_.lstrip(".") in TARBALL_EXTENSIONS index = random.randrange(len(urls)) url = urls[index] try: return _is_tarball(url), urls[0] - except IndexError: + except ArtifactWithoutExtension: if request is None: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" @@ -181,7 +207,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st # FIXME: location is also returned as it's considered the true origin, # true enough? return _is_tarball(location), location - except IndexError: + except ArtifactWithoutExtension: logger.warning( "Still cannot detect extension through location <%s>...", url, diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index 6ed4d1d..2f6bf09 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -38,13 +38,25 @@ def page_response(datadir, instance: str) -> List[Dict]: + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) def test_is_tarball_simple(tarballs): - """Simple check on tarball should discriminate betwenn tarball and file""" + """Simple check on tarball should discriminate between tarball and file""" urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True assert origin == urls[0] +@pytest.mark.parametrize( + "query_param", + ["file", "f", "url", "name"], +) +def test_is_tarball_not_so_simple(query_param): + """More involved check on tarball should discriminate between tarball and file""" + url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz" + is_tar, origin = is_tarball([url]) + assert is_tar is True + assert origin == url + + @pytest.mark.parametrize( "files", [