From 026fea21da49b30fda9c5adbb24c6d7a8c24c8df Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Tue, 25 Oct 2022 17:04:29 +0200 Subject: [PATCH] nixguix: Deal with edge case url with version instead of extension Prior to this, some urls were detected as file because their version name were wrongly detected as extension, hence not matching tarball extensions. Related to T3781 --- swh/lister/nixguix/lister.py | 25 ++++++++++++++++--- .../nixguix/tests/data/sources-success.json | 7 ++++++ swh/lister/nixguix/tests/test_lister.py | 23 ++++++++++++++--- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index b5b6e89..21cae67 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -22,6 +22,7 @@ from enum import Enum import logging from pathlib import Path import random +import re from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import parse_qsl, urlparse @@ -136,20 +137,36 @@ VCS_SUPPORTED = ("git", "svn", "hg") POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) +PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+") + + def url_endswith( urlparsed, extensions: List[str], raise_when_no_extension: bool = True ) -> bool: - """Determine whether urlparsed ends with one of the extensions. + """Determine whether urlparsed ends with one of the extensions passed as parameter. + + This also account for the edge case of a filename with only a version as name (so no + extension in the end.) Raises: - ArtifactWithoutExtension in case no extension is available and raise_when_no_extension - is True (the default) + ArtifactWithoutExtension in case no extension is available and + raise_when_no_extension is True (the default) """ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)] if raise_when_no_extension and not any(path.suffix != "" for path in paths): raise ArtifactWithoutExtension - return any(path.suffix.endswith(tuple(extensions)) for path in paths) + match = any(path.suffix.endswith(tuple(extensions)) for path in paths) + if match: + return match + # Some false negative can happen (e.g. https:///path/0.1.5)), so make sure + # to catch those + name = Path(urlparsed.path).name + if not PATTERN_VERSION.match(name): + return match + if raise_when_no_extension: + raise ArtifactWithoutExtension + return False def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json index 0e5ccb0..3178159 100644 --- a/swh/lister/nixguix/tests/data/sources-success.json +++ b/swh/lister/nixguix/tests/data/sources-success.json @@ -265,6 +265,13 @@ "https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst" ], "integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18=" + }, + { + "type": "url", + "urls": [ + "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5" + ], + "integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM=" } ], "version": "1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index 78bde96..13ee116 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -52,7 +52,13 @@ def page_response(datadir, instance: str = "success") -> List[Dict]: [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS] + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS] + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE] - + [("two?file=something.el", False), ("foo?two=two&three=three", False)], + + [ + ("two?file=something.el", False), + ("foo?two=two&three=three", False), + ("v1.2.3", False), # with raise_when_no_extension is False + ("2048-game-20151026.1233", False), + ("v2048-game-20151026.1233", False), + ], ) def test_url_endswith(name, expected_result): """It should detect whether url or query params of the urls ends with extensions""" @@ -67,9 +73,12 @@ def test_url_endswith(name, expected_result): ) -def test_url_endswith_raise(): +@pytest.mark.parametrize( + "name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"] +) +def test_url_endswith_raise(name): """It should raise when the tested url has no extension""" - urlparsed = urlparse("https://example.org/foo?two=two&three=three") + urlparsed = urlparse(f"https://example.org/{name}") with pytest.raises(ArtifactWithoutExtension): url_endswith(urlparsed, ["unimportant"]) @@ -225,6 +234,12 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): "Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate" }, ) + requests_mock.head( + "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5", + headers={ + "Content-Type": "application/x-gzip", + }, + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin @@ -248,7 +263,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1 - elif "crates.io" in url: + elif "crates.io" in url or "codeload.github.com" in url: expected_visit_types["directory"] += 1 else: # tarball artifacts expected_visit_types["directory"] += 1