nixguix: Deal with edge case url with version instead of extension
Prior to this, some urls were detected as file because their version name were wrongly detected as extension, hence not matching tarball extensions. Related to T3781
This commit is contained in:
parent
8355fee25f
commit
026fea21da
3 changed files with 47 additions and 8 deletions
|
@ -22,6 +22,7 @@ from enum import Enum
|
|||
import logging
|
||||
from pathlib import Path
|
||||
import random
|
||||
import re
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
||||
from urllib.parse import parse_qsl, urlparse
|
||||
|
||||
|
@ -136,20 +137,36 @@ VCS_SUPPORTED = ("git", "svn", "hg")
|
|||
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
|
||||
|
||||
|
||||
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
|
||||
|
||||
|
||||
def url_endswith(
|
||||
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
|
||||
) -> bool:
|
||||
"""Determine whether urlparsed ends with one of the extensions.
|
||||
"""Determine whether urlparsed ends with one of the extensions passed as parameter.
|
||||
|
||||
This also account for the edge case of a filename with only a version as name (so no
|
||||
extension in the end.)
|
||||
|
||||
Raises:
|
||||
ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
|
||||
is True (the default)
|
||||
ArtifactWithoutExtension in case no extension is available and
|
||||
raise_when_no_extension is True (the default)
|
||||
|
||||
"""
|
||||
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
|
||||
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
|
||||
raise ArtifactWithoutExtension
|
||||
return any(path.suffix.endswith(tuple(extensions)) for path in paths)
|
||||
match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
|
||||
if match:
|
||||
return match
|
||||
# Some false negative can happen (e.g. https://<netloc>/path/0.1.5)), so make sure
|
||||
# to catch those
|
||||
name = Path(urlparsed.path).name
|
||||
if not PATTERN_VERSION.match(name):
|
||||
return match
|
||||
if raise_when_no_extension:
|
||||
raise ArtifactWithoutExtension
|
||||
return False
|
||||
|
||||
|
||||
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
|
||||
|
|
|
@ -265,6 +265,13 @@
|
|||
"https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst"
|
||||
],
|
||||
"integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18="
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5"
|
||||
],
|
||||
"integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM="
|
||||
}
|
||||
],
|
||||
"version": "1",
|
||||
|
|
|
@ -52,7 +52,13 @@ def page_response(datadir, instance: str = "success") -> List[Dict]:
|
|||
[(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
|
||||
+ [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
|
||||
+ [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
|
||||
+ [("two?file=something.el", False), ("foo?two=two&three=three", False)],
|
||||
+ [
|
||||
("two?file=something.el", False),
|
||||
("foo?two=two&three=three", False),
|
||||
("v1.2.3", False), # with raise_when_no_extension is False
|
||||
("2048-game-20151026.1233", False),
|
||||
("v2048-game-20151026.1233", False),
|
||||
],
|
||||
)
|
||||
def test_url_endswith(name, expected_result):
|
||||
"""It should detect whether url or query params of the urls ends with extensions"""
|
||||
|
@ -67,9 +73,12 @@ def test_url_endswith(name, expected_result):
|
|||
)
|
||||
|
||||
|
||||
def test_url_endswith_raise():
|
||||
@pytest.mark.parametrize(
|
||||
"name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"]
|
||||
)
|
||||
def test_url_endswith_raise(name):
|
||||
"""It should raise when the tested url has no extension"""
|
||||
urlparsed = urlparse("https://example.org/foo?two=two&three=three")
|
||||
urlparsed = urlparse(f"https://example.org/{name}")
|
||||
with pytest.raises(ArtifactWithoutExtension):
|
||||
url_endswith(urlparsed, ["unimportant"])
|
||||
|
||||
|
@ -225,6 +234,12 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
|
|||
"Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate"
|
||||
},
|
||||
)
|
||||
requests_mock.head(
|
||||
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5",
|
||||
headers={
|
||||
"Content-Type": "application/x-gzip",
|
||||
},
|
||||
)
|
||||
|
||||
expected_visit_types = defaultdict(int)
|
||||
# origin upstream is added as origin
|
||||
|
@ -248,7 +263,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
|
|||
expected_visit_types["content"] += 1
|
||||
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
|
||||
expected_visit_types["svn"] += 1
|
||||
elif "crates.io" in url:
|
||||
elif "crates.io" in url or "codeload.github.com" in url:
|
||||
expected_visit_types["directory"] += 1
|
||||
else: # tarball artifacts
|
||||
expected_visit_types["directory"] += 1
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue