nixguix: Allow lister to ignore specific extensions
Those extensions can be extended through configuration. They default to some binary formats already encountered during docker runs. Related to T3781
This commit is contained in:
parent
d96a39d5b0
commit
ca4ab7f277
3 changed files with 216 additions and 15 deletions
|
@ -37,6 +37,21 @@ from swh.scheduler.model import ListedOrigin
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Extensions of origins the lister skips by default: binary artifacts
# (executables, installers, disk images, ...) encountered in the wild,
# which carry no source code to archive.
DEFAULT_EXTENSIONS_TO_IGNORE = [
    "AppImage",
    "bin",
    "exe",
    "iso",
    "linux64",
    "msi",
    "png",
    "dic",
    "deb",
    "rpm",
]
|
||||
|
||||
|
||||
class ArtifactNatureUndetected(ValueError):
    """Raised when it is impossible to detect whether a remote artifact is a
    tarball or a plain file."""
|
||||
|
||||
|
@ -55,11 +70,7 @@ class ArtifactNatureMistyped(ValueError):
|
|||
|
||||
|
||||
class ArtifactWithoutExtension(ValueError):
    """Raised when an artifact nature cannot be determined by its name."""
|
||||
|
||||
|
@ -125,6 +136,22 @@ VCS_SUPPORTED = ("git", "svn", "hg")
|
|||
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
|
||||
|
||||
|
||||
def url_endswith(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Check whether the parsed url ends with one of ``extensions``.

    Both the url path and every value of its query string are considered as
    candidates (e.g. ``?file=something.el`` counts as a ``.el`` candidate).

    Note: the match is a suffix match on the last path component's extension,
    so an extension like ``bin`` also matches e.g. ``.cabin`` — presumably an
    accepted looseness for this lister's filtering purpose.

    Raises:
        ArtifactWithoutExtension: when no candidate carries any extension and
            ``raise_when_no_extension`` is True (the default)

    """
    # Candidate filenames: the url path itself plus each query-string value.
    candidates = [Path(urlparsed.path)]
    candidates.extend(Path(value) for _, value in parse_qsl(urlparsed.query))
    if raise_when_no_extension and all(not candidate.suffix for candidate in candidates):
        raise ArtifactWithoutExtension
    suffixes = tuple(extensions)
    return any(candidate.suffix.endswith(suffixes) for candidate in candidates)
|
||||
|
||||
|
||||
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
|
||||
"""Determine whether a list of files actually are tarballs or simple files.
|
||||
|
||||
|
@ -157,13 +184,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
|
|||
urlparsed = urlparse(url)
|
||||
if urlparsed.scheme not in ("http", "https", "ftp"):
|
||||
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
|
||||
|
||||
paths = [
|
||||
Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
|
||||
]
|
||||
if not any(path.suffix != "" for path in paths):
|
||||
raise ArtifactWithoutExtension
|
||||
return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
|
||||
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
|
||||
|
||||
index = random.randrange(len(urls))
|
||||
url = urls[index]
|
||||
|
@ -247,6 +268,10 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
|
||||
response header, and then checks the extension again.
|
||||
|
||||
Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
|
||||
default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
|
||||
This can be used to drop further binary files detected in the wild.
|
||||
|
||||
"""
|
||||
|
||||
LISTER_NAME = "nixguix"
|
||||
|
@ -260,6 +285,7 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
credentials: Optional[CredentialsType] = None,
|
||||
# canonicalize urls, can be turned off during docker runs
|
||||
canonicalize: bool = True,
|
||||
extensions_to_ignore: List[str] = [],
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(
|
||||
|
@ -271,6 +297,7 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
# either full fqdn NixOS/nixpkgs or guix repository urls
|
||||
# maybe add an assert on those specific urls?
|
||||
self.origin_upstream = origin_upstream
|
||||
self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
|
||||
|
||||
self.session = requests.Session()
|
||||
# for testing purposes, we may want to skip this step (e.g. docker run and rate
|
||||
|
@ -435,13 +462,34 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
# 'critical' information about how to recompute the hash (e.g. fs
|
||||
# layout, executable bit, ...)
|
||||
logger.warning(
|
||||
"Skipping artifact <%s>: 'file' artifact of type <%s> is "
|
||||
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
|
||||
" missing information to properly check its integrity",
|
||||
artifact,
|
||||
artifact_type,
|
||||
)
|
||||
continue
|
||||
|
||||
# At this point plenty of heuristics happened and we should have found
|
||||
# the right origin and its nature.
|
||||
|
||||
# Let's check and filter it out if it is to be ignored (if possible).
|
||||
# Some origin urls may not have extension at this point (e.g
|
||||
# http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
|
||||
if url_endswith(
|
||||
urlparse(origin),
|
||||
self.extensions_to_ignore,
|
||||
raise_when_no_extension=False,
|
||||
):
|
||||
logger.warning(
|
||||
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
|
||||
" ignored due to lister configuration. It should ignore"
|
||||
" origins with extension [%s]",
|
||||
origin,
|
||||
artifact_type,
|
||||
",".join(self.extensions_to_ignore),
|
||||
)
|
||||
continue
|
||||
|
||||
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
|
||||
yield ArtifactType.ARTIFACT, Artifact(
|
||||
origin=origin,
|
||||
|
|
|
@ -57,6 +57,123 @@
|
|||
"type": "url",
|
||||
"urls": [ "https://code.9front.org/hg/plan9front" ],
|
||||
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
|
||||
},
|
||||
{
|
||||
"outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
|
||||
],
|
||||
"integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
|
||||
],
|
||||
"integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
|
||||
],
|
||||
"integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
|
||||
],
|
||||
"integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
|
||||
],
|
||||
"integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
|
||||
],
|
||||
"integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png"
|
||||
],
|
||||
"integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"http://xuxen.eus/static/hunspell/eu_ES.dic"
|
||||
],
|
||||
"integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm"
|
||||
],
|
||||
"integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9",
|
||||
"outputHashAlgo": "sha256",
|
||||
"outputHashMode": "flat",
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb"
|
||||
],
|
||||
"integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=",
|
||||
"inferredFetcher": "unclassified"
|
||||
},
|
||||
{
|
||||
"type": "url",
|
||||
"urls": [
|
||||
"https://elpa.gnu.org/packages/zones.foobar"
|
||||
],
|
||||
"integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI="
|
||||
}
|
||||
],
|
||||
"version":"1",
|
||||
|
|
|
@ -8,6 +8,7 @@ import json
|
|||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
@ -15,11 +16,14 @@ from requests.exceptions import ConnectionError, InvalidSchema, SSLError
|
|||
|
||||
from swh.lister import TARBALL_EXTENSIONS
|
||||
from swh.lister.nixguix.lister import (
|
||||
DEFAULT_EXTENSIONS_TO_IGNORE,
|
||||
POSSIBLE_TARBALL_MIMETYPES,
|
||||
ArtifactNatureMistyped,
|
||||
ArtifactNatureUndetected,
|
||||
ArtifactWithoutExtension,
|
||||
NixGuixLister,
|
||||
is_tarball,
|
||||
url_endswith,
|
||||
)
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
@ -43,6 +47,33 @@ def page_response(datadir, instance: str = "success") -> List[Dict]:
|
|||
return json.loads(datapath.read_text()) if datapath.exists else []
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "name,expected_result",
    [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
    + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
    + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
    + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
)
def test_url_endswith(name, expected_result):
    """It should detect whether url or query params of the urls ends with extensions"""
    parsed = urlparse(f"https://example.org/{name}")
    # Extensions may appear either in the url path or in a query value.
    actual = url_endswith(
        parsed,
        TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
        raise_when_no_extension=False,
    )
    assert actual is expected_result
|
||||
|
||||
|
||||
def test_url_endswith_raise():
    """It should raise when the tested url has no extension"""
    parsed = urlparse("https://example.org/foo?two=two&three=three")
    # Neither the path nor any query value carries an extension here.
    with pytest.raises(ArtifactWithoutExtension):
        url_endswith(parsed, ["unimportant"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"tarballs",
|
||||
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
|
||||
|
@ -254,10 +285,15 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
|
|||
|
||||
|
||||
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
|
||||
"""NixGuixLister should ignore unsupported or incomplete origins"""
|
||||
"""NixGuixLister should ignore unsupported or incomplete or to ignore origins"""
|
||||
url = SOURCES["nixpkgs"]["manifest"]
|
||||
origin_upstream = SOURCES["nixpkgs"]["repo"]
|
||||
lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
|
||||
lister = NixGuixLister(
|
||||
swh_scheduler,
|
||||
url=url,
|
||||
origin_upstream=origin_upstream,
|
||||
extensions_to_ignore=["foobar"],
|
||||
)
|
||||
|
||||
response = page_response(datadir, "failure")
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue