Move tarball validation functions from nixguix to utils
This commit is contained in:
parent
c0dc8edb05
commit
6618cf341c
3 changed files with 216 additions and 193 deletions
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2020-2023 The Software Heritage developers
|
||||
# Copyright (C) 2020-2024 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -20,18 +20,21 @@ import binascii
|
|||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import random
|
||||
import re
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
||||
from urllib.parse import parse_qsl, urlparse
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
|
||||
|
||||
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
|
||||
from swh.lister import TARBALL_EXTENSIONS
|
||||
from swh.lister.pattern import CredentialsType, StatelessLister
|
||||
from swh.lister.utils import (
|
||||
ArtifactNatureMistyped,
|
||||
ArtifactNatureUndetected,
|
||||
is_tarball,
|
||||
url_contains_tarball_filename,
|
||||
)
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -52,29 +55,6 @@ DEFAULT_EXTENSIONS_TO_IGNORE = [
|
|||
]
|
||||
|
||||
|
||||
class ArtifactNatureUndetected(ValueError):
    """Signal that the nature (tarball or plain file) of a remote artifact could
    not be detected."""
|
||||
|
||||
|
||||
class ArtifactNatureMistyped(ValueError):
    """Signal that a remote artifact is neither a tarball nor a file.

    Errors of this type are probably caused by a misconfiguration in the manifest
    generation that badly typed a vcs repository.
    """
|
||||
|
||||
|
||||
class ArtifactWithoutExtension(ValueError):
    """Signal that the nature of an artifact cannot be determined from its name."""
|
||||
|
||||
|
||||
class ChecksumLayout(Enum):
|
||||
"""The possible artifact types listed out of the manifest."""
|
||||
|
||||
|
@ -147,163 +127,6 @@ POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
|
|||
# Version-like string: optional leading "v"s, digits, a dot, then one or more digit
# groups each optionally followed by dots (e.g. "0.1.5" or "v1.2").
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
|
||||
|
||||
|
||||
def url_contains_tarball_filename(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Tell whether a parsed url references a filename ending with one of the
    given extensions.

    Every part of the url path and of each query parameter value is checked.

    A last path component that only looks like a version (e.g. ``0.1.5``) is not
    considered as having a real extension.

    Args:
        urlparsed: parsed url (as returned by ``urlparse``), its ``path`` and
            ``query`` attributes are read
        extensions: filename endings identifying a tarball
        raise_when_no_extension: whether to raise instead of returning False when
            no extension can be found at all

    Raises:
        ArtifactWithoutExtension in case no extension is available and
        raise_when_no_extension is True (the default)

    Returns:
        True if one path part ends with one of the extensions, False otherwise

    """
    suffixes = tuple(extensions)

    # Candidates are the url path itself, then every query parameter value
    candidates = [Path(urlparsed.path)]
    candidates.extend(Path(value) for _key, value in parse_qsl(urlparsed.query))

    for candidate in candidates:
        for part in candidate.parts:
            if part.endswith(suffixes):
                return True

    if raise_when_no_extension and all(
        candidate.suffix == "" for candidate in candidates
    ):
        raise ArtifactWithoutExtension

    # A version-only name (e.g. https://<netloc>/path/0.1.5) has a "suffix" which is
    # not an extension, so it must be handled apart
    last_component = Path(urlparsed.path).name
    if not PATTERN_VERSION.match(last_component):
        return False

    if raise_when_no_extension:
        raise ArtifactWithoutExtension
    return False
|
||||
|
||||
|
||||
def is_tarball(
    urls: List[str],
    request: Optional[Any] = None,
) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    This iterates over the urls to detect the artifact's nature and stops at the
    first url allowing a detection. When the answer cannot be deduced from a url
    alone and ``request`` is provided, an HTTP ``HEAD`` query is issued on that url
    and the ``Location``, ``Content-Type`` and ``Content-Disposition`` response
    headers are inspected, in that order.

    Args:
        urls: urls of the remote files to check for artifact nature.
        request: (Optional) object exposing a ``head(url)`` method used to issue
            the http calls. If not provided and the naive check cannot detect
            anything, this raises ArtifactNatureUndetected.

    Raises:
        ArtifactNatureUndetected when the artifact's nature cannot be detected out
          of its urls (this includes an empty ``urls`` list)
        ArtifactNatureMistyped when a url (or a ``Location`` header value) does not
          use the http, https or ftp scheme. It's up to the caller to do what's
          right with it.

    Returns: A tuple (bool, url). The boolean represents whether the url is an archive
        or not. The url is the first entry of ``urls``, except when the detection
        comes from a ``Location`` header, in which case it is the url that was
        queried.

    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            ArtifactWithoutExtension in case no extension is available
            ArtifactNatureMistyped when the url scheme is not http, https or ftp

        """
        urlparsed = urlparse(url)
        if urlparsed.scheme not in ("http", "https", "ftp"):
            raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
        return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)

    if not urls:
        # Without this guard, building the final error message would fail with an
        # IndexError on urls[0] instead of raising the documented exception.
        raise ArtifactNatureUndetected(
            "Cannot determine artifact type without any url"
        )

    origin = urls[0]

    # Check all urls and as soon as an url allows the nature detection, this stops.
    exceptions_to_raise = []
    for url in urls:
        try:
            return _is_tarball(url), origin
        except ArtifactWithoutExtension:
            if request is None:
                exc = ArtifactNatureUndetected(
                    f"Cannot determine artifact type from url <{url}>"
                )
                exceptions_to_raise.append(exc)
                continue

        logger.warning(
            "Cannot detect extension for <%s>. Fallback to http head query",
            url,
        )

        try:
            response = request.head(url)
        except (InvalidSchema, SSLError, ConnectionError):
            exc = ArtifactNatureUndetected(
                f"Cannot determine artifact type from url <{url}>"
            )
            exceptions_to_raise.append(exc)
            continue

        if not response.ok or response.status_code == 404:
            exc = ArtifactNatureUndetected(
                f"Cannot determine artifact type from url <{url}>"
            )
            exceptions_to_raise.append(exc)
            continue

        location = response.headers.get("Location")
        if location:  # It's not always present
            logger.debug("Location: %s", location)
            # NOTE(review): a relative Location has an empty scheme, so _is_tarball
            # reports it as mistyped; confirm whether it should first be resolved
            # against url.
            try:
                return _is_tarball(location), url
            except ArtifactWithoutExtension:
                logger.warning(
                    "Still cannot detect extension through location <%s>...",
                    location,
                )

        content_type = response.headers.get("Content-Type")
        if content_type:
            logger.debug("Content-Type: %s", content_type)
            if content_type == "application/json":
                return False, origin
            return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin

        content_disposition = response.headers.get("Content-Disposition")
        if content_disposition:
            logger.debug("Content-Disposition: %s", content_disposition)
            filename = None
            for field in content_disposition.split(";"):
                _, found, value = field.partition("filename=")
                if found:
                    # The value is commonly sent as a quoted string: drop the
                    # surrounding blanks and quotes so the extension can match
                    filename = value.strip().strip("\"'")
                    break

            if filename is not None:
                return (
                    url_contains_tarball_filename(
                        urlparse(filename),
                        TARBALL_EXTENSIONS,
                        raise_when_no_extension=False,
                    ),
                    origin,
                )

    if exceptions_to_raise:
        raise exceptions_to_raise[0]
    raise ArtifactNatureUndetected(
        f"Cannot determine artifact type from url <{origin}>"
    )
|
||||
|
||||
|
||||
VCS_KEYS_MAPPING = {
|
||||
"git": {
|
||||
"ref": "git_ref",
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# Copyright (C) 2022-2024 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -19,14 +19,16 @@ from swh.lister.nixguix.lister import (
|
|||
DEFAULT_EXTENSIONS_TO_IGNORE,
|
||||
POSSIBLE_TARBALL_MIMETYPES,
|
||||
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE,
|
||||
ArtifactNatureMistyped,
|
||||
ArtifactNatureUndetected,
|
||||
ArtifactWithoutExtension,
|
||||
NixGuixLister,
|
||||
is_tarball,
|
||||
url_contains_tarball_filename,
|
||||
)
|
||||
from swh.lister.pattern import ListerStats
|
||||
from swh.lister.utils import (
|
||||
ArtifactNatureMistyped,
|
||||
ArtifactNatureUndetected,
|
||||
ArtifactWithoutExtension,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -1,9 +1,20 @@
|
|||
# Copyright (C) 2018-2023 the Software Heritage developers
|
||||
# Copyright (C) 2018-2024 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from typing import Iterator, Optional, Tuple
|
||||
import urllib.parse
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any, Iterator, List, Optional, Tuple
|
||||
from urllib.parse import parse_qsl, urlparse
|
||||
|
||||
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
|
||||
|
||||
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
|
||||
from swh.lister import TARBALL_EXTENSIONS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]:
|
||||
|
@ -65,7 +76,7 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
|
|||
# Empty or None
|
||||
return False
|
||||
|
||||
parsed = urllib.parse.urlparse(url)
|
||||
parsed = urlparse(url)
|
||||
if not parsed.netloc:
|
||||
# Is parsed as a relative URL
|
||||
return False
|
||||
|
@ -75,3 +86,190 @@ def is_valid_origin_url(url: Optional[str]) -> bool:
|
|||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
class ArtifactNatureUndetected(ValueError):
    """Signal that the nature (tarball or plain file) of a remote artifact could
    not be detected."""
|
||||
|
||||
|
||||
class ArtifactNatureMistyped(ValueError):
    """Signal that a remote artifact is neither a tarball nor a file.

    Errors of this type are probably caused by a misconfiguration in the manifest
    generation that badly typed a vcs repository.
    """
|
||||
|
||||
|
||||
class ArtifactWithoutExtension(ValueError):
    """Signal that the nature of an artifact cannot be determined from its name."""
|
||||
|
||||
|
||||
# Rough approximation of what we can find of mimetypes for tarballs "out there"
|
||||
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
|
||||
|
||||
|
||||
# Version-like string: optional leading "v"s, digits, a dot, then one or more digit
# groups each optionally followed by dots (e.g. "0.1.5" or "v1.2").
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
|
||||
|
||||
|
||||
def url_contains_tarball_filename(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Tell whether a parsed url references a filename ending with one of the
    given extensions.

    Every part of the url path and of each query parameter value is checked.

    A last path component that only looks like a version (e.g. ``0.1.5``) is not
    considered as having a real extension.

    Args:
        urlparsed: parsed url (as returned by ``urlparse``), its ``path`` and
            ``query`` attributes are read
        extensions: filename endings identifying a tarball
        raise_when_no_extension: whether to raise instead of returning False when
            no extension can be found at all

    Raises:
        ArtifactWithoutExtension in case no extension is available and
        raise_when_no_extension is True (the default)

    Returns:
        True if one path part ends with one of the extensions, False otherwise

    """
    suffixes = tuple(extensions)

    # Candidates are the url path itself, then every query parameter value
    candidates = [Path(urlparsed.path)]
    candidates.extend(Path(value) for _key, value in parse_qsl(urlparsed.query))

    for candidate in candidates:
        for part in candidate.parts:
            if part.endswith(suffixes):
                return True

    if raise_when_no_extension and all(
        candidate.suffix == "" for candidate in candidates
    ):
        raise ArtifactWithoutExtension

    # A version-only name (e.g. https://<netloc>/path/0.1.5) has a "suffix" which is
    # not an extension, so it must be handled apart
    last_component = Path(urlparsed.path).name
    if not PATTERN_VERSION.match(last_component):
        return False

    if raise_when_no_extension:
        raise ArtifactWithoutExtension
    return False
|
||||
|
||||
|
||||
def is_tarball(
    urls: List[str],
    request: Optional[Any] = None,
) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    This iterates over the urls to detect the artifact's nature and stops at the
    first url allowing a detection. When the answer cannot be deduced from a url
    alone and ``request`` is provided, an HTTP ``HEAD`` query is issued on that url
    and the ``Location``, ``Content-Type`` and ``Content-Disposition`` response
    headers are inspected, in that order.

    Args:
        urls: urls of the remote files to check for artifact nature.
        request: (Optional) object exposing a ``head(url)`` method used to issue
            the http calls. If not provided and the naive check cannot detect
            anything, this raises ArtifactNatureUndetected.

    Raises:
        ArtifactNatureUndetected when the artifact's nature cannot be detected out
          of its urls (this includes an empty ``urls`` list)
        ArtifactNatureMistyped when a url (or a ``Location`` header value) does not
          use the http, https or ftp scheme. It's up to the caller to do what's
          right with it.

    Returns: A tuple (bool, url). The boolean represents whether the url is an archive
        or not. The url is the first entry of ``urls``, except when the detection
        comes from a ``Location`` header, in which case it is the url that was
        queried.

    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            ArtifactWithoutExtension in case no extension is available
            ArtifactNatureMistyped when the url scheme is not http, https or ftp

        """
        urlparsed = urlparse(url)
        if urlparsed.scheme not in ("http", "https", "ftp"):
            raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
        return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)

    if not urls:
        # Without this guard, building the final error message would fail with an
        # IndexError on urls[0] instead of raising the documented exception.
        raise ArtifactNatureUndetected(
            "Cannot determine artifact type without any url"
        )

    origin = urls[0]

    # Check all urls and as soon as an url allows the nature detection, this stops.
    exceptions_to_raise = []
    for url in urls:
        try:
            return _is_tarball(url), origin
        except ArtifactWithoutExtension:
            if request is None:
                exc = ArtifactNatureUndetected(
                    f"Cannot determine artifact type from url <{url}>"
                )
                exceptions_to_raise.append(exc)
                continue

        logger.warning(
            "Cannot detect extension for <%s>. Fallback to http head query",
            url,
        )

        try:
            response = request.head(url)
        except (InvalidSchema, SSLError, ConnectionError):
            exc = ArtifactNatureUndetected(
                f"Cannot determine artifact type from url <{url}>"
            )
            exceptions_to_raise.append(exc)
            continue

        if not response.ok or response.status_code == 404:
            exc = ArtifactNatureUndetected(
                f"Cannot determine artifact type from url <{url}>"
            )
            exceptions_to_raise.append(exc)
            continue

        location = response.headers.get("Location")
        if location:  # It's not always present
            logger.debug("Location: %s", location)
            # NOTE(review): a relative Location has an empty scheme, so _is_tarball
            # reports it as mistyped; confirm whether it should first be resolved
            # against url.
            try:
                return _is_tarball(location), url
            except ArtifactWithoutExtension:
                logger.warning(
                    "Still cannot detect extension through location <%s>...",
                    location,
                )

        content_type = response.headers.get("Content-Type")
        if content_type:
            logger.debug("Content-Type: %s", content_type)
            if content_type == "application/json":
                return False, origin
            return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin

        content_disposition = response.headers.get("Content-Disposition")
        if content_disposition:
            logger.debug("Content-Disposition: %s", content_disposition)
            filename = None
            for field in content_disposition.split(";"):
                _, found, value = field.partition("filename=")
                if found:
                    # The value is commonly sent as a quoted string: drop the
                    # surrounding blanks and quotes so the extension can match
                    filename = value.strip().strip("\"'")
                    break

            if filename is not None:
                return (
                    url_contains_tarball_filename(
                        urlparse(filename),
                        TARBALL_EXTENSIONS,
                        raise_when_no_extension=False,
                    ),
                    origin,
                )

    if exceptions_to_raise:
        raise exceptions_to_raise[0]
    raise ArtifactNatureUndetected(
        f"Cannot determine artifact type from url <{origin}>"
    )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue