lister/nixguix: Make artifact nature check happen on all urls

Starting with the first url. As soon as one detection succeeds, this stops and yields
the result. Otherwise, continue with the detection on the next mirror url.

This should fix the current misbehavior [1] when multiple mirror urls are not ok but the
first one is.

[1] https://gitlab.softwareheritage.org/swh/infra/sysadm-environment/-/issues/4868#note_137483

Refs. swh/infra/sysadm-environment#4868
This commit is contained in:
Antoine R. Dumont (@ardumont) 2023-04-27 12:11:42 +02:00
parent bf826618b4
commit 5ebc57912f
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
4 changed files with 100 additions and 69 deletions

View file

@ -168,19 +168,28 @@ def url_endswith(
return False
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarballs or simple files.
def is_tarball(
urls: List[str],
request: Optional[Any] = None,
) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarball or simple files.
When this cannot be answered simply out of the url, when request is provided, this
This iterates over the list of urls provided to detect the artifact's nature. When
this cannot be answered simply out of the url and ``request`` is provided, this
executes a HTTP `HEAD` query on the url to determine the information. If request is
not provided, this raises an ArtifactNatureUndetected exception.
If, at the end of the iteration on the urls, no detection could be deduced, this
raises an ArtifactNatureUndetected.
Args:
urls: name of the remote files for which the extension needs to be checked.
urls: name of the remote files to check for artifact nature.
request: (Optional) Request object allowing http calls. If not provided and
naive check cannot detect anything, this raises ArtifactNatureUndetected.
Raises:
ArtifactNatureUndetected when the artifact's nature cannot be detected out
of its url
of its urls
ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
the caller to do what's right with it.
@ -202,76 +211,86 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
index = random.randrange(len(urls))
url = urls[index]
try:
return _is_tarball(url), urls[0]
except ArtifactWithoutExtension:
if request is None:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
logger.warning(
"Cannot detect extension for <%s>. Fallback to http head query",
url,
)
# Check all urls and as soon as an url allows the nature detection, this stops.
exceptions_to_raise = []
for url in urls:
try:
response = request.head(url)
except (InvalidSchema, SSLError, ConnectionError):
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
return _is_tarball(url), urls[0]
except ArtifactWithoutExtension:
if request is None:
exc = ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
exceptions_to_raise.append(exc)
continue
logger.warning(
"Cannot detect extension for <%s>. Fallback to http head query",
url,
)
if not response.ok or response.status_code == 404:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
logger.debug("Location: %s", location)
try:
# FIXME: location is also returned as it's considered the true origin,
# true enough?
return _is_tarball(location), location
except ArtifactWithoutExtension:
logger.warning(
"Still cannot detect extension through location <%s>...",
url,
response = request.head(url)
except (InvalidSchema, SSLError, ConnectionError):
exc = ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
exceptions_to_raise.append(exc)
continue
origin = urls[0]
content_type = response.headers.get("Content-Type")
if content_type:
logger.debug("Content-Type: %s", content_type)
if content_type == "application/json":
return False, origin
return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
content_disposition = response.headers.get("Content-Disposition")
if content_disposition:
logger.debug("Content-Disposition: %s", content_disposition)
if "filename=" in content_disposition:
fields = content_disposition.split("; ")
for field in fields:
if "filename=" in field:
_, filename = field.split("filename=")
break
return (
url_endswith(
urlparse(filename),
TARBALL_EXTENSIONS,
raise_when_no_extension=False,
),
origin,
if not response.ok or response.status_code == 404:
exc = ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
exceptions_to_raise.append(exc)
continue
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
logger.debug("Location: %s", location)
try:
# FIXME: location is also returned as it's considered the true
# origin, true enough?
return _is_tarball(location), location
except ArtifactWithoutExtension:
logger.warning(
"Still cannot detect extension through location <%s>...",
url,
)
origin = urls[0]
content_type = response.headers.get("Content-Type")
if content_type:
logger.debug("Content-Type: %s", content_type)
if content_type == "application/json":
return False, origin
return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
content_disposition = response.headers.get("Content-Disposition")
if content_disposition:
logger.debug("Content-Disposition: %s", content_disposition)
if "filename=" in content_disposition:
fields = content_disposition.split("; ")
for field in fields:
if "filename=" in field:
_, filename = field.split("filename=")
break
return (
url_endswith(
urlparse(filename),
TARBALL_EXTENSIONS,
raise_when_no_extension=False,
),
origin,
)
if len(exceptions_to_raise) > 0:
raise exceptions_to_raise[0]
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{urls[0]}>"
)
VCS_KEYS_MAPPING = {
@ -438,6 +457,7 @@ class NixGuixLister(StatelessLister[PageResult]):
# We'll deal with outputHash as integrity field
integrity = outputHash
# Checks urls for the artifact nature of the origin
try:
is_tar, origin = is_tarball(urls, self.session)
except ArtifactNatureMistyped:

View file

@ -16,6 +16,7 @@
{
"type": "url",
"urls": [
"ftp://example.mirror.org/extensionless-file",
"ftp://ftp.ourproject.org/file-with-no-extension"
],
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="

View file

@ -25,6 +25,7 @@
{
"type": "url",
"urls": [
"ftp://ftp.ourproject.org/pub/ytalk",
"ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
],
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="

View file

@ -253,6 +253,11 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"filename=fifengine-0.4.2.tar.gz; other=stuff",
},
)
# The url will not succeed, so the other url will be fetched
requests_mock.head(
"ftp://ftp.ourproject.org/pub/ytalk",
status_code=404,
)
expected_visit_types = defaultdict(int)
# origin upstream is added as origin
@ -335,8 +340,12 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
status_code=404,
)
# Invalid schema for that origin (and no extension), so skip origin
# from its name
# Either not found or invalid schema for that origin (and no extension), so skip
# origin from its name
requests_mock.head(
"ftp://example.mirror.org/extensionless-file",
status_code=404,
)
requests_mock.head(
"ftp://ftp.ourproject.org/file-with-no-extension",
exc=InvalidSchema,