lister/nixguix: Make artifact nature check happen on all urls
The check starts with the first url: as soon as one detection succeeds, it stops and yields the result; otherwise, it continues the detection on the next mirror url. This should fix the current misbehavior [1] where some mirror urls are not ok but the first one is.

[1] https://gitlab.softwareheritage.org/swh/infra/sysadm-environment/-/issues/4868#note_137483

Refs. swh/infra/sysadm-environment#4868
This commit is contained in:
parent bf826618b4
commit 5ebc57912f
4 changed files with 100 additions and 69 deletions
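For illustration only, the control flow described in the commit message amounts to a "first successful detection wins" loop over the mirror urls. This is a minimal, self-contained sketch of that pattern, not the lister code itself; detect_nature and the ValueError stand in for the real per-url check and its ArtifactNatureUndetected exception:

from typing import Callable, List, Tuple


def detect_on_first_working_url(
    urls: List[str], detect_nature: Callable[[str], Tuple[bool, str]]
) -> Tuple[bool, str]:
    """Run the detection on each url in turn and stop at the first success.

    Failures are collected so that, when no url works, the first one can be
    re-raised instead of being lost.
    """
    failures: List[Exception] = []
    for url in urls:
        try:
            # First successful detection wins.
            return detect_nature(url)
        except ValueError as exc:  # stand-in for ArtifactNatureUndetected
            failures.append(exc)
            continue
    if failures:
        raise failures[0]
    raise ValueError("no url provided")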
@@ -168,19 +168,28 @@ def url_endswith(
     return False
 
 
-def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
-    """Determine whether a list of files actually are tarballs or simple files.
+def is_tarball(
+    urls: List[str],
+    request: Optional[Any] = None,
+) -> Tuple[bool, str]:
+    """Determine whether a list of files actually are tarball or simple files.
 
-    When this cannot be answered simply out of the url, when request is provided, this
+    This iterates over the list of urls provided to detect the artifact's nature. When
+    this cannot be answered simply out of the url and ``request`` is provided, this
     executes a HTTP `HEAD` query on the url to determine the information. If request is
     not provided, this raises an ArtifactNatureUndetected exception.
 
+    If, at the end of the iteration on the urls, no detection could be deduced, this
+    raises an ArtifactNatureUndetected.
+
     Args:
-        urls: name of the remote files for which the extension needs to be checked.
+        urls: name of the remote files to check for artifact nature.
+        request: (Optional) Request object allowing http calls. If not provided and
+            naive check cannot detect anything, this raises ArtifactNatureUndetected.
 
     Raises:
         ArtifactNatureUndetected when the artifact's nature cannot be detected out
-            of its url
+            of its urls
         ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
            the caller to do what's right with it.
 
@@ -202,76 +211,86 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
             raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
         return url_endswith(urlparsed, TARBALL_EXTENSIONS)
 
-    index = random.randrange(len(urls))
-    url = urls[index]
-
-    try:
-        return _is_tarball(url), urls[0]
-    except ArtifactWithoutExtension:
-        if request is None:
-            raise ArtifactNatureUndetected(
-                f"Cannot determine artifact type from url <{url}>"
-            )
-        logger.warning(
-            "Cannot detect extension for <%s>. Fallback to http head query",
-            url,
-        )
-
+    # Check all urls and as soon as an url allows the nature detection, this stops.
+    exceptions_to_raise = []
+    for url in urls:
         try:
-            response = request.head(url)
-        except (InvalidSchema, SSLError, ConnectionError):
-            raise ArtifactNatureUndetected(
-                f"Cannot determine artifact type from url <{url}>"
+            return _is_tarball(url), urls[0]
+        except ArtifactWithoutExtension:
+            if request is None:
+                exc = ArtifactNatureUndetected(
+                    f"Cannot determine artifact type from url <{url}>"
+                )
+                exceptions_to_raise.append(exc)
+                continue
+
+            logger.warning(
+                "Cannot detect extension for <%s>. Fallback to http head query",
+                url,
            )
 
-        if not response.ok or response.status_code == 404:
-            raise ArtifactNatureUndetected(
-                f"Cannot determine artifact type from url <{url}>"
-            )
-        location = response.headers.get("Location")
-        if location:  # It's not always present
-            logger.debug("Location: %s", location)
            try:
-                # FIXME: location is also returned as it's considered the true origin,
-                # true enough?
-                return _is_tarball(location), location
-            except ArtifactWithoutExtension:
-                logger.warning(
-                    "Still cannot detect extension through location <%s>...",
-                    url,
+                response = request.head(url)
+            except (InvalidSchema, SSLError, ConnectionError):
+                exc = ArtifactNatureUndetected(
+                    f"Cannot determine artifact type from url <{url}>"
                )
+                exceptions_to_raise.append(exc)
+                continue
 
-        origin = urls[0]
-
-        content_type = response.headers.get("Content-Type")
-        if content_type:
-            logger.debug("Content-Type: %s", content_type)
-            if content_type == "application/json":
-                return False, origin
-            return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
-
-        content_disposition = response.headers.get("Content-Disposition")
-        if content_disposition:
-            logger.debug("Content-Disposition: %s", content_disposition)
-            if "filename=" in content_disposition:
-                fields = content_disposition.split("; ")
-                for field in fields:
-                    if "filename=" in field:
-                        _, filename = field.split("filename=")
-                        break
-
-                return (
-                    url_endswith(
-                        urlparse(filename),
-                        TARBALL_EXTENSIONS,
-                        raise_when_no_extension=False,
-                    ),
-                    origin,
+            if not response.ok or response.status_code == 404:
+                exc = ArtifactNatureUndetected(
+                    f"Cannot determine artifact type from url <{url}>"
                )
+                exceptions_to_raise.append(exc)
+                continue
 
-    raise ArtifactNatureUndetected(
-        f"Cannot determine artifact type from url <{url}>"
-    )
+            location = response.headers.get("Location")
+            if location:  # It's not always present
+                logger.debug("Location: %s", location)
+                try:
+                    # FIXME: location is also returned as it's considered the true
+                    # origin, true enough?
+                    return _is_tarball(location), location
+                except ArtifactWithoutExtension:
+                    logger.warning(
+                        "Still cannot detect extension through location <%s>...",
+                        url,
+                    )
+
+            origin = urls[0]
+
+            content_type = response.headers.get("Content-Type")
+            if content_type:
+                logger.debug("Content-Type: %s", content_type)
+                if content_type == "application/json":
+                    return False, origin
+                return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
+
+            content_disposition = response.headers.get("Content-Disposition")
+            if content_disposition:
+                logger.debug("Content-Disposition: %s", content_disposition)
+                if "filename=" in content_disposition:
+                    fields = content_disposition.split("; ")
+                    for field in fields:
+                        if "filename=" in field:
+                            _, filename = field.split("filename=")
+                            break
+
+                    return (
+                        url_endswith(
+                            urlparse(filename),
+                            TARBALL_EXTENSIONS,
+                            raise_when_no_extension=False,
+                        ),
+                        origin,
+                    )
+
+    if len(exceptions_to_raise) > 0:
+        raise exceptions_to_raise[0]
+    raise ArtifactNatureUndetected(
+        f"Cannot determine artifact type from url <{urls[0]}>"
+    )
 
 
 VCS_KEYS_MAPPING = {
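As a usage sketch only: the first url below comes from the test data added further down, while the second url and the session setup are made up for illustration, and this assumes is_tarball is exposed by the nixguix lister module.

import requests

from swh.lister.nixguix.lister import is_tarball

session = requests.Session()
# The first mirror url has no recognizable extension and requests cannot HEAD
# an ftp url, so that attempt is recorded as a failure and the loop moves on;
# the second url ends with .tar.gz, so its nature is detected from the name.
is_tar, origin = is_tarball(
    [
        "ftp://example.mirror.org/extensionless-file",
        "https://example.org/pkg-1.0.tar.gz",  # hypothetical mirror
    ],
    session,
)
assert is_tar is True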
@@ -438,6 +457,7 @@ class NixGuixLister(StatelessLister[PageResult]):
                     # We'll deal with outputHash as integrity field
                     integrity = outputHash
 
+                # Checks urls for the artifact nature of the origin
                 try:
                     is_tar, origin = is_tarball(urls, self.session)
                 except ArtifactNatureMistyped:
@@ -16,6 +16,7 @@
     {
       "type": "url",
       "urls": [
+        "ftp://example.mirror.org/extensionless-file",
         "ftp://ftp.ourproject.org/file-with-no-extension"
       ],
       "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
@@ -25,6 +25,7 @@
     {
       "type": "url",
       "urls": [
+        "ftp://ftp.ourproject.org/pub/ytalk",
         "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
       ],
       "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
@@ -253,6 +253,11 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
             "filename=fifengine-0.4.2.tar.gz; other=stuff",
         },
     )
+    # The url will not succeed, so the other url will be fetched
+    requests_mock.head(
+        "ftp://ftp.ourproject.org/pub/ytalk",
+        status_code=404,
+    )
 
     expected_visit_types = defaultdict(int)
     # origin upstream is added as origin
@@ -335,8 +340,12 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
         "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
         status_code=404,
     )
-    # Invalid schema for that origin (and no extension), so skip origin
-    # from its name
+    # Either not found or invalid schema for that origin (and no extension), so skip
+    # origin from its name
+    requests_mock.head(
+        "ftp://example.mirror.org/extensionless-file",
+        status_code=404,
+    )
     requests_mock.head(
         "ftp://ftp.ourproject.org/file-with-no-extension",
         exc=InvalidSchema,