nixguix: Further improve tarball detection

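The current content-type detection was slightly off, mostly for responses whose
Content-Type header includes a charset parameter (e.g. "application/gzip;
charset=ISO-8859-1"). This commit fixes it by checking whether the header value
starts with a known tarball mimetype instead of requiring an exact match.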

Related to T3781
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-10-05 11:11:08 +02:00
parent ff80a91f0a
commit 2ee103e2bc
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
3 changed files with 13 additions and 2 deletions

View file

@ -121,7 +121,7 @@ PageResult = Tuple[ArtifactType, Union[Artifact, VCS]]
VCS_SUPPORTED = ("git", "svn", "hg")
# Rough approximation of what we can find of mimetypes for tarballs "out there"
POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
@ -218,7 +218,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
logger.debug("Content-Type: %s", content_type)
if content_type == "application/json":
return False, urls[0]
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), urls[0]
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"

View file

@ -79,6 +79,13 @@
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
"svn_revision": 39057
},
{
"type": "url",
"urls": [
"http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz"
],
"integrity": "sha256-G/7oY5qdCSJ59VlwHtIbvMdT6+mriXhMqQIHNx65J+E="
},
{
"type": "url",
"urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],

View file

@ -173,6 +173,10 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"https://api.github.com/repos/trie/trie",
[{"json": {"html_url": "https://github.com/trie/trie.git"}}],
)
requests_mock.head(
"http://git.marmaro.de/?p=mmh;a=snapshot;h=431604647f89d5aac7b199a7883e98e56e4ccf9e;sf=tgz",
headers={"Content-Type": "application/gzip; charset=ISO-8859-1"},
)
expected_visit_types = defaultdict(int)
# origin upstream is added as origin