nixguix: Add lister
Related to T3781
parent fa1205c4df
commit fbfdf88ea4
9 changed files with 724 additions and 18 deletions
12
swh/lister/nixguix/__init__.py
Normal file
@@ -0,0 +1,12 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


def register():
    from .lister import NixGuixLister

    return {
        "lister": NixGuixLister,
        "task_modules": ["%s.tasks" % __name__],
    }
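
# A sketch of how this hook is consumed (an assumption based on how other swh
# listers are wired, not part of this commit): swh.lister discovers listers
# through packaging entry points (e.g. the "swh.workers" group pointing at this
# register function), then calls register() to obtain the lister class and the
# celery task modules to load.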
370
swh/lister/nixguix/lister.py
Normal file
@@ -0,0 +1,370 @@
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""NixGuix lister definition.

This lists artifacts out of Guix or Nixpkgs manifests.

Artifacts can be of types:
- upstream git repository (NixOS/nixpkgs, Guix)
- VCS repositories (svn, git, hg, ...)
- unique file
- unique tarball

"""
import base64
from dataclasses import dataclass
from enum import Enum
import logging
from pathlib import Path
import random
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import urlparse

import requests

from swh.core.github.utils import GitHubSession
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)


class ArtifactNatureUndetected(ValueError):
    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""

    pass


@dataclass
class OriginUpstream:
    """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""

    origin: str
    """Canonical url of the repository"""
    version: int
    """Version of the repository (dismissed?)"""
    revision: str
    """Revision of the repository (dismissed?)"""


@dataclass
class Artifact:
    """Metadata information on a remote artifact with url (tarball or file)."""

    origin: str
    """Canonical url to retrieve the tarball artifact."""
    visit_type: str
    """Either 'tar' or 'file' """
    fallback_urls: List[str]
    """List of urls to retrieve the tarball artifact if the canonical url no longer works."""
    checksums: Dict[str, str]
    """Integrity hash converted into a checksum dict."""


@dataclass
class VCS:
    """Metadata information on a VCS."""

    origin: str
    """Origin url of the vcs"""
    ref: Optional[str]
    """Reference: either a svn commit id, a git commit, ..."""
    type: str
    """Type of (d)vcs, e.g. svn, git, hg, ..."""


class ArtifactType(Enum):
    """The possible artifact types listed out of the manifest."""

    ARTIFACT = "artifact"
    ORIGIN = "origin"
    VCS = "vcs"


PageResult = Tuple[ArtifactType, Union[Artifact, VCS, OriginUpstream]]


VCS_SUPPORTED = ("git", "svn", "hg")

# Rough approximation of what we can find of mimetypes for tarballs "out there"
POSSIBLE_TARBALL_MIMETYPES = set(MIMETYPE_TO_ARCHIVE_FORMAT.keys())


def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    When this cannot be answered simply out of the url and a request object is
    provided, this executes an HTTP `HEAD` query on the url to determine the
    information. If no request object is provided, this raises an
    ArtifactNatureUndetected exception.

    Args:
        urls: names of the remote files for which the extension needs to be checked.
        request: object with a `head` method (e.g. a requests session), used as a
            fallback when the url alone is not enough.

    Raises:
        ArtifactNatureUndetected when the artifact's nature cannot be detected out
            of its url

    Returns:
        A tuple (bool, url). The boolean states whether the url points to an
        archive. The second value is the effective url: either the canonical one,
        or the redirect location when a HEAD request had to be issued to settle
        the question.

    """

    def _is_tarball(url):
        """Determine out of an extension whether url is a tarball.

        Raises:
            IndexError in case no extension is available

        """
        return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS

    index = random.randrange(len(urls))
    url = urls[index]
    try:
        is_tar = _is_tarball(url)
        return is_tar, urls[0]
    except IndexError:
        if request is None:
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )
        logger.warning(
            "Cannot detect extension for '%s'. Fallback to http head query",
            url,
        )
        response = request.head(url)

        if not response.ok or response.status_code == 404:
            raise ArtifactNatureUndetected(
                f"Cannot determine artifact type from url {url}"
            )
        location = response.headers.get("Location")
        if location:  # It's not always present
            logger.debug("Location: %s", location)
            try:
                # FIXME: location is also returned as it's considered the true origin,
                # true enough?
                return _is_tarball(location), location
            except IndexError:
                logger.warning(
                    "Still cannot detect extension through location '%s'...",
                    url,
                )

        content_type = response.headers.get("Content-Type")
        if content_type:
            logger.debug("Content-Type: %s", content_type)
            if content_type == "application/json":
                return False, urls[0]
            return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]

        raise ArtifactNatureUndetected(f"Cannot determine artifact type from url {url}")


VCS_KEYS_MAPPING = {
    "git": {
        "ref": "git_ref",
        "url": "git_url",
    },
    "svn": {
        "ref": "svn_revision",
        "url": "svn_url",
    },
    "hg": {
        "ref": "hg_changeset",
        "url": "hg_url",
    },
}
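
# For example, a manifest entry (from the test data in this commit)
#   {"type": "git", "git_url": "https://example.org/pali/0xffff", "git_ref": "0.9"}
# resolves through VCS_KEYS_MAPPING["git"] to the url
# "https://example.org/pali/0xffff" and the ref "0.9".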


class NixGuixLister(StatelessLister[PageResult]):
    """List Guix or Nix sources out of a public json manifest.

    This lister can output:
    - unique tarballs (.tar.gz, .tbz2, ...)
    - vcs repositories (e.g. git, hg, svn)
    - unique files (.lisp, .py, ...)

    Note that no `last_update` is available in either manifest.

    For `url` type artifacts, this tries to determine the artifact's nature,
    tarball or file. It first tries to compute it out of the "url" extension. In
    case of no extension, it falls back to querying (HEAD) the url to retrieve
    the origin out of the `Location` response header, and then checks the
    extension again.

    """

    LISTER_NAME = "nixguix"

    def __init__(
        self,
        scheduler,
        url: str,
        origin_upstream: str,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        # canonicalize urls, can be turned off during docker runs
        canonicalize: bool = True,
        **kwargs: Any,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url.rstrip("/"),
            instance=instance,
            credentials=credentials,
        )
        # either the full fqdn NixOS/nixpkgs or guix repository urls
        # maybe add an assert on those specific urls?
        self.origin_upstream = origin_upstream

        self.session = requests.Session()
        # for testing purposes, we may want to skip this step (e.g. docker run and rate
        # limit)
        self.github_session = (
            GitHubSession(
                credentials=self.credentials,
                user_agent=str(self.session.headers["User-Agent"]),
            )
            if canonicalize
            else None
        )

    def get_pages(self) -> Iterator[PageResult]:
        """Yield one page per "typed" origin referenced in the manifest."""
        # fetch and parse the manifest...
        response = self.http_request(self.url)

        # ... if any
        raw_data = response.json()
        version = raw_data["version"]
        revision = raw_data["revision"]
        yield ArtifactType.ORIGIN, OriginUpstream(
            self.origin_upstream,
            version,
            revision,
        )

        # grep '"type"' guix-sources.json | sort | uniq
        # "type": false          <<<<<<<<< noise
        # "type": "git",
        # "type": "hg",
        # "type": "no-origin",   <<<<<<<<< noise
        # "type": "svn",
        # "type": "url",

        # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq
        # "type": "url",

        for artifact in raw_data["sources"]:
            artifact_type = artifact["type"]
            if artifact_type in VCS_SUPPORTED:
                plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]]
                plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]]
                artifact_url = (
                    self.github_session.get_canonical_url(plain_url)
                    if self.github_session
                    else plain_url
                )
                if not artifact_url:
                    continue
                yield ArtifactType.VCS, VCS(
                    origin=artifact_url, type=artifact_type, ref=plain_ref
                )
            elif artifact_type == "url":
                # It's either a tarball or a file
                urls = artifact.get("urls")
                if not urls:
                    # Nothing to fetch
                    logger.warning("Skipping url '%s': empty artifact", artifact)
                    continue

                assert urls is not None
                # FIXME: T3294: Fix missing scheme in urls
                origin, *fallback_urls = urls

                integrity = artifact.get("integrity")
                if integrity is None:
                    logger.warning("Skipping url '%s': missing integrity field", origin)
                    continue

                try:
                    is_tar, origin = is_tarball(urls, self.session)
                except ArtifactNatureUndetected:
                    logger.warning(
                        "Skipping url '%s': undetected remote artifact type", origin
                    )
                    continue

                # Determine the content checksum stored in the integrity field and
                # convert it into a dict of checksums. This only parses the
                # `hash-expression` (hash-<b64-encoded-checksum>) as defined in
                # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute
                chksum_algo, chksum_b64 = integrity.split("-")
                checksums: Dict[str, str] = {
                    chksum_algo: base64.decodebytes(chksum_b64.encode()).hex()
                }
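                # For instance, the integrity value
                # "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM=" (from the
                # test data) splits into the algorithm "sha256" and a base64
                # payload, whose decoded bytes end up hex-encoded under
                # checksums["sha256"].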

                logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
                yield ArtifactType.ARTIFACT, Artifact(
                    origin=origin,
                    fallback_urls=fallback_urls,
                    checksums=checksums,
                    visit_type="directory" if is_tar else "content",
                )
            else:
                logger.warning(
                    "Skipping artifact '%s': unsupported type %s",
                    artifact,
                    artifact_type,
                )

    def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]:
        """Given a vcs repository, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...)?
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=artifact.origin,
            visit_type=artifact.type,
        )

    def origin_to_listed_origin(
        self, origin_upstream: OriginUpstream
    ) -> Iterator[ListedOrigin]:
        """Given an upstream origin, yield a ListedOrigin."""
        assert self.lister_obj.id is not None
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_upstream.origin,
            visit_type="git",  # both nixpkgs and guix are git origins so far
        )

    def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]:
        """Given an artifact (tarball, file), yield one ListedOrigin."""
        assert self.lister_obj.id is not None
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=artifact.origin,
            visit_type=artifact.visit_type,
            extra_loader_arguments={
                "checksums": artifact.checksums,
                "fallback_urls": artifact.fallback_urls,
            },
        )

    def get_origins_from_page(
        self, artifact_tuple: PageResult
    ) -> Iterator[ListedOrigin]:
        """Given an artifact tuple (type, artifact), yield a ListedOrigin."""
        artifact_type, artifact = artifact_tuple
        mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin")
        yield from mapping_type_fn(artifact)
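        # Dispatch note: the enum value serves as a method-name prefix, so
        # ArtifactType.VCS routes to vcs_to_listed_origin, ArtifactType.ORIGIN
        # to origin_to_listed_origin, and ArtifactType.ARTIFACT to
        # artifact_to_listed_origin; the methods above must stay in sync with
        # the ArtifactType values.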
0
swh/lister/nixguix/tests/__init__.py
Normal file
19
swh/lister/nixguix/tests/data/guix-swh_sources.json
Normal file
@@ -0,0 +1,19 @@
{
  "sources": [
    {"type": "git", "git_url": "", "git_ref": ""},
    {"type": false},
    {"type": "no-origin"},
    {"type": "url", "urls": []},
    {
      "type": "url",
      "urls": ["https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped"],
      "integrity": "sha256-HW6jxFlbljY8E5Q0l9s0r0Rg+0dKlcQ/REatNBuMl4U="
    },
    {
      "type": "url",
      "urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
    }
  ],
  "version": "1",
  "revision": "ab59155c5a38dda7efaceb47c7528578fcf0def4"
}
52
swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
Normal file
@@ -0,0 +1,52 @@
{
  "sources": [
    {
      "type": "url",
      "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
    },
    {
      "type": "url",
      "urls": [ "https://github.com/owner-3/repository-1/revision-1.tgz" ],
      "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
    },
    {
      "type": "url",
      "urls": [ "https://example.com/file.txt" ],
      "integrity": "sha256-Q0copBCnj1b8G1iZw1k0NuYasMcx6QctleltspAgXlM="
    },
    {
      "type": "url",
      "urls": [
        "https://releases.wildfiregames.com/0ad-0.0.25b-alpha-unix-build.tar.xz"
      ],
      "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
    },
    {
      "type": "url",
      "urls": [
        "http://downloads.sourceforge.net/project/nmon/lmon16n.c",
        "http://ufpr.dl.sourceforge.net/project/nmon/lmon16n.c",
        "http://netassist.dl.sourceforge.net/project/nmon/lmon16n.c"
      ],
      "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
    },
    {
      "type": "git",
      "git_url": "https://example.org/pali/0xffff",
      "git_ref": "0.9"
    },
    {
      "type": "hg",
      "hg_url": "https://example.org/vityok/cl-string-match",
      "hg_changeset": "5048480a61243e6f1b02884012c8f25cdbee6d97"
    },
    {
      "type": "svn",
      "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
      "svn_revision": 39057
    }
  ],
  "version": "1",
  "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
}
244
swh/lister/nixguix/tests/test_lister.py
Normal file
@@ -0,0 +1,244 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict
import json
import logging
from pathlib import Path
from typing import Any, Dict

import pytest
import requests

from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
    POSSIBLE_TARBALL_MIMETYPES,
    ArtifactNatureUndetected,
    NixGuixLister,
    is_tarball,
)
from swh.lister.pattern import ListerStats

logger = logging.getLogger(__name__)


def page_response(datadir, instance: str) -> Dict[str, Any]:
    """Return the manifest data (out of the test dataset)"""
    datapath = Path(datadir, f"{instance}-swh_sources.json")
    return json.loads(datapath.read_text()) if datapath.exists() else {}


@pytest.mark.parametrize(
    "urls",
    [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
    + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
def test_is_tarball_simple(urls):
    """A simple check on the url should discriminate between tarballs and files"""
    is_tar, origin = is_tarball(urls)
    assert is_tar is True
    assert origin == urls[0]


@pytest.mark.parametrize(
    "urls",
    [
        ["abc.lisp"],
        ["one.abc", "two.bcd"],
        ["abc.c", "other.c"],
        ["one.scm?foo=bar", "two.scm?foo=bar"],
        ["config.nix", "flakes.nix"],
    ],
)
def test_is_tarball_simple_not_tarball(urls):
    """A simple check on the url should discriminate between tarballs and files"""
    is_tar, origin = is_tarball(urls)
    assert is_tar is False
    assert origin == urls[0]


def test_is_tarball_complex_with_no_result(requests_mock):
    """Complex tarball detection without proper information should fail."""
    # No extension, this won't immediately detect the nature of the url
    url = "https://example.org/crates/package/download"
    urls = [url]
    with pytest.raises(ArtifactNatureUndetected):
        is_tarball(urls)  # no request parameter, cannot fall back, raises

    with pytest.raises(ArtifactNatureUndetected):
        requests_mock.head(
            url,
            status_code=404,  # not found, so nothing can be detected
        )
        is_tarball(urls, requests)

    with pytest.raises(ArtifactNatureUndetected):
        requests_mock.head(
            url, headers={}
        )  # response ok without headers, cannot detect anything
        is_tarball(urls, requests)

    with pytest.raises(ArtifactNatureUndetected):
        fallback_url = "https://example.org/mirror/crates/package/download"
        requests_mock.head(
            url, headers={"location": fallback_url}  # still no extension, cannot detect
        )
        is_tarball(urls, requests)


@pytest.mark.parametrize(
    "fallback_url, expected_result",
    [
        ("https://example.org/mirror/crates/package/download.tar.gz", True),
        ("https://example.org/mirror/package/download.lisp", False),
    ],
)
def test_is_tarball_complex_with_location_result(
    requests_mock, fallback_url, expected_result
):
    """Complex tarball detection with a Location header should detect the artifact nature"""
    # No extension, this won't immediately detect the nature of the url
    url = "https://example.org/crates/package/download"
    urls = [url]

    # Scenario where the url redirects to a location with a proper extension
    requests_mock.head(url, headers={"location": fallback_url})
    is_tar, origin = is_tarball(urls, requests)
    assert is_tar == expected_result
    if is_tar:
        assert origin == fallback_url


@pytest.mark.parametrize(
    "content_type, expected_result",
    [("application/json", False), ("application/something", False)]
    + [(ext, True) for ext in POSSIBLE_TARBALL_MIMETYPES],
)
def test_is_tarball_complex_with_content_type_result(
    requests_mock, content_type, expected_result
):
    """Complex tarball detection with a Content-Type header should detect the artifact nature"""
    # No extension, this won't immediately detect the nature of the url
    url = "https://example.org/crates/package/download"
    urls = [url]

    # Scenario where the response carries a Content-Type header
    requests_mock.head(url, headers={"Content-Type": content_type})
    is_tar, origin = is_tarball(urls, requests)
    assert is_tar == expected_result
    if is_tar:
        assert origin == url


def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
    """NixGuixLister should list all origins per visit type"""
    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
    origin_upstream = "https://github.com/NixOS/nixpkgs"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    response = page_response(datadir, "nixpkgs")
    requests_mock.get(
        url,
        [{"json": response}],
    )

    expected_visit_types = defaultdict(int)
    # origin upstream is added as origin
    expected_nb_origins = 1
    expected_visit_types["git"] += 1
    for artifact in response["sources"]:
        # Each artifact is considered an origin (even "url" artifacts with mirror urls)
        expected_nb_origins += 1
        artifact_type = artifact["type"]
        if artifact_type in [
            "git",
            "svn",
            "hg",
        ]:
            expected_visit_types[artifact_type] += 1
        elif artifact_type == "url":
            url = artifact["urls"][0]
            if url.endswith(".c") or url.endswith(".txt"):
                expected_visit_types["content"] += 1
            else:
                expected_visit_types["directory"] += 1

    assert set(expected_visit_types.keys()) == {
        "content",
        "git",
        "svn",
        "hg",
        "directory",
    }

    listed_result = lister.run()

    # 1 page read is 1 origin
    nb_pages = expected_nb_origins
    assert listed_result == ListerStats(pages=nb_pages, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    mapping_visit_types = defaultdict(int)

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type in expected_visit_types
        # no last update is listed on those manifests
        assert listed_origin.last_update is None

        mapping_visit_types[listed_origin.visit_type] += 1

    assert dict(mapping_visit_types) == expected_visit_types


def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
    """NixGuixLister should ignore unsupported or incomplete origins"""
    url = "https://guix.gnu.org/sources.json"
    origin_upstream = "https://git.savannah.gnu.org/git/guix.git"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    response = page_response(datadir, "guix")

    requests_mock.get(
        url,
        [{"json": response}],
    )
    # Among the artifacts, this url does not allow determining its nature (tarball,
    # file). It ends up issuing an http head query, which returns a 404, so it is
    # skipped.
    requests_mock.head(
        "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
        status_code=404,
    )

    listed_result = lister.run()
    # only the upstream origin is listed; every other entry is unsupported or incomplete
    assert listed_result == ListerStats(pages=1, origins=1)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == 1

    assert scheduler_origins[0].visit_type == "git"


def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
    """NixGuixLister should abort when the manifest cannot be fetched"""
    url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
    origin_upstream = "https://github.com/NixOS/nixpkgs"
    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)

    requests_mock.get(
        url,
        status_code=404,
    )

    with pytest.raises(requests.HTTPError):  # listing cannot continue, so it stops
        lister.run()

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == 0