Adapt directory loader visit type depending on the vcs tree to ingest

Prior to this, it was sending only 'directory' visit types for all vcs trees. Multiple
directory loaders now exist whose visit types currently diverge, so scheduling
would not happen correctly without this change. This commit is the required adaptation
for the scheduling to work appropriately.

Refs. swh/meta#4979
This commit is contained in:
Antoine R. Dumont (@ardumont) 2023-06-02 14:52:50 +02:00
parent 9f252fc85f
commit 0756c44ea3
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
3 changed files with 68 additions and 18 deletions

View file

@ -312,6 +312,14 @@ VCS_KEYS_MAPPING = {
}
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE = {
"git": "git-checkout",
"hg": "hg-checkout",
"svn": "svn-export",
}
"""Mapping between the vcs artifact type to the loader's visit type."""
class NixGuixLister(StatelessLister[PageResult]):
"""List Guix or Nix sources out of a public json manifest.
@ -462,7 +470,7 @@ class NixGuixLister(StatelessLister[PageResult]):
fallback_urls=[],
checksums=checksums,
checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
visit_type="directory",
visit_type=VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type],
ref=plain_ref,
)

View file

@ -295,6 +295,22 @@
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"integrity": "sha256-yOAaOu/HiG1N/r2tAtdou/fPB+rEJt1TQbIGzQn7/pI="
},
{
"type": "hg",
"hg_url": "https://hg.sr.ht/~olly/yoyo",
"integrity": "sha256-mME9v34RyvpoCATSiLYqN78gtPNK4+1Pj54P2d5KX2A=",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"hg_changeset": "v7.2.0-release"
},
{
"type": "svn",
"svn_url": "svn://svn.savannah.gnu.org/apl/trunk",
"integrity": "sha256-48aUFwjpsAlJ4Kw6oBWW3d57NQwV5igwzr8Ml4Aa7K0=",
"outputHashAlgo": "sha256",
"outputHashMode": "recursive",
"svn_revision": "1550"
}
],
"version": "1",

View file

@ -18,6 +18,7 @@ from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
VCS_ARTIFACT_TYPE_TO_VISIT_TYPE,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
ArtifactWithoutExtension,
@ -275,21 +276,25 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
expected_visit_types[artifact_type] += 1
outputHashMode = artifact.get("outputHashMode", "flat")
if outputHashMode == "recursive":
# 1 origin of type "directory" is listed in that case too
expected_visit_types["directory"] += 1
# Those are specific
visit_type = VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type]
expected_visit_types[visit_type] += 1
# 1 origin of type `visit_type` is listed in that case too
expected_nb_pages += 1
elif artifact_type == "url":
url = artifact["urls"][0]
if url.endswith(".git"):
expected_visit_types["git"] += 1
visit_type = "git"
elif url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
visit_type = "content"
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
visit_type = "svn"
elif "crates.io" in url or "codeload.github.com" in url:
expected_visit_types["directory"] += 1
visit_type = "directory"
else: # tarball artifacts
expected_visit_types["directory"] += 1
visit_type = "directory"
expected_visit_types[visit_type] += 1
assert set(expected_visit_types.keys()) == {
"content",
@ -297,14 +302,19 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"svn",
"hg",
"directory",
"git-checkout",
"svn-export",
"hg-checkout",
}
listed_result = lister.run()
# Each artifact is considered an origin (even "url" artifacts with mirror urls) but
expected_nb_origins = sum(expected_visit_types.values())
# 1 origin is duplicated for both visit_type 'git' and 'directory'
expected_nb_dictincts_origins = expected_nb_origins - 1
# 3 origins have their recursive hash mentioned, they are sent both as vcs and as
# specific vcs directory to ingest. So they are duplicated with visit_type 'git' and
# 'git-checkout', 'svn' and 'svn-export', 'hg' and 'hg-checkout'.
expected_nb_dictincts_origins = expected_nb_origins - 3
# 1 page read is 1 origin
assert listed_result == ListerStats(
@ -316,15 +326,31 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
).results
assert len(scheduler_origins) == expected_nb_origins
# The dataset will trigger 2 listed origins, with 2 distinct visit types
duplicated_url = "https://example.org/rgerganov/footswitch"
duplicated_visit_types = [
origin.visit_type
for origin in scheduler_origins
if origin.url == duplicated_url
]
# The test dataset will trigger some origins duplicated as mentioned above
# Let's check them out
duplicated_visit_types = []
for duplicated_url in [
"https://example.org/rgerganov/footswitch",
"https://hg.sr.ht/~olly/yoyo",
"svn://svn.savannah.gnu.org/apl/trunk",
]:
duplicated_visit_types.extend(
[
origin.visit_type
for origin in scheduler_origins
if origin.url == duplicated_url
]
)
assert set(duplicated_visit_types) == {"git", "directory"}
assert len(duplicated_visit_types) == 6
assert set(duplicated_visit_types) == {
"git",
"git-checkout",
"svn",
"svn-export",
"hg",
"hg-checkout",
}
mapping_visit_types = defaultdict(int)