diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index aa2a323..1107aff 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -312,6 +312,14 @@ VCS_KEYS_MAPPING = {
 }
 
 
+VCS_ARTIFACT_TYPE_TO_VISIT_TYPE = {
+    "git": "git-checkout",
+    "hg": "hg-checkout",
+    "svn": "svn-export",
+}
+"""Mapping between the vcs artifact type to the loader's visit type."""
+
+
 class NixGuixLister(StatelessLister[PageResult]):
     """List Guix or Nix sources out of a public json manifest.
 
@@ -462,7 +470,7 @@ class NixGuixLister(StatelessLister[PageResult]):
                     fallback_urls=[],
                     checksums=checksums,
                     checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
-                    visit_type="directory",
+                    visit_type=VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type],
                     ref=plain_ref,
                 )
 
diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json
index 923e30e..b6a40f4 100644
--- a/swh/lister/nixguix/tests/data/sources-success.json
+++ b/swh/lister/nixguix/tests/data/sources-success.json
@@ -295,6 +295,22 @@
       "outputHashAlgo": "sha256",
       "outputHashMode": "recursive",
       "integrity": "sha256-yOAaOu/HiG1N/r2tAtdou/fPB+rEJt1TQbIGzQn7/pI="
+    },
+    {
+      "type": "hg",
+      "hg_url": "https://hg.sr.ht/~olly/yoyo",
+      "integrity": "sha256-mME9v34RyvpoCATSiLYqN78gtPNK4+1Pj54P2d5KX2A=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "recursive",
+      "hg_changeset": "v7.2.0-release"
+    },
+    {
+      "type": "svn",
+      "svn_url": "svn://svn.savannah.gnu.org/apl/trunk",
+      "integrity": "sha256-48aUFwjpsAlJ4Kw6oBWW3d57NQwV5igwzr8Ml4Aa7K0=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "recursive",
+      "svn_revision": "1550"
     }
   ],
   "version": "1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index 71e10e8..148688a 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -18,6 +18,7 @@ from swh.lister import TARBALL_EXTENSIONS
 from swh.lister.nixguix.lister import (
     DEFAULT_EXTENSIONS_TO_IGNORE,
     POSSIBLE_TARBALL_MIMETYPES,
+    VCS_ARTIFACT_TYPE_TO_VISIT_TYPE,
     ArtifactNatureMistyped,
     ArtifactNatureUndetected,
     ArtifactWithoutExtension,
@@ -275,21 +276,25 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
             expected_visit_types[artifact_type] += 1
             outputHashMode = artifact.get("outputHashMode", "flat")
             if outputHashMode == "recursive":
-                # 1 origin of type "directory" is listed in that case too
-                expected_visit_types["directory"] += 1
+                # Those are specific
+                visit_type = VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type]
+                expected_visit_types[visit_type] += 1
+                # 1 origin of type `visit_type` is listed in that case too
                 expected_nb_pages += 1
+
         elif artifact_type == "url":
             url = artifact["urls"][0]
             if url.endswith(".git"):
-                expected_visit_types["git"] += 1
+                visit_type = "git"
             elif url.endswith(".c") or url.endswith(".txt"):
-                expected_visit_types["content"] += 1
+                visit_type = "content"
             elif url.startswith("svn"):  # mistyped artifact rendered as vcs nonetheless
-                expected_visit_types["svn"] += 1
+                visit_type = "svn"
             elif "crates.io" in url or "codeload.github.com" in url:
-                expected_visit_types["directory"] += 1
+                visit_type = "directory"
             else:  # tarball artifacts
-                expected_visit_types["directory"] += 1
+                visit_type = "directory"
+            expected_visit_types[visit_type] += 1
 
     assert set(expected_visit_types.keys()) == {
         "content",
@@ -297,14 +302,19 @@
         "svn",
         "hg",
         "directory",
+        "git-checkout",
+        "svn-export",
+        "hg-checkout",
     }
 
     listed_result = lister.run()
 
     # Each artifact is considered an origin (even "url" artifacts with mirror urls) but
     expected_nb_origins = sum(expected_visit_types.values())
-    # 1 origin is duplicated for both visit_type 'git' and 'directory'
-    expected_nb_dictincts_origins = expected_nb_origins - 1
+    # 3 origins have their recursive hash mentioned, they are sent both as vcs and as
+    # specific vcs directory to ingest. So they are duplicated with visit_type 'git' and
+    # 'git-checkout', 'svn' and 'svn-export', 'hg' and 'hg-checkout'.
+    expected_nb_dictincts_origins = expected_nb_origins - 3
 
     # 1 page read is 1 origin
     assert listed_result == ListerStats(
@@ -316,15 +326,31 @@
     ).results
     assert len(scheduler_origins) == expected_nb_origins
 
-    # The dataset will trigger 2 listed origins, with 2 distinct visit types
-    duplicated_url = "https://example.org/rgerganov/footswitch"
-    duplicated_visit_types = [
-        origin.visit_type
-        for origin in scheduler_origins
-        if origin.url == duplicated_url
-    ]
+    # The test dataset will trigger some origins duplicated as mentioned above
+    # Let's check them out
+    duplicated_visit_types = []
+    for duplicated_url in [
+        "https://example.org/rgerganov/footswitch",
+        "https://hg.sr.ht/~olly/yoyo",
+        "svn://svn.savannah.gnu.org/apl/trunk",
+    ]:
+        duplicated_visit_types.extend(
+            [
+                origin.visit_type
+                for origin in scheduler_origins
+                if origin.url == duplicated_url
+            ]
+        )
 
-    assert set(duplicated_visit_types) == {"git", "directory"}
+    assert len(duplicated_visit_types) == 6
+    assert set(duplicated_visit_types) == {
+        "git",
+        "git-checkout",
+        "svn",
+        "svn-export",
+        "hg",
+        "hg-checkout",
+    }
 
     mapping_visit_types = defaultdict(int)