crates: Create one origin per package instead of per version

Previously we had as many origins as there were versions of a crate package;
each origin url was a link to a specific crate version package.

Refactor to have one origin per package name and add an 'artifacts' entry to
extra_loader_arguments that lists every version with its package url and
checksum.
The origin url is now a link to the related HTTP API endpoint for a package name.

Related to T4104
This commit is contained in:
Franck Bret 2022-04-26 10:48:02 +02:00
parent c251594a1f
commit 985b71e80c
2 changed files with 97 additions and 55 deletions

View file

@ -8,6 +8,7 @@ import logging
from pathlib import Path
import subprocess
from typing import Any, Dict, Iterator, List
from urllib.parse import urlparse
import iso8601
@ -32,7 +33,7 @@ class CratesLister(StatelessLister[CratesListerPage]):
# Part of the lister API, that identifies this lister
LISTER_NAME = "crates"
# (Optional) VCS type of the origins listed by this lister, if constant
VISIT_TYPE = "rust-crate"
VISIT_TYPE = "crates"
INSTANCE = "crates"
INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
@ -40,6 +41,7 @@ class CratesLister(StatelessLister[CratesListerPage]):
CRATE_FILE_URL_PATTERN = (
"https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
)
CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}"
def __init__(
self,
@ -131,15 +133,30 @@ class CratesLister(StatelessLister[CratesListerPage]):
assert self.lister_obj.id is not None
url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])
last_update = page[0]["last_update"]
artifacts = []
for version in page:
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=version["crate_file"],
last_update=version["last_update"],
extra_loader_arguments={
"name": version["name"],
"version": version["version"],
"checksum": version["checksum"],
filename = urlparse(version["crate_file"]).path.split("/")[-1]
# Build an artifact entry following original-artifacts-json specification
# https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950
artifact = {
"filename": f"{filename}",
"checksums": {
"sha256": f"{version['checksum']}",
},
)
"url": version["crate_file"],
"version": version["version"],
}
artifacts.append(artifact)
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=last_update,
extra_loader_arguments={
"artifacts": artifacts,
},
)

View file

@ -10,46 +10,75 @@ from swh.lister.crates.tests import prepare_repository_from_archive
expected_origins = [
{
"name": "rand",
"version": "0.1.1",
"checksum": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
"url": "https://static.crates.io/crates/rand/rand-0.1.1.crate",
"url": "https://crates.io/api/v1/crates/rand",
"artifacts": [
{
"checksums": {
"sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950
},
"filename": "rand-0.1.1.crate",
"url": "https://static.crates.io/crates/rand/rand-0.1.1.crate",
"version": "0.1.1",
},
{
"checksums": {
"sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950
},
"filename": "rand-0.1.2.crate",
"url": "https://static.crates.io/crates/rand/rand-0.1.2.crate",
"version": "0.1.2",
},
],
},
{
"name": "rand",
"version": "0.1.2",
"checksum": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
"url": "https://static.crates.io/crates/rand/rand-0.1.2.crate",
"url": "https://crates.io/api/v1/crates/regex",
"artifacts": [
{
"checksums": {
"sha256": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5", # noqa: B950
},
"filename": "regex-0.1.0.crate",
"url": "https://static.crates.io/crates/regex/regex-0.1.0.crate",
"version": "0.1.0",
},
{
"checksums": {
"sha256": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36", # noqa: B950
},
"filename": "regex-0.1.1.crate",
"url": "https://static.crates.io/crates/regex/regex-0.1.1.crate",
"version": "0.1.1",
},
{
"checksums": {
"sha256": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", # noqa: B950
},
"filename": "regex-0.1.2.crate",
"url": "https://static.crates.io/crates/regex/regex-0.1.2.crate",
"version": "0.1.2",
},
{
"checksums": {
"sha256": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", # noqa: B950
},
"filename": "regex-0.1.3.crate",
"url": "https://static.crates.io/crates/regex/regex-0.1.3.crate",
"version": "0.1.3",
},
],
},
{
"name": "regex",
"version": "0.1.0",
"checksum": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
"url": "https://static.crates.io/crates/regex/regex-0.1.0.crate",
},
{
"name": "regex",
"version": "0.1.1",
"checksum": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
"url": "https://static.crates.io/crates/regex/regex-0.1.1.crate",
},
{
"name": "regex",
"version": "0.1.2",
"checksum": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
"url": "https://static.crates.io/crates/regex/regex-0.1.2.crate",
},
{
"name": "regex",
"version": "0.1.3",
"checksum": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
"url": "https://static.crates.io/crates/regex/regex-0.1.3.crate",
},
{
"name": "regex-syntax",
"version": "0.1.0",
"checksum": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
"url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate",
"url": "https://crates.io/api/v1/crates/regex-syntax",
"artifacts": [
{
"checksums": {
"sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950
},
"filename": "regex-syntax-0.1.0.crate",
"url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate",
"version": "0.1.0",
},
],
},
]
@ -67,7 +96,7 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler):
res = lister.run()
assert res.pages == 3
assert res.origins == 7
assert res.origins == 3
expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
scheduler_origins_sorted = sorted(
@ -76,14 +105,10 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler):
)
for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted):
assert scheduled.visit_type == "rust-crate"
assert scheduled.visit_type == "crates"
assert scheduled.url == expected.get("url")
assert scheduled.extra_loader_arguments.get("name") == expected.get("name")
assert scheduled.extra_loader_arguments.get("version") == expected.get(
"version"
)
assert scheduled.extra_loader_arguments.get("checksum") == expected.get(
"checksum"
assert scheduled.extra_loader_arguments.get("artifacts") == expected.get(
"artifacts"
)
assert len(scheduler_origins_sorted) == len(expected_origins_sorted)