lister: Add new rust crates lister

The Crates lister retrieves crate packages for the Rust language.

It basically fetches https://github.com/rust-lang/crates.io-index.git
to a temp directory and then walks through each file to get the
crate's info.
This commit is contained in:
Franck Bret 2022-03-17 12:13:04 +01:00
parent ff0035a60b
commit fea6fc04aa
10 changed files with 357 additions and 0 deletions

View file

@ -17,6 +17,7 @@ repos:
- id: codespell
name: Check source code spelling
exclude: ^(swh/lister/.*/tests/data/.*)$
args: [-L crate]
stages: [commit]
- id: codespell
name: Check commit message spelling

View file

@ -58,6 +58,7 @@ setup(
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register

View file

@ -0,0 +1,12 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the crates lister plugin.

    Returns a dict with the lister class under ``"lister"`` and, under
    ``"task_modules"``, a one-element list naming this package's ``tasks``
    module.
    """
    # Imported here rather than at module level, so importing the package
    # alone does not pull in the lister implementation.
    from .lister import CratesLister

    tasks_module = f"{__name__}.tasks"
    return dict(lister=CratesLister, task_modules=[tasks_module])

138
swh/lister/crates/lister.py Normal file
View file

@ -0,0 +1,138 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
from pathlib import Path
import subprocess
from typing import Any, Dict, Iterator, List
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
# A page is a list of dicts; `get_pages` appends one dict per line (crate
# version) read from a crate's index file.
CratesListerPage = List[Dict[str, Any]]
class CratesLister(StatelessLister[CratesListerPage]):
    """List origins from the "crates.io" forge.

    It basically fetches https://github.com/rust-lang/crates.io-index.git to a
    temp directory and then walks through each file to get the crate's info.
    """

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "crates"
    # (Optional) VCS type of the origins listed by this lister, if constant
    VISIT_TYPE = "rust-crate"

    INSTANCE = "crates"
    # Git repository holding the crates.io index
    INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
    # Local directory the index repository is cloned into
    DESTINATION_PATH = Path("/tmp/crates.io-index")
    # Template of the download url of a given version of a crate
    CRATE_FILE_URL_PATTERN = (
        "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
    )

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.INDEX_REPOSITORY_URL,
            instance=self.INSTANCE,
        )

    def get_index_repository(self) -> None:
        """Get crates.io-index repository up to date running git command.

        The repository is cloned into ``DESTINATION_PATH``. When that
        directory already holds a git checkout (e.g. left over from a previous
        run), it is fast-forwarded instead, because ``git clone`` refuses to
        write into an existing non-empty directory.

        Raises:
            subprocess.CalledProcessError: if the git command exits with a
                non-zero status.
        """
        destination = str(self.DESTINATION_PATH)
        if (self.DESTINATION_PATH / ".git").is_dir():
            # Pull from the explicit url so that an overridden
            # INDEX_REPOSITORY_URL is honoured on an existing checkout too.
            subprocess.check_call(
                ["git", "pull", "--ff-only", self.INDEX_REPOSITORY_URL],
                cwd=destination,
            )
        else:
            subprocess.check_call(
                ["git", "clone", self.INDEX_REPOSITORY_URL, destination]
            )

    def get_crates_index(self) -> List[Path]:
        """Build a sorted list of file paths excluding dotted directories and
        dotted files.

        Each file path corresponds to a crate that lists all available
        versions. The top-level ``config.json`` file is excluded as well.
        """
        root = self.DESTINATION_PATH
        config_file = root / "config.json"
        return sorted(
            path
            for path in root.rglob("*")
            # Only look for dotted names below the repository root: a dotted
            # directory in the location of the checkout itself must not hide
            # every crate.
            if not any(
                part.startswith(".") for part in path.relative_to(root).parts
            )
            and path.is_file()
            and path != config_file
        )

    def get_pages(self) -> Iterator[CratesListerPage]:
        """Yield one page per crate index file, in ascending order of index
        file path.

        Each page is a list of crate versions with:
            - name: Name of the crate
            - version: Version
            - checksum: Checksum
            - crate_file: Url of the crate file
            - last_update: Date of the last commit of the corresponding index
              file

        Raises:
            subprocess.CalledProcessError: if a git command exits with a
                non-zero status.
        """
        # Fetch crates.io index repository
        self.get_index_repository()
        # Get a list of all crates files from the index repository
        crates_index = self.get_crates_index()
        logger.debug("found %s crates in crates_index", len(crates_index))

        for crate in crates_index:
            page: CratesListerPage = []
            # %cI is for strict iso8601 date formatting
            last_update_str = subprocess.check_output(
                ["git", "log", "-1", "--pretty=format:%cI", str(crate)],
                cwd=self.DESTINATION_PATH,
            )
            last_update = iso8601.parse_date(last_update_str.decode().strip())

            # An index file holds one JSON document per line, one line per
            # version of the crate.
            with crate.open("rb") as current_file:
                for line in current_file:
                    if not line.strip():
                        # tolerate blank lines instead of failing in json.loads
                        continue
                    data = json.loads(line)
                    # pick only the data we need
                    page.append(
                        dict(
                            name=data["name"],
                            version=data["vers"],
                            checksum=data["cksum"],
                            crate_file=self.CRATE_FILE_URL_PATTERN.format(
                                crate=data["name"], version=data["vers"]
                            ),
                            last_update=last_update,
                        )
                    )
            yield page

    def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all crate versions of a page and yield ListedOrigin
        instances.

        The origin url is the url of the crate file; name, version and
        checksum are passed along as extra loader arguments.
        """
        assert self.lister_obj.id is not None

        for version in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=version["crate_file"],
                last_update=version["last_update"],
                extra_loader_arguments={
                    "name": version["name"],
                    "version": version["version"],
                    "checksum": version["checksum"],
                },
            )

View file

@ -0,0 +1,19 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.crates.lister import CratesLister
@shared_task(name=__name__ + ".CratesListerTask")
def list_crates(**lister_args):
    """Lister task for crates (rust) registry

    The keyword arguments are forwarded as is to
    ``CratesLister.from_configfile``; the outcome of the lister run is
    returned converted through its ``dict()`` method.
    """
    crates_lister = CratesLister.from_configfile(**lister_args)
    run_outcome = crates_lister.run()
    return run_outcome.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always answers with the string "OK"."""
    reply = "OK"
    return reply

View file

@ -0,0 +1,29 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from pathlib import PosixPath
import subprocess
from typing import Optional, Union
def prepare_repository_from_archive(
    archive_path: str,
    filename: Optional[str] = None,
    tmp_path: Union[PosixPath, str] = "/tmp",
) -> str:
    """Uncompress the archive at ``archive_path`` into ``tmp_path``.

    Args:
        archive_path: archive to extract with ``tar``
        filename: last component of the returned url; when empty or None, the
            base name of ``archive_path`` is used instead
        tmp_path: directory the archive is extracted into

    Returns:
        A ``file://`` repo url which can be used as origin url.

    This does not deal with the case where the archive passed along does not
    exist.
    """
    destination = tmp_path if isinstance(tmp_path, str) else str(tmp_path)

    # uncompress folder/repositories/dump for the loader to ingest
    subprocess.check_output(["tar", "xf", archive_path, "-C", destination])

    # build the origin url (or some derivative form)
    leaf = filename or os.path.basename(archive_path)
    return f"file://{destination}/{leaf}"

View file

@ -0,0 +1,37 @@
#!/usr/bin/env bash

# Script to generate fake-crates-repository.tar.gz
# Creates a git repository like https://github.com/rust-lang/crates.io-index
# for tests purposes
#
# The repository is built in ./tmp_dir/crates.io-index and the archive is
# written to the directory the script is run from.

set -euo pipefail

# files and directories
mkdir -p tmp_dir/crates.io-index/
cd tmp_dir/crates.io-index/

# dotted directory and dotted file, next to an (empty) config.json
mkdir -p .dot-dir
touch .dot-dir/empty
mkdir -p ra/nd
mkdir -p re/ge

touch .dot-file
touch config.json

# index files: one JSON document per line, one line per crate version
echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand

echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex

echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax

# Init as a git repository
git init
git add .
git commit -m "Init fake crates.io-index repository for tests purpose"

# Save some space
# (-f so that the script does not abort under `set -e` when git was set up
# without sample hooks)
rm -f .git/hooks/*.sample

# Compress the repository as a tar.gz archive, with crates.io-index as its
# top-level directory
cd ..
tar -czf ../fake-crates-repository.tar.gz crates.io-index

View file

@ -0,0 +1,89 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from swh.lister.crates.lister import CratesLister
from swh.lister.crates.tests import prepare_repository_from_archive
# Origins the lister test below expects to find in the scheduler: one entry
# per crate version, with the url of the crate file and the values compared
# against the extra loader arguments (name, version, checksum).
expected_origins = [
    {
        "name": "rand",
        "version": "0.1.1",
        "checksum": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
        "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate",
    },
    {
        "name": "rand",
        "version": "0.1.2",
        "checksum": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
        "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate",
    },
    {
        "name": "regex",
        "version": "0.1.0",
        "checksum": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
        "url": "https://static.crates.io/crates/regex/regex-0.1.0.crate",
    },
    {
        "name": "regex",
        "version": "0.1.1",
        "checksum": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
        "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate",
    },
    {
        "name": "regex",
        "version": "0.1.2",
        "checksum": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
        "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate",
    },
    {
        "name": "regex",
        "version": "0.1.3",
        "checksum": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
        "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate",
    },
    {
        "name": "regex-syntax",
        "version": "0.1.0",
        "checksum": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
        "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate",
    },
]
def test_crates_lister(datadir, tmp_path, swh_scheduler):
    """Run the lister against the fake index repository archive and compare
    the origins recorded in the scheduler with ``expected_origins``."""
    archive_path = Path(datadir, "fake-crates-repository.tar.gz")
    repo_url = prepare_repository_from_archive(
        archive_path, "crates.io-index", tmp_path
    )

    # Point the lister at the local fixture repository instead of the real
    # index, and clone it next to the test's temporary directory.
    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"

    stats = lister.run()
    assert stats.pages == 3
    assert stats.origins == 7

    # Sort both sides by url so they can be compared pairwise
    wanted_origins = sorted(expected_origins, key=lambda origin: origin.get("url"))
    listed_origins = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda origin: origin.url,
    )

    for listed, wanted in zip(listed_origins, wanted_origins):
        assert listed.visit_type == "rust-crate"
        assert listed.url == wanted.get("url")
        loader_arguments = listed.extra_loader_arguments
        assert loader_arguments.get("name") == wanted.get("name")
        assert loader_arguments.get("version") == wanted.get("version")
        assert loader_arguments.get("checksum") == wanted.get("checksum")

    # zip() stops at the shortest sequence, hence the explicit length check
    assert len(listed_origins) == len(wanted_origins)

View file

@ -0,0 +1,31 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_crates_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task of the crates lister, sent through celery, answers "OK"."""
    task_name = "swh.lister.crates.tasks.ping"
    async_result = swh_scheduler_celery_app.send_task(task_name)
    assert async_result

    # block until the worker has processed the task
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"
def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task builds the lister from its config file, runs it once
    and returns the run statistics as a dict."""
    # setup the mocked CratesLister
    mocked_lister = mocker.patch("swh.lister.crates.tasks.CratesLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    expected_stats = ListerStats(pages=42, origins=42)
    mocked_lister.run.return_value = expected_stats

    task_name = "swh.lister.crates.tasks.CratesListerTask"
    async_result = swh_scheduler_celery_app.send_task(task_name)
    assert async_result

    # block until the worker has processed the task
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()