lister: Add new rust crates lister
The Crates lister retrieves crate packages for the Rust language. It basically fetches https://github.com/rust-lang/crates.io-index.git to a temp directory and then walks through each file to get the crate's info.
This commit is contained in:
parent
ff0035a60b
commit
fea6fc04aa
10 changed files with 357 additions and 0 deletions
|
@ -17,6 +17,7 @@ repos:
|
|||
- id: codespell
|
||||
name: Check source code spelling
|
||||
exclude: ^(swh/lister/.*/tests/data/.*)$
|
||||
args: [-L crate]
|
||||
stages: [commit]
|
||||
- id: codespell
|
||||
name: Check commit message spelling
|
||||
|
|
1
setup.py
1
setup.py
|
@ -58,6 +58,7 @@ setup(
|
|||
lister.bitbucket=swh.lister.bitbucket:register
|
||||
lister.cgit=swh.lister.cgit:register
|
||||
lister.cran=swh.lister.cran:register
|
||||
lister.crates=swh.lister.crates:register
|
||||
lister.debian=swh.lister.debian:register
|
||||
lister.gitea=swh.lister.gitea:register
|
||||
lister.github=swh.lister.github:register
|
||||
|
|
12
swh/lister/crates/__init__.py
Normal file
12
swh/lister/crates/__init__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Copyright (C) 2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
def register():
    """Describe the crates lister plugin.

    This is the callable referenced by the ``lister.crates`` entry point.

    Returns:
        A dict with the lister class under ``"lister"`` and the list of task
        module names under ``"task_modules"``.
    """
    # Imported lazily so that importing the package does not pull the lister
    # (and its dependencies) until the plugin is actually requested.
    from .lister import CratesLister

    task_modules = ["%s.tasks" % __name__]
    return {"lister": CratesLister, "task_modules": task_modules}
|
138
swh/lister/crates/lister.py
Normal file
138
swh/lister/crates/lister.py
Normal file
|
@ -0,0 +1,138 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
from typing import Any, Dict, Iterator, List
|
||||
|
||||
import iso8601
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
CratesListerPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class CratesLister(StatelessLister[CratesListerPage]):
    """List origins from the "crates.io" forge.

    It fetches https://github.com/rust-lang/crates.io-index.git to a local
    directory and then walks through each index file to get the crate's info.
    Every version of every crate is listed as its own origin.
    """

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "crates"
    # Visit type set on every origin listed by this lister
    VISIT_TYPE = "rust-crate"

    INSTANCE = "crates"
    # Git repository holding one index file per crate
    INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git"
    # Local checkout location of the index repository
    DESTINATION_PATH = Path("/tmp/crates.io-index")
    # Download url of a given crate version
    CRATE_FILE_URL_PATTERN = (
        "https://static.crates.io/crates/{crate}/{crate}-{version}.crate"
    )

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.INDEX_REPOSITORY_URL,
            instance=self.INSTANCE,
        )

    def get_index_repository(self) -> None:
        """Get crates.io-index repository up to date running git command.

        The repository is cloned on first use. When a checkout already exists
        at ``DESTINATION_PATH`` it is refreshed in place instead, because
        ``git clone`` refuses to write into an existing non-empty directory.

        Raises:
            subprocess.CalledProcessError: if a git command fails.
        """
        destination = Path(self.DESTINATION_PATH)

        if (destination / ".git").exists():
            # Fetch then hard-reset rather than merge, so the local checkout
            # mirrors the remote even if the remote history was rewritten.
            subprocess.check_call(
                ["git", "fetch", self.INDEX_REPOSITORY_URL, "HEAD"], cwd=destination,
            )
            subprocess.check_call(
                ["git", "reset", "--hard", "FETCH_HEAD"], cwd=destination,
            )
        else:
            subprocess.check_call(
                ["git", "clone", self.INDEX_REPOSITORY_URL, str(destination)]
            )

    def get_crates_index(self) -> List[Path]:
        """Build a sorted list of file paths excluding dotted directories and
        dotted files.

        Each file path corresponds to a crate that lists all available
        versions. The top-level ``config.json`` file is not a crate and is
        excluded as well.
        """
        root = Path(self.DESTINATION_PATH)
        config_file = root / "config.json"

        def is_hidden(path: Path) -> bool:
            # Only look at the parts below the checkout root: a dotted
            # directory *above* it (e.g. ~/.cache/...) must not hide every
            # crate file.
            return any(part.startswith(".") for part in path.relative_to(root).parts)

        return sorted(
            path
            for path in root.rglob("*")
            if not is_hidden(path) and path.is_file() and path != config_file
        )

    def get_pages(self) -> Iterator[CratesListerPage]:
        """Yield one page per crate, in ascending order of index file path.

        Each page is a list of crate versions with:
            - name: Name of the crate
            - version: Version
            - checksum: Checksum
            - crate_file: Url of the crate file
            - last_update: Date of the last commit of the corresponding index
              file
        """
        # Fetch crates.io index repository
        self.get_index_repository()
        # Get a list of all crates files from the index repository
        crates_index = self.get_crates_index()
        logger.debug("found %s crates in crates_index", len(crates_index))

        for crate in crates_index:
            page = []
            # %cI is for strict iso8601 date formatting
            last_update_str = subprocess.check_output(
                ["git", "log", "-1", "--pretty=format:%cI", str(crate)],
                cwd=self.DESTINATION_PATH,
            )
            last_update = iso8601.parse_date(last_update_str.decode().strip())

            # An index file holds one JSON document per line, one per version
            with crate.open("rb") as current_file:
                for line in current_file:
                    if not line.strip():
                        # Tolerate blank lines instead of failing in json.loads
                        continue
                    data = json.loads(line)
                    # pick only the data we need
                    page.append(
                        dict(
                            name=data["name"],
                            version=data["vers"],
                            checksum=data["cksum"],
                            crate_file=self.CRATE_FILE_URL_PATTERN.format(
                                crate=data["name"], version=data["vers"]
                            ),
                            last_update=last_update,
                        )
                    )
            yield page

    def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all crate versions of a page and yield ListedOrigin
        instances.

        The origin url is the crate file url; name, version and checksum are
        forwarded as extra loader arguments.
        """
        assert self.lister_obj.id is not None

        for version in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=version["crate_file"],
                last_update=version["last_update"],
                extra_loader_arguments={
                    "name": version["name"],
                    "version": version["version"],
                    "checksum": version["checksum"],
                },
            )
|
19
swh/lister/crates/tasks.py
Normal file
19
swh/lister/crates/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from swh.lister.crates.lister import CratesLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".CratesListerTask")
def list_crates(**lister_args):
    """Lister task for crates (rust) registry

    Builds a CratesLister from the configuration file with the given keyword
    arguments, runs it and returns the resulting stats as a dict.
    """
    lister = CratesLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always answers "OK"."""
    reply = "OK"
    return reply
|
29
swh/lister/crates/tests/__init__.py
Normal file
29
swh/lister/crates/tests/__init__.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
# Copyright (C) 2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import os
|
||||
from pathlib import PosixPath
|
||||
import subprocess
|
||||
from typing import Optional, Union
|
||||
|
||||
|
||||
def prepare_repository_from_archive(
    archive_path: str,
    filename: Optional[str] = None,
    tmp_path: Union[PosixPath, str] = "/tmp",
) -> str:
    """Uncompress an existing archive and return a ``file://`` url to it.

    Args:
        archive_path: Path of the archive to extract with ``tar``.
        filename: Name to use as last url component; defaults to the base name
            of ``archive_path``.
        tmp_path: Directory the archive is extracted into.

    Returns:
        A file repo url which can be used as origin url.

    This does not deal with the case where the archive passed along does not
    exist.
    """
    destination = tmp_path if isinstance(tmp_path, str) else str(tmp_path)

    # uncompress folder/repositories/dump for the loader to ingest
    subprocess.check_output(["tar", "xf", archive_path, "-C", destination])

    # build the origin url (or some derivative form)
    repo_name = filename or os.path.basename(archive_path)
    return f"file://{destination}/{repo_name}"
|
BIN
swh/lister/crates/tests/data/fake-crates-repository.tar.gz
Normal file
BIN
swh/lister/crates/tests/data/fake-crates-repository.tar.gz
Normal file
Binary file not shown.
37
swh/lister/crates/tests/data/fake_crates_repository_init.sh
Executable file
37
swh/lister/crates/tests/data/fake_crates_repository_init.sh
Executable file
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env bash

# Script to generate the content of fake-crates-repository.tar.gz
# Creates a git repository like https://github.com/rust-lang/crates.io-index
# for tests purposes, under tmp_dir/crates.io-index/ (relative to the current
# directory). Packing that directory into the archive is a separate step: this
# script does not run tar itself.

set -euo pipefail

# files and directories
mkdir -p tmp_dir/crates.io-index/
cd tmp_dir/crates.io-index/

# Dotted entries and config.json are not crates: they are here so the tests
# can check that they are left out of the listing.
mkdir -p .dot-dir
touch .dot-dir/empty
mkdir -p ra/nd
mkdir -p re/ge

touch .dot-file
touch config.json

# One index file per crate, one JSON line per published version
echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand

echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex

echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax

# Init as a git repository
git init
git add .
git commit -m "Init fake crates.io-index repository for tests purpose"

# Save some space
# -f: with `set -e`, a plain rm would fail the script when the git template
# installed no sample hooks and the glob matches nothing.
rm -f .git/hooks/*.sample
|
89
swh/lister/crates/tests/test_lister.py
Normal file
89
swh/lister/crates/tests/test_lister.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from swh.lister.crates.lister import CratesLister
|
||||
from swh.lister.crates.tests import prepare_repository_from_archive
|
||||
|
||||
# Origins expected from the fake crates.io-index repository: one entry per
# (crate name, version, checksum) triple, with the url of its crate file.
expected_origins = [
    {
        "name": name,
        "version": version,
        "checksum": checksum,
        "url": f"https://static.crates.io/crates/{name}/{name}-{version}.crate",
    }
    for name, version, checksum in (
        (
            "rand",
            "0.1.1",
            "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d",
        ),
        (
            "rand",
            "0.1.2",
            "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7",
        ),
        (
            "regex",
            "0.1.0",
            "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5",
        ),
        (
            "regex",
            "0.1.1",
            "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36",
        ),
        (
            "regex",
            "0.1.2",
            "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9",
        ),
        (
            "regex",
            "0.1.3",
            "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3",
        ),
        (
            "regex-syntax",
            "0.1.0",
            "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944",
        ),
    )
]
|
||||
|
||||
|
||||
def test_crates_lister(datadir, tmp_path, swh_scheduler):
    """Run the lister against the fake index repository and check that every
    crate version ends up recorded in the scheduler as a listed origin."""
    archive_path = Path(datadir, "fake-crates-repository.tar.gz")
    repo_url = prepare_repository_from_archive(
        archive_path, "crates.io-index", tmp_path
    )

    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    # Clone inside this test's own temporary directory: tmp_path.parent is
    # shared by every test of the session, so a checkout left there could
    # collide with another test or another run of this one.
    lister.DESTINATION_PATH = tmp_path / "crates.io-index-tests"

    res = lister.run()

    # One page per crate index file (rand, regex, regex-syntax)
    assert res.pages == 3
    assert res.origins == 7

    expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
    scheduler_origins_sorted = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda x: x.url,
    )

    # Check the sizes first: zip() below silently stops at the shorter list
    assert len(scheduler_origins_sorted) == len(expected_origins_sorted)

    for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted):
        assert scheduled.visit_type == "rust-crate"
        assert scheduled.url == expected.get("url")
        assert scheduled.extra_loader_arguments.get("name") == expected.get("name")
        assert scheduled.extra_loader_arguments.get("version") == expected.get(
            "version"
        )
        assert scheduled.extra_loader_arguments.get("checksum") == expected.get(
            "checksum"
        )
|
31
swh/lister/crates/tests/test_tasks.py
Normal file
31
swh/lister/crates/tests/test_tasks.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_crates_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The crates ping task must complete successfully and answer "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.ping")
    assert async_result

    # Block until the worker has processed the task
    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"
|
||||
|
||||
|
||||
def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task must build the lister from the config file, run it once
    and return its stats as a dict."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked CratesLister
    mocked_lister = mocker.patch("swh.lister.crates.tasks.CratesLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.crates.tasks.CratesListerTask"
    )
    assert async_result
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue