crates.lister: Implement incremental mode:

Add incremental mode support based on a 'last_commit' state, which is used to
find new package versions from the git diff between that commit and HEAD.
This commit is contained in:
Franck Bret 2022-07-08 12:46:11 +02:00
parent d34a6232a6
commit a6f796b268
6 changed files with 296 additions and 49 deletions

View file

@ -43,3 +43,5 @@ ignore_missing_imports = True
[mypy-xmltodict.*]
ignore_missing_imports = True
[mypy-dulwich.*]
ignore_missing_imports = True

View file

@ -7,3 +7,4 @@ launchpadlib
tenacity >= 6.2
xmltodict
lxml
dulwich

View file

@ -2,20 +2,24 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import datetime
import io
import json
import logging
from pathlib import Path
import subprocess
from typing import Any, Dict, Iterator, List
import shutil
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse
import iso8601
from dulwich import porcelain
from dulwich.patch import write_tree_diff
from dulwich.repo import Repo
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -23,11 +27,25 @@ logger = logging.getLogger(__name__)
CratesListerPage = List[Dict[str, Any]]
class CratesLister(StatelessLister[CratesListerPage]):
@dataclass
class CratesListerState:
    """Store lister state for incremental mode operations.

    'last_commit' represents a git commit hash
    """

    # An empty value means no commit has been recorded yet; get_pages() then
    # performs a full listing instead of an incremental one.
    last_commit: str = ""
class CratesLister(Lister[CratesListerState, CratesListerPage]):
"""List origins from the "crates.io" forge.
It basically fetches https://github.com/rust-lang/crates.io-index.git to a
temp directory and then walks through each file to get the crate's info.
temp directory and then walks through each file to get the crate's info on
the first run.
In incremental mode, it relies on the same Git repository but instead of reading
each file of the repo, it get the differences through ``git log last_commit..HEAD``.
Resulting output string is parsed to build page entries.
"""
# Part of the lister API, that identifies this lister
@ -55,17 +73,24 @@ class CratesLister(StatelessLister[CratesListerPage]):
instance=self.INSTANCE,
)
def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState:
    """Build a ``CratesListerState`` from its serialized dict form.

    Args:
        d: mapping of state field names to values; a missing
           ``last_commit`` key is treated as an empty string.

    Returns:
        The corresponding ``CratesListerState`` instance.
    """
    # Merge into a fresh dict so the caller's mapping is left untouched
    # (the default is overridden by any value present in ``d``).
    return CratesListerState(**{"last_commit": "", **d})
def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]:
    """Serialize the given lister state dataclass into a plain dict."""
    serialized = asdict(state)
    return serialized
def get_index_repository(self) -> None:
"""Get crates.io-index repository up to date running git command."""
subprocess.check_call(
[
"git",
"clone",
self.INDEX_REPOSITORY_URL,
self.DESTINATION_PATH,
]
)
if self.DESTINATION_PATH.exists():
porcelain.pull(
self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL
)
else:
porcelain.clone(
source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH
)
def get_crates_index(self) -> List[Path]:
"""Build a sorted list of file paths excluding dotted directories and
@ -74,7 +99,6 @@ class CratesLister(StatelessLister[CratesListerPage]):
Each file path corresponds to a crate that lists all available
versions.
"""
crates_index = sorted(
path
for path in self.DESTINATION_PATH.rglob("*")
@ -85,6 +109,51 @@ class CratesLister(StatelessLister[CratesListerPage]):
return crates_index
def get_last_commit_hash(self, repository_path: Path) -> str:
    """Return the id of the commit HEAD points to in a git repository.

    Args:
        repository_path: path of an existing local git repository.

    Returns:
        The commit id decoded to ``str``.
    """
    assert repository_path.exists()

    repo = Repo(str(repository_path))
    # Resolve HEAD to its commit object, then expose the commit id as text
    return repo[repo.head()].id.decode()
def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]:
    """Return the author date of the latest commit touching ``filepath``.

    Args:
        filepath: path of a file located under ``self.DESTINATION_PATH``.

    Returns:
        A UTC timezone-aware datetime, or None (after logging an error) when
        no commit related to the file can be found.
    """
    repo = Repo(str(self.DESTINATION_PATH))
    # The walker expects paths relative to the repository root
    relative_path = filepath.relative_to(self.DESTINATION_PATH)
    walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1)

    newest_entry = next(iter(walker), None)
    if newest_entry is None:
        logger.error(
            "Can not find %s related commits in repository %s", relative_path, repo
        )
        return None

    return datetime.datetime.fromtimestamp(
        newest_entry.commit.author_time, datetime.timezone.utc
    )
def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]:
    """Keep only the fields needed for a page entry from a package version
    definition, and add the crate file url built from its name and version.
    """
    crate_name = entry["name"]
    crate_version = entry["vers"]
    checksum = entry["cksum"]
    yanked = entry["yanked"]
    crate_file = self.CRATE_FILE_URL_PATTERN.format(
        crate=crate_name, version=crate_version
    )
    return {
        "name": crate_name,
        "version": crate_version,
        "checksum": checksum,
        "yanked": yanked,
        "crate_file": crate_file,
    }
def get_pages(self) -> Iterator[CratesListerPage]:
"""Yield an iterator sorted by name in ascending order of pages.
@ -98,34 +167,41 @@ class CratesLister(StatelessLister[CratesListerPage]):
"""
# Fetch crates.io index repository
self.get_index_repository()
# Get a list of all crates files from the index repository
crates_index = self.get_crates_index()
logger.debug("found %s crates in crates_index", len(crates_index))
if not self.state.last_commit:
# First discovery
# List all crates files from the index repository
crates_index = self.get_crates_index()
else:
# Incremental case
# Get new package version by parsing a range of commits from index repository
repo = Repo(str(self.DESTINATION_PATH))
head = repo[repo.head()]
last = repo[self.state.last_commit.encode()]
outstream = io.BytesIO()
write_tree_diff(outstream, repo.object_store, last.tree, head.tree)
raw_diff = outstream.getvalue()
crates_index = []
for line in raw_diff.splitlines():
if line.startswith(b"+++ b/"):
filepath = line.split(b"+++ b/", 1)[1]
crates_index.append(self.DESTINATION_PATH / filepath.decode())
crates_index = sorted(crates_index)
logger.debug("Found %s crates in crates_index", len(crates_index))
# Each line of a crate file is a json entry describing released versions
# for a package
for crate in crates_index:
page = []
# %cI is for strict iso8601 date formatting
last_update_str = subprocess.check_output(
["git", "log", "-1", "--pretty=format:%cI", str(crate)],
cwd=self.DESTINATION_PATH,
)
last_update = iso8601.parse_date(last_update_str.decode().strip())
last_update = self.get_last_update_by_file(crate)
with crate.open("rb") as current_file:
for line in current_file:
data = json.loads(line)
# pick only the data we need
page.append(
dict(
name=data["name"],
version=data["vers"],
checksum=data["cksum"],
crate_file=self.CRATE_FILE_URL_PATTERN.format(
crate=data["name"], version=data["vers"]
),
last_update=last_update,
)
)
entry = self.page_entry_dict(data)
entry["last_update"] = last_update
page.append(entry)
yield page
def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]:
@ -136,6 +212,7 @@ class CratesLister(StatelessLister[CratesListerPage]):
url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])
last_update = page[0]["last_update"]
artifacts = []
crates_metadata = []
for version in page:
filename = urlparse(version["crate_file"]).path.split("/")[-1]
@ -150,6 +227,8 @@ class CratesLister(StatelessLister[CratesListerPage]):
"version": version["version"],
}
artifacts.append(artifact)
data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}}
crates_metadata.append(data)
yield ListedOrigin(
lister_id=self.lister_obj.id,
@ -158,5 +237,23 @@ class CratesLister(StatelessLister[CratesListerPage]):
last_update=last_update,
extra_loader_arguments={
"artifacts": artifacts,
"crates_metadata": crates_metadata,
},
)
def finalize(self) -> None:
    """Record the index repository's last commit in the lister state, then
    remove the local repository directory.

    ``self.updated`` is set to True only when that commit differs from the
    previously stored ``last_commit``.
    """
    head_commit = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH)

    has_changed = self.state.last_commit != head_commit
    if has_changed:
        self.state.last_commit = head_commit
    self.updated = has_changed

    logger.debug(
        "Listing crates origin completed with last commit id %s", head_commit
    )

    # Cleanup by removing the repository directory
    if not self.DESTINATION_PATH.exists():
        return
    shutil.rmtree(self.DESTINATION_PATH)
    logger.debug("Successfully removed %s directory", str(self.DESTINATION_PATH))

View file

@ -18,20 +18,47 @@ mkdir -p re/ge
touch .dot-file
touch config.json
echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand
echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex
echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax
# Init as a git repository
git init
git add .
git commit -m "Init fake crates.io-index repository for tests purpose"
echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand
git add .
git commit -m " Updating crate rand#0.1.1"
echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand
git add .
git commit -m " Updating crate rand#0.1.2"
echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex
git add .
git commit -m " Updating crate regex#0.1.0"
echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex
git add .
git commit -m " Updating crate regex#0.1.1"
echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex
git add .
git commit -m " Updating crate regex#0.1.2"
echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex
git add .
git commit -m " Updating crate regex#0.1.3"
echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax
git add .
git commit -m " Updating crate regex-syntax#0.1.0"
# Save some space
rm .git/hooks/*.sample
# Compress git directory as a tar.gz archive
cd ../
tar -cvzf fake-crates-repository.tar.gz crates.io-index
mv fake-crates-repository.tar.gz ../
# Clean up tmp_dir
cd ../
rm -rf tmp_dir

View file

@ -5,7 +5,9 @@
from pathlib import Path
from swh.lister.crates.lister import CratesLister
from dulwich.repo import Repo
from swh.lister.crates.lister import CratesLister, CratesListerState
from swh.lister.crates.tests import prepare_repository_from_archive
expected_origins = [
@ -29,6 +31,16 @@ expected_origins = [
"version": "0.1.2",
},
],
"metadata": [
{
"version": "0.1.1",
"yanked": False,
},
{
"version": "0.1.2",
"yanked": False,
},
],
},
{
"url": "https://crates.io/api/v1/crates/regex",
@ -66,6 +78,24 @@ expected_origins = [
"version": "0.1.3",
},
],
"metadata": [
{
"version": "0.1.0",
"yanked": False,
},
{
"version": "0.1.1",
"yanked": False,
},
{
"version": "0.1.2",
"yanked": False,
},
{
"version": "0.1.3",
"yanked": False,
},
],
},
{
"url": "https://crates.io/api/v1/crates/regex-syntax",
@ -79,10 +109,19 @@ expected_origins = [
"version": "0.1.0",
},
],
"metadata": [
{
"version": "0.1.0",
"yanked": False,
},
],
},
]
expected_origins_incremental = [expected_origins[1], expected_origins[2]]
def test_crates_lister(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-crates-repository.tar.gz")
repo_url = prepare_repository_from_archive(
@ -112,3 +151,84 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler):
)
assert len(scheduler_origins_sorted) == len(expected_origins_sorted)
def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler):
    """Run the lister with a state pointing a few commits behind HEAD and check
    the listed origins against ``expected_origins_incremental``."""
    repo_url = prepare_repository_from_archive(
        Path(datadir, "fake-crates-repository.tar.gz"), "crates.io-index", tmp_path
    )

    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"
    # The lister has not run yet, so fetch the index repository explicitly
    lister.get_index_repository()

    # Force the incremental case: use the last of the three most recent
    # commits as the stored 'last_commit'
    repo = Repo(lister.DESTINATION_PATH)
    recent_entries = list(repo.get_walker(max_entries=3))
    step = recent_entries[-1]
    lister.state = CratesListerState(last_commit=step.commit.id.decode())

    res = lister.run()

    assert res.pages == 2
    assert res.origins == 2

    wanted = sorted(expected_origins_incremental, key=lambda origin: origin.get("url"))
    listed = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda origin: origin.url,
    )

    for scheduled, expected in zip(listed, wanted):
        assert scheduled.visit_type == "crates"
        assert scheduled.url == expected.get("url")
        assert scheduled.extra_loader_arguments.get("artifacts") == expected.get(
            "artifacts"
        )

    assert len(listed) == len(wanted)
def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler):
    """Ensure incremental mode runs fine when the repository's last commit is the
    same as lister.state.last_commit"""
    repo_url = prepare_repository_from_archive(
        Path(datadir, "fake-crates-repository.tar.gz"), "crates.io-index", tmp_path
    )

    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests"
    lister.get_index_repository()

    # Force the incremental case with a state already pointing at HEAD
    head_sha = Repo(lister.DESTINATION_PATH).head().decode()
    lister.state = CratesListerState(last_commit=head_sha)

    res = lister.run()

    assert res.pages == 0
    assert res.origins == 0
def test_crates_lister_repository_cleanup(datadir, tmp_path, swh_scheduler):
    """Check that the local index repository directory is gone after a run."""
    destination = tmp_path.parent / "crates.io-index-tests"
    repo_url = prepare_repository_from_archive(
        Path(datadir, "fake-crates-repository.tar.gz"), "crates.io-index", tmp_path
    )

    lister = CratesLister(scheduler=swh_scheduler)
    lister.INDEX_REPOSITORY_URL = repo_url
    lister.DESTINATION_PATH = destination

    lister.run()

    # Repository directory should not exist after the lister runs
    assert not lister.DESTINATION_PATH.exists()