Stateful Julia lister
Add a state to the lister to store the ``last_seen_commit`` as a Git commit hash. Use Dulwich to retrieve a Git commit walker since ``last_seen_commit``, if any. For each commit, detect whether it is a new package or a new package version commit and return its origin with the commit date as last_update.
This commit is contained in:
parent
053f0a93d5
commit
99bbd9d68f
7 changed files with 284 additions and 35 deletions
|
@ -28,9 +28,9 @@ Origins retrieval strategy
|
|||
--------------------------
|
||||
|
||||
To build a list of origins we clone the `Julia General registry`_ Git repository, then
|
||||
read the `Registry.toml`_ file to get the path to packages directories.
|
||||
Each directory have a `Package.toml` file from where we get the Git repository url for
|
||||
a package.
|
||||
walk through commits with the help of `Dulwich`_ to detect commits related to a new package
|
||||
or a new version of a package. For each of those commits we get the path to `Package.toml`
|
||||
file from where we get the Git repository url for a package.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
@ -40,7 +40,12 @@ There is only one page listing all origins url.
|
|||
Origins from page
|
||||
-----------------
|
||||
|
||||
The lister is stateless and yields all origins url from one page.
|
||||
The lister yields all origin urls from one page.
|
||||
|
||||
Each time the lister is executed, the HEAD commit id of `Julia General registry`_
|
||||
is stored as ``state.last_seen_commit`` and used on next run to retrieve new origins
|
||||
since the last commit.
|
||||
|
||||
Each url corresponds to the Git url of the package repository.
|
||||
|
||||
Running tests
|
||||
|
@ -71,6 +76,7 @@ You can follow lister execution by displaying logs of swh-lister service::
|
|||
.. _JuliaHub: https://juliahub.com/
|
||||
.. _Julia Packages: https://julialang.org/packages/
|
||||
.. _Registry.toml: https://github.com/JuliaRegistries/General/blob/master/Registry.toml
|
||||
.. _Dulwich: https://www.dulwich.io/
|
||||
""" # noqa: B950
|
||||
|
||||
|
||||
|
|
|
@ -3,27 +3,40 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
import datetime
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import Any, Iterator, List, Optional, Tuple
|
||||
from typing import Any, Dict, Iterator, Optional
|
||||
|
||||
from dulwich import porcelain
|
||||
from dulwich.repo import Repo
|
||||
from dulwich.walk import WalkEntry
|
||||
import iso8601
|
||||
import toml
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
from ..pattern import CredentialsType, Lister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
JuliaListerPage = List[Tuple[str, Any]]
|
||||
JuliaListerPage = Dict[str, Any]
|
||||
|
||||
|
||||
class JuliaLister(StatelessLister[JuliaListerPage]):
|
||||
@dataclass
class JuliaListerState:
    """Persistent state of the Julia lister, used for incremental listing."""

    # Hash of the latest Git commit at the time the lister was executed;
    # ``None`` when no commit has been recorded yet.
    last_seen_commit: Optional[str] = None
|
||||
|
||||
|
||||
class JuliaLister(Lister[JuliaListerState, JuliaListerPage]):
|
||||
"""List Julia packages origins"""
|
||||
|
||||
LISTER_NAME = "julia"
|
||||
|
@ -34,7 +47,6 @@ class JuliaLister(StatelessLister[JuliaListerPage]):
|
|||
"https://github.com/JuliaRegistries/General.git" # Julia General Registry
|
||||
)
|
||||
REPO_PATH = Path(tempfile.mkdtemp(), "General")
|
||||
REGISTRY_PATH = REPO_PATH / "Registry.toml"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -63,40 +75,111 @@ class JuliaLister(StatelessLister[JuliaListerPage]):
|
|||
except FileExistsError:
|
||||
porcelain.pull(self.REPO_PATH, remote_location=self.url)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> JuliaListerState:
    """Rebuild a ``JuliaListerState`` from its dictionary form.

    Every key of ``d`` is passed as a keyword argument to the
    ``JuliaListerState`` constructor.
    """
    restored = JuliaListerState(**d)
    return restored
|
||||
|
||||
def state_to_dict(self, state: JuliaListerState) -> Dict[str, Any]:
    """Serialize ``state`` into a plain dict with ``dataclasses.asdict``."""
    serialized: Dict[str, Any] = asdict(state)
    return serialized
|
||||
|
||||
def get_origin_data(self, entry: WalkEntry) -> Dict[str, Any]:
    """Extract the origin described by a registry commit, if any.

    Only commits whose message starts with ``New package: `` or
    ``New version: `` are considered. The paths changed by such a commit
    are scanned for a ``Package.toml`` file, or for a ``Versions.toml``
    file that exists under ``REPO_PATH``; in the latter case the
    ``Package.toml`` file of the same directory is used. The ``repo`` value
    read from that ``Package.toml`` is the origin url.

    Args:
        entry: a commit walker entry exposing ``commit`` and ``changes()``

    Returns:
        A dict with the origin url as key and the commit date, as an
        ISO 8601 string in UTC, as value. An empty dict when the commit
        does not describe a new package or a new package version, or when
        no ``Package.toml`` file can be found for it.
    """
    assert entry

    commit = entry.commit
    if not commit or not commit.message.startswith(
        (b"New package: ", b"New version: ")
    ):
        return {}

    package_toml = None
    for change in entry.changes():
        # Items without a ``new`` attribute are skipped, as before.
        # NOTE(review): presumably these are the per-parent change lists
        # reported for merge commits -- confirm against the Dulwich docs.
        new_entry = getattr(change, "new", None)
        path = getattr(new_entry, "path", None)
        if not path:
            # A deleted file has no new path (``None``); calling
            # ``endswith`` on it would raise an AttributeError.
            continue

        if path.endswith(b"/Package.toml"):
            package_toml = self.REPO_PATH / path.decode()
            break

        if path.endswith(b"/Versions.toml"):
            versions_path = self.REPO_PATH / path.decode()
            if versions_path.exists():
                # Package.toml sits next to Versions.toml
                package_toml = versions_path.with_name("Package.toml")
                break

    if package_toml is None or not package_toml.exists():
        return {}

    origin = toml.load(package_toml)["repo"]
    last_update = datetime.datetime.fromtimestamp(
        commit.commit_time,
        tz=datetime.timezone.utc,
    ).isoformat()
    return {str(origin): last_update}
|
||||
|
||||
def get_pages(self) -> Iterator[JuliaListerPage]:
|
||||
"""Yield an iterator which returns 'page'
|
||||
|
||||
To build a list of origins the `Julia General registry` Git
|
||||
repository is cloned to get a `Registry.toml` file, an index file of
|
||||
packages directories.
|
||||
To build a list of origins the ``Julia General registry`` Git
|
||||
repository is cloned to look at commits history to discover new
|
||||
package and new package versions.
|
||||
|
||||
Depending on the ``last_seen_commit`` state it initiates a commit walker
|
||||
since the last time the lister has been executed.
|
||||
|
||||
There is only one page that lists all origin urls.
|
||||
"""
|
||||
# Clone the repository
|
||||
self.get_registry_repository()
|
||||
assert self.REGISTRY_PATH.exists()
|
||||
registry = toml.load(self.REGISTRY_PATH)
|
||||
yield registry["packages"].items()
|
||||
assert self.REPO_PATH.exists()
|
||||
|
||||
repo = Repo(str(self.REPO_PATH))
|
||||
|
||||
# Detect commits related to new package and new versions since last_seen_commit
|
||||
if not self.state.last_seen_commit:
|
||||
walker = repo.get_walker()
|
||||
else:
|
||||
last = repo[self.state.last_seen_commit.encode()]
|
||||
walker = repo.get_walker(since=last.commit_time, exclude=[last.id])
|
||||
|
||||
assert walker
|
||||
packages = {}
|
||||
for entry in walker:
|
||||
packages.update(self.get_origin_data(entry=entry))
|
||||
|
||||
yield packages
|
||||
|
||||
def get_origins_from_page(self, page: JuliaListerPage) -> Iterator[ListedOrigin]:
|
||||
"""Iterate on all pages and yield ListedOrigin instances
|
||||
|
||||
Each directory of the Git repository have a `Package.toml` file from
|
||||
where we get the Git repository url for each package.
|
||||
Each directory of the Git repository has a ``Package.toml`` file from
|
||||
where we get the Git repository url as an origin for each package.
|
||||
"""
|
||||
assert self.lister_obj.id is not None
|
||||
assert self.REPO_PATH.exists()
|
||||
|
||||
for uuid, info in page:
|
||||
package_info_path = self.REPO_PATH / info["path"] / "Package.toml"
|
||||
package_info = toml.load(package_info_path)
|
||||
for origin, last_update in page.items():
|
||||
last_update = iso8601.parse_date(last_update)
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
visit_type=self.VISIT_TYPE,
|
||||
url=package_info["repo"],
|
||||
last_update=None,
|
||||
url=origin,
|
||||
last_update=last_update,
|
||||
)
|
||||
|
||||
def finalize(self) -> None:
    """Record the registry HEAD commit in the lister state and clean up.

    The hash of the HEAD commit of the repository at ``REPO_PATH`` is
    stored as ``state.last_seen_commit``, the lister is flagged as updated
    and the ``REPO_PATH`` directory is removed.
    """
    # Get Git HEAD commit hash. The repository is opened as a context
    # manager so that it is closed before its directory is removed below.
    with Repo(str(self.REPO_PATH)) as repo:
        self.state.last_seen_commit = repo.head().decode("ascii")
    self.updated = True

    # Remove the temporary directory REPO_PATH
    if self.REPO_PATH.exists():
        shutil.rmtree(self.REPO_PATH)
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -27,6 +27,9 @@ some amount of consideration when choosing package names.
|
|||
[packages]' > Registry.toml
|
||||
|
||||
# Init as a git repository
|
||||
# Force author and commit date to be the same
|
||||
export GIT_AUTHOR_DATE='2001-01-01T17:18:19+00:00'
|
||||
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
|
||||
git init
|
||||
git add .
|
||||
git commit -m "Init fake Julia registry repository for tests purpose"
|
||||
|
@ -50,6 +53,8 @@ git-tree-sha1 = "65301af3ab06b04cf8a52cd43b06222bab5249c2"
|
|||
|
||||
echo 'a3ea4736-0a3b-4c29-ac8a-20364318a635 = { name = "Fable", path = "F/Fable" }' >> Registry.toml
|
||||
|
||||
export GIT_AUTHOR_DATE='2001-01-02T17:18:19+00:00'
|
||||
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
|
||||
git add .
|
||||
git commit -m "New package: Fable v0.0.2"
|
||||
|
||||
|
@ -132,16 +137,60 @@ git-tree-sha1 = "59619a31c56c9e61b5dabdbd339e30c227c5d13d"
|
|||
|
||||
echo 'f1435218-dba5-11e9-1e4d-f1a5fab5fc13 = { name = "Oscar", path = "O/Oscar" }' >> Registry.toml
|
||||
|
||||
export GIT_AUTHOR_DATE='2001-01-03T17:18:19+00:00'
|
||||
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
|
||||
git add .
|
||||
git commit -m "New package: Oscar v0.12.1"
|
||||
|
||||
# Save some space
|
||||
rm .git/hooks/*.sample
|
||||
|
||||
# Archive
|
||||
# First Archive
|
||||
cd ../
|
||||
tar -czf fake-julia-registry-repository.tar.gz General
|
||||
mv fake-julia-registry-repository.tar.gz ../
|
||||
tar -czf fake-julia-registry-repository_0.tar.gz General
|
||||
mv fake-julia-registry-repository_0.tar.gz ../
|
||||
|
||||
# Add some more commits and build a second archive for incremental tests purpose
|
||||
cd General
|
||||
echo '
|
||||
|
||||
["0.13.0"]
|
||||
git-tree-sha1 = "c090495f818a063ed23d2d911fe74cc4358b5351"
|
||||
' >> O/Oscar/Versions.toml
|
||||
|
||||
# New version, replace previous uuid with a new one
|
||||
sed -i -e 's/f1435218-dba5-11e9-1e4d-f1a5fab5fc13/a3ea4736-0a3b-4c29-ac8a-20364318a635/g' Registry.toml
|
||||
|
||||
export GIT_AUTHOR_DATE='2001-01-04T17:18:19+00:00'
|
||||
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
|
||||
git add .
|
||||
git commit -m "New version: Oscar v0.13.0"
|
||||
|
||||
mkdir -p V/VulkanSpec
|
||||
|
||||
touch V/VulkanSpec/Package.toml
|
||||
touch V/VulkanSpec/Versions.toml
|
||||
|
||||
echo 'name = "VulkanSpec"
|
||||
uuid = "99a7788f-8f0f-454f-8f6c-c6cf389551ae"
|
||||
repo = "https://github.com/serenity4/VulkanSpec.jl.git"
|
||||
' > V/VulkanSpec/Package.toml
|
||||
|
||||
echo '["0.1.0"]
|
||||
git-tree-sha1 = "b5fef67130191c797007a1484f4dc6bfc840caa2"
|
||||
' > V/VulkanSpec/Versions.toml
|
||||
|
||||
echo '99a7788f-8f0f-454f-8f6c-c6cf389551ae = { name = "VulkanSpec", path = "V/VulkanSpec" }' >> Registry.toml
|
||||
|
||||
export GIT_AUTHOR_DATE='2001-01-05T17:18:19+00:00'
|
||||
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
|
||||
git add .
|
||||
git commit -m "New package: VulkanSpec v0.1.0"
|
||||
|
||||
# Second Archive
|
||||
cd ../
|
||||
tar -czf fake-julia-registry-repository_1.tar.gz General
|
||||
mv fake-julia-registry-repository_1.tar.gz ../
|
||||
|
||||
# Clean up tmp_dir
|
||||
cd ../
|
||||
|
|
|
@ -5,17 +5,25 @@
|
|||
|
||||
from pathlib import Path
|
||||
|
||||
from dulwich import porcelain
|
||||
import iso8601
|
||||
|
||||
from swh.lister.julia.lister import JuliaLister
|
||||
from swh.lister.julia.tests import prepare_repository_from_archive
|
||||
|
||||
expected_origins = [
|
||||
"https://github.com/leios/Fable.jl.git",
|
||||
"https://github.com/oscar-system/Oscar.jl.git",
|
||||
]
|
||||
expected_origins_0 = {
|
||||
"https://github.com/leios/Fable.jl.git": "2001-01-02T17:18:19+00:00",
|
||||
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-03T17:18:19+00:00",
|
||||
}
|
||||
|
||||
expected_origins_1 = {
|
||||
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-04T17:18:19+00:00",
|
||||
"https://github.com/serenity4/VulkanSpec.jl.git": "2001-01-05T17:18:19+00:00",
|
||||
}
|
||||
|
||||
|
||||
def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
|
||||
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
|
||||
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
|
||||
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
|
||||
|
||||
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
|
||||
|
@ -33,17 +41,18 @@ def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
|
|||
|
||||
|
||||
def test_julia_lister(datadir, tmp_path, swh_scheduler):
|
||||
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
|
||||
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
|
||||
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
|
||||
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
|
||||
lister.REPO_PATH = Path(tmp_path, "General")
|
||||
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
|
||||
|
||||
res = lister.run()
|
||||
assert res.origins == 1 + 1
|
||||
assert res.origins == len(expected_origins_0)
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
assert len(scheduler_origins) == len(expected_origins)
|
||||
assert len(scheduler_origins) == len(expected_origins_0)
|
||||
|
||||
assert {
|
||||
(
|
||||
scheduled.visit_type,
|
||||
|
@ -51,4 +60,106 @@ def test_julia_lister(datadir, tmp_path, swh_scheduler):
|
|||
scheduled.last_update,
|
||||
)
|
||||
for scheduled in scheduler_origins
|
||||
} == {("git", expected, None) for expected in expected_origins}
|
||||
} == {
|
||||
("git", origin, iso8601.parse_date(last_update))
|
||||
for origin, last_update in expected_origins_0.items()
|
||||
}
|
||||
|
||||
|
||||
def test_julia_lister_incremental(datadir, tmp_path, swh_scheduler):
    """Run the lister twice, against two successive states of the registry.

    The first run uses the ``_0`` archive and must list the origins of
    ``expected_origins_0``. The second run uses the ``_1`` archive, which
    holds additional commits, and must only list ``expected_origins_1``.
    """
    archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)

    # Prepare first run
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
    # Latest Git commit hash expected
    with porcelain.open_repo_closing(lister.REPO_PATH) as r:
        expected_last_seen_commit = r.head().decode("ascii")

    assert expected_last_seen_commit is not None
    # No state has been recorded before the first run
    assert lister.state.last_seen_commit is None

    # First run
    res = lister.run()
    assert res.pages == 1
    assert res.origins == len(expected_origins_0)
    # The HEAD commit of the first archive is now stored in the state
    assert lister.state.last_seen_commit == expected_last_seen_commit

    scheduler_origins_0 = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins_0) == len(expected_origins_0)
    # Each origin is listed with its commit date as last_update
    assert {
        (
            scheduled.visit_type,
            scheduled.url,
            scheduled.last_update,
        )
        for scheduled in scheduler_origins_0
    } == {
        ("git", origin, iso8601.parse_date(last_update))
        for origin, last_update in expected_origins_0.items()
    }

    # Prepare second run
    archive_path = Path(datadir, "fake-julia-registry-repository_1.tar.gz")
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)

    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    # The new lister instance starts with the commit recorded by the first run
    assert lister.state.last_seen_commit == expected_last_seen_commit

    with porcelain.open_repo_closing(lister.REPO_PATH) as repo:
        new_expected_last_seen_commit = repo.head().decode("ascii")

    # The second archive must have a different HEAD than the first one
    assert expected_last_seen_commit != new_expected_last_seen_commit

    # Second run
    res = lister.run()
    assert lister.state.last_seen_commit == new_expected_last_seen_commit
    assert res.pages == 1
    # One new package, one new version
    assert res.origins == len(expected_origins_1)

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    # Origins of both runs, an origin present in both being counted once
    expected_origins = {**expected_origins_0, **expected_origins_1}
    assert len(scheduler_origins) == len(expected_origins)
|
||||
|
||||
|
||||
def test_julia_lister_incremental_no_changes(datadir, tmp_path, swh_scheduler):
    """Run the lister twice against the same state of the registry.

    Both runs use the ``_0`` archive: the second run must keep the same
    ``last_seen_commit`` and list no origin.
    """
    archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    # Latest Git commit hash expected
    with porcelain.open_repo_closing(lister.REPO_PATH) as r:
        expected_last_seen_commit = r.head().decode("ascii")

    assert expected_last_seen_commit is not None
    # No state has been recorded before the first run
    assert lister.state.last_seen_commit is None

    # First run
    res = lister.run()
    assert res.pages == 1
    assert res.origins == len(expected_origins_0)
    assert expected_last_seen_commit is not None
    assert lister.state.last_seen_commit == expected_last_seen_commit

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == len(expected_origins_0)

    # Prepare second run, repository state is the same as the one of the first run
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
    # NOTE(review): unlike the first run, REPO_PATH and REGISTRY_PATH are not
    # overridden on this instance -- confirm this is intended.
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    # Second run
    res = lister.run()
    assert lister.state.last_seen_commit == expected_last_seen_commit
    assert res.pages == 1
    # Nothing new
    assert res.origins == 0
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue