Conda: List origins for Anaconda, the package manager that provides tooling for datascience
Related T4547
This commit is contained in:
parent
fd1a4244a0
commit
8ff418fbc2
13 changed files with 387 additions and 0 deletions
1
setup.py
1
setup.py
|
@ -60,6 +60,7 @@ setup(
|
|||
lister.bitbucket=swh.lister.bitbucket:register
|
||||
lister.bower=swh.lister.bower:register
|
||||
lister.cgit=swh.lister.cgit:register
|
||||
lister.conda=swh.lister.conda:register
|
||||
lister.cran=swh.lister.cran:register
|
||||
lister.crates=swh.lister.crates:register
|
||||
lister.debian=swh.lister.debian:register
|
||||
|
|
124
swh/lister/conda/__init__.py
Normal file
124
swh/lister/conda/__init__.py
Normal file
|
@ -0,0 +1,124 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
"""
|
||||
Conda lister
|
||||
============
|
||||
|
||||
Anaconda is a package manager that provides tooling for datascience.
|
||||
|
||||
The Conda lister lists `packages`_ from Anaconda `repositories`_.
|
||||
Those repositories host packages for several languages (Python, R), operating systems
|
||||
and architectures.
|
||||
Packages are grouped within free or commercial `channels`_.
|
||||
|
||||
To instantiate a conda lister we need to give some `channel` and `archs` arguments::
|
||||
|
||||
lister = CondaLister(
|
||||
scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
|
||||
)
|
||||
|
||||
The default `url` value of the lister is `https://repo.anaconda.com/pkgs`. One can set another
|
||||
repository url, for example::
|
||||
|
||||
lister = CondaLister(
|
||||
scheduler=swh_scheduler,
|
||||
url="https://conda.anaconda.org",
|
||||
channel="conda-forge",
|
||||
archs=["linux-64"],
|
||||
)
|
||||
|
||||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
Each channel provides several `repodata.json`_ files that list available packages
|
||||
and related versions.
|
||||
|
||||
Given a channel and a list of systems and architectures, the lister downloads and parses
|
||||
the corresponding repodata.json files.
|
||||
|
||||
We use the bz2-compressed version of repodata.json. See for example the `main/linux-64`_ page
|
||||
to view available repodata files.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
||||
The lister returns one page per channel / architecture that lists all available package
|
||||
versions.
|
||||
|
||||
Origins from page
|
||||
-----------------
|
||||
|
||||
Origin URLs are built following this pattern: `https://anaconda.org/{channel}/{pkgname}`.
|
||||
Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that lists
|
||||
artifact metadata for each archived package version.
|
||||
|
||||
Origin data example for one origin with two related versions::
|
||||
|
||||
{
|
||||
"url": "https://anaconda.org/conda-forge/lifetimes",
|
||||
"artifacts": {
|
||||
"linux-64/0.11.1-py36h9f0ad1d_1": {
|
||||
"url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
|
||||
"date": "2020-07-06T12:19:36.425000+00:00",
|
||||
"version": "0.11.1",
|
||||
"filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
|
||||
"checksums": {
|
||||
"md5": "faa398f7ba0d60ce44aa6eeded490cee",
|
||||
"sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
|
||||
},
|
||||
},
|
||||
"linux-64/0.11.1-py36hc560c46_1": {
|
||||
"url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
|
||||
"date": "2020-07-06T12:19:37.032000+00:00",
|
||||
"version": "0.11.1",
|
||||
"filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
|
||||
"checksums": {
|
||||
"md5": "c53a689a4c5948e84211bdfc23e3fe68",
|
||||
"sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
||||
Activate the virtualenv and run from within swh-lister directory::
|
||||
|
||||
pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests
|
||||
|
||||
Testing with Docker
|
||||
-------------------
|
||||
|
||||
Change directory to swh/docker then launch the docker environment::
|
||||
|
||||
docker compose up -d
|
||||
|
||||
Then schedule a conda listing task::
|
||||
|
||||
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950
|
||||
|
||||
You can follow lister execution by displaying logs of swh-lister service::
|
||||
|
||||
docker compose logs -f swh-lister
|
||||
|
||||
.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/
|
||||
.. _Anaconda: https://anaconda.com/
|
||||
.. _repositories: https://repo.anaconda.com/pkgs/
|
||||
.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/
|
||||
.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/
|
||||
.. _repodata.json: https://repo.anaconda.com/pkgs/free/linux-64/repodata.json
|
||||
"""
|
||||
|
||||
|
||||
def register():
    """Describe the Conda lister plugin.

    Returns:
        A dict with the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list naming this package's
        ``tasks`` module.
    """
    # Imported lazily so that importing the package does not pull the lister in.
    from .lister import CondaLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": CondaLister, "task_modules": [tasks_module]}
|
118
swh/lister/conda/lister.py
Normal file
118
swh/lister/conda/lister.py
Normal file
|
@ -0,0 +1,118 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import bz2
|
||||
from collections import defaultdict
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
||||
|
||||
import iso8601
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type of one page yielded by the lister's `get_pages` method: (arch, packages mapping).
|
||||
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]
|
||||
|
||||
|
||||
class CondaLister(StatelessLister[CondaListerPage]):
    """List Conda (anaconda.com) origins.

    One page is produced per architecture of the configured channel. Every
    package file found in a page contributes one artifact to the origin of the
    package it belongs to.
    """

    LISTER_NAME = "conda"
    VISIT_TYPE = "conda"
    INSTANCE = "conda"
    BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
    REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
    ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
    ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: str = BASE_REPO_URL,
        channel: str = "",
        archs: Optional[List[str]] = None,
    ):
        """Set up the lister.

        Args:
            scheduler: scheduler instance passed on to the base lister.
            credentials: optional credentials passed on to the base lister.
            url: base url of the repository hosting the channels.
            channel: name of the channel to list.
            archs: architectures (e.g. ``"linux-64"``) to list within the
                channel; nothing is listed when omitted or empty.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url,
        )
        self.channel: str = channel
        # Copy the argument: a mutable default (or the caller's own list) must
        # not be shared between lister instances.
        self.archs: List[str] = list(archs) if archs is not None else []
        # Artifacts collected so far, keyed by package name then version key.
        self.packages: Dict[str, Any] = defaultdict(dict)
        # Dates of the artifacts collected so far, keyed by package name.
        self.package_dates: Dict[str, Any] = defaultdict(list)

    def get_pages(self) -> Iterator[CondaListerPage]:
        """Yield one ``(arch, packages)`` page per configured architecture.

        ``packages`` is the ``"packages"`` mapping of the bz2-compressed
        ``repodata.json`` file fetched for that architecture.
        """
        for arch in self.archs:
            repodata_url = self.REPO_URL_PATTERN.format(
                url=self.url, channel=self.channel, arch=arch
            )
            response = self.http_request(url=repodata_url)
            repodata = json.loads(bz2.decompress(response.content))
            yield (arch, repodata["packages"])

    @staticmethod
    def _package_date(
        package_metadata: Dict[str, Any]
    ) -> Optional[datetime.datetime]:
        """Return the date of a package file, or None when it has none."""
        if "timestamp" in package_metadata:
            # The value is divided by 1000 to get the seconds expected by
            # ``fromtimestamp``, i.e. it is treated as milliseconds.
            return datetime.datetime.fromtimestamp(
                package_metadata["timestamp"] / 1e3, datetime.timezone.utc
            )
        if "date" in package_metadata:
            return iso8601.parse_date(package_metadata["date"])
        return None

    def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for every package file of the given page.

        Artifacts are accumulated per package name in ``self.packages``, so an
        origin url can be yielded several times, each time with the artifacts
        gathered so far for that package.
        """
        assert self.lister_obj.id is not None
        arch, packages = page

        for filename, package_metadata in packages.items():
            name = package_metadata["name"]
            version = package_metadata["version"]

            artifact = {
                "filename": filename,
                "url": self.ARCHIVE_URL_PATTERN.format(
                    url=self.url,
                    channel=self.channel,
                    filename=filename,
                    arch=arch,
                ),
                "version": version,
                # Only the checksums present in the metadata are recorded.
                "checksums": {
                    algo: package_metadata[algo]
                    for algo in ("md5", "sha256")
                    if algo in package_metadata
                },
            }

            version_key = f"{arch}/{version}-{package_metadata['build']}"
            self.packages[name][version_key] = artifact

            last_update = None
            package_date = self._package_date(package_metadata)
            if package_date:
                artifact["date"] = package_date.isoformat()
                self.package_dates[name].append(package_date)
                # Most recent date seen so far for this package.
                last_update = max(self.package_dates[name])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=self.ORIGIN_URL_PATTERN.format(
                    channel=self.channel, pkgname=name
                ),
                last_update=last_update,
                extra_loader_arguments={
                    "artifacts": self.packages[name],
                },
            )
|
19
swh/lister/conda/tasks.py
Normal file
19
swh/lister/conda/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from swh.lister.conda.lister import CondaLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".CondaListerTask")
def list_conda(**lister_args):
    """Lister task for Anaconda registry

    The keyword arguments are forwarded unchanged to
    ``CondaLister.from_configfile``; the value returned is the ``dict()`` form
    of what the lister's ``run()`` returns.
    """
    lister = CondaLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always answers "OK"."""
    reply = "OK"
    return reply
|
0
swh/lister/conda/tests/__init__.py
Normal file
0
swh/lister/conda/tests/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
94
swh/lister/conda/tests/test_lister.py
Normal file
94
swh/lister/conda/tests/test_lister.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.conda.lister import CondaLister
|
||||
|
||||
|
||||
def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
    """Listing the "free" channel on three architectures gives 3 pages, 14 origins."""
    archs = ["linux-64", "osx-64", "win-64"]
    lister = CondaLister(scheduler=swh_scheduler, channel="free", archs=archs)

    stats = lister.run()

    assert stats.pages == 3
    assert stats.origins == 14
|
||||
|
||||
|
||||
def test_conda_lister_conda_forge_channel(
    datadir, requests_mock_datadir, swh_scheduler
):
    """Listing conda-forge/linux-64 records the expected origins and artifacts."""
    lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )
    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    expected_origins = [
        {
            "url": "https://anaconda.org/conda-forge/21cmfast",
            "artifacts": {
                "linux-64/3.0.2-py36h1af98f8_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",  # noqa: B950
                    "date": "2020-11-11T16:04:49.658000+00:00",
                    "version": "3.0.2",
                    "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
                    "checksums": {
                        "md5": "d65ab674acf3b7294ebacaec05fc5b54",
                        "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2",  # noqa: B950
                    },
                }
            },
        },
        {
            "url": "https://anaconda.org/conda-forge/lifetimes",
            "artifacts": {
                "linux-64/0.11.1-py36h9f0ad1d_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:36.425000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
                    "checksums": {
                        "md5": "faa398f7ba0d60ce44aa6eeded490cee",
                        "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03",  # noqa: B950
                    },
                },
                "linux-64/0.11.1-py36hc560c46_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:37.032000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
                    "checksums": {
                        "md5": "c53a689a4c5948e84211bdfc23e3fe68",
                        "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d",  # noqa: B950
                    },
                },
            },
        },
    ]

    assert len(scheduler_origins) == len(expected_origins)

    # Compare both sides ordered by url so that listing order does not matter.
    recorded = [
        (origin.visit_type, origin.url, origin.extra_loader_arguments["artifacts"])
        for origin in sorted(scheduler_origins, key=lambda origin: origin.url)
    ]
    wanted = [
        ("conda", entry["url"], entry["artifacts"])
        for entry in sorted(expected_origins, key=lambda entry: entry["url"])
    ]
    assert recorded == wanted
|
31
swh/lister/conda/tests/test_tasks.py
Normal file
31
swh/lister/conda/tests/test_tasks.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_conda_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The conda ping task can be sent, completes, and answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.ping")
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"
|
||||
|
||||
|
||||
def test_conda_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task builds the lister from its config file and returns its stats."""
    expected_stats = ListerStats(pages=42, origins=42)

    # Replace CondaLister in the tasks module by a mock that returns itself
    # from `from_configfile` and canned stats from `run`.
    mocked_lister = mocker.patch("swh.lister.conda.tasks.CondaLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.conda.tasks.CondaListerTask"
    )
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue