From 8ff418fbc28b2bf7c3158a261b95146e3dd4d9c4 Mon Sep 17 00:00:00 2001 From: Franck Bret Date: Wed, 21 Sep 2022 14:44:56 +0200 Subject: [PATCH] Conda: List origins for Anaconda, the package manager that provides tooling for datascience Related T4547 --- setup.py | 1 + swh/lister/conda/__init__.py | 124 ++++++++++++++++++ swh/lister/conda/lister.py | 118 +++++++++++++++++ swh/lister/conda/tasks.py | 19 +++ swh/lister/conda/tests/__init__.py | 0 .../conda-forge_linux-64_repodata.json.bz2 | Bin 0 -> 955 bytes .../pkgs_free_linux-64_repodata.json.bz2 | Bin 0 -> 634 bytes .../pkgs_free_osx-64_repodata.json.bz2 | Bin 0 -> 513 bytes .../pkgs_free_win-64_repodata.json.bz2 | Bin 0 -> 1529 bytes .../pkgs_main_linux-64_repodata.json.bz2 | Bin 0 -> 1445 bytes .../pkgs_pro_linux-64_repodata.json.bz2 | Bin 0 -> 770 bytes swh/lister/conda/tests/test_lister.py | 94 +++++++++++++ swh/lister/conda/tests/test_tasks.py | 31 +++++ 13 files changed, 387 insertions(+) create mode 100644 swh/lister/conda/__init__.py create mode 100644 swh/lister/conda/lister.py create mode 100644 swh/lister/conda/tasks.py create mode 100644 swh/lister/conda/tests/__init__.py create mode 100644 swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 create mode 100644 swh/lister/conda/tests/test_lister.py create mode 100644 swh/lister/conda/tests/test_tasks.py diff --git a/setup.py b/setup.py index 8d3d7dd..dfd7f3d 100755 --- a/setup.py +++ 
b/setup.py @@ -60,6 +60,7 @@ setup( lister.bitbucket=swh.lister.bitbucket:register lister.bower=swh.lister.bower:register lister.cgit=swh.lister.cgit:register + lister.conda=swh.lister.conda:register lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register diff --git a/swh/lister/conda/__init__.py b/swh/lister/conda/__init__.py new file mode 100644 index 0000000..3cc6dd0 --- /dev/null +++ b/swh/lister/conda/__init__.py @@ -0,0 +1,124 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Conda lister +============ + +Anaconda is a package manager that provides tooling for data science. + +The Conda lister lists `packages`_ from Anaconda `repositories`_. +Those repositories host packages for several languages (Python, R), operating systems +and architectures. +Packages are grouped within free or commercial `channels`_. + +To instantiate a conda lister we need to give some `channel` and `archs` arguments:: + + lister = CondaLister( + scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"] + ) + +The default `url` value of lister is `https://repo.anaconda.com/pkgs`. One can set another +repository url, for example:: + + lister = CondaLister( + scheduler=swh_scheduler, + url="https://conda.anaconda.org", + channel="conda-forge", + archs=["linux-64"], + ) + +Origins retrieving strategy +--------------------------- + +Each channel provides several `repodata.json`_ files that list available packages +and related versions. + +Given a channel and a list of systems and architectures the lister downloads and parses +the corresponding repodata.json. + +We use bz2 compressed version of repodata.json. See for example `main/linux-64`_ page +to view available repodata files. 
+ +Page listing +------------ + +The lister returns one page per channel / architecture that lists all available package +versions. + +Origins from page +----------------- + +Origin urls are built following this pattern `https://anaconda.org/{channel}/{pkgname}`. +Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that lists +artifact metadata for each archived package version. + +Origin data example for one origin with two related versions:: + + { + "url": "https://anaconda.org/conda-forge/lifetimes", + "artifacts": { + "linux-64/0.11.1-py36h9f0ad1d_1": { + "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950 + "date": "2020-07-06T12:19:36.425000+00:00", + "version": "0.11.1", + "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", + "checksums": { + "md5": "faa398f7ba0d60ce44aa6eeded490cee", + "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950 + }, + }, + "linux-64/0.11.1-py36hc560c46_1": { + "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950 + "date": "2020-07-06T12:19:37.032000+00:00", + "version": "0.11.1", + "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2", + "checksums": { + "md5": "c53a689a4c5948e84211bdfc23e3fe68", + "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950 + }, + }, + }, + } + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker compose up -d + +Then schedule a conda listing task:: + + docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950 + +You can follow lister execution by displaying logs of 
swh-lister service:: + + docker compose logs -f swh-lister + +.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/ +.. _Anaconda: https://anaconda.com/ +.. _repositories: https://repo.anaconda.com/pkgs/ +.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/ +.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/ +.. _repodata.json: https://repo.anaconda.com/pkgs/free/linux-64/repodata.json +""" + + +def register(): + from .lister import CondaLister + + return { + "lister": CondaLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py new file mode 100644 index 0000000..cf91e3c --- /dev/null +++ b/swh/lister/conda/lister.py @@ -0,0 +1,118 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import bz2 +from collections import defaultdict +import datetime +import json +import logging +from typing import Any, Dict, Iterator, List, Optional, Tuple + +import iso8601 + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. 
+CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]] + + +class CondaLister(StatelessLister[CondaListerPage]): + """List Conda (anaconda.com) origins.""" + + LISTER_NAME = "conda" + VISIT_TYPE = "conda" + INSTANCE = "conda" + BASE_REPO_URL = "https://repo.anaconda.com/pkgs" + REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2" + ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}" + ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}" + + def __init__( + self, + scheduler: SchedulerInterface, + credentials: Optional[CredentialsType] = None, + url: str = BASE_REPO_URL, + channel: str = "", + archs: List = [], + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + instance=self.INSTANCE, + url=url, + ) + self.channel: str = channel + self.archs: List[str] = archs + self.packages: Dict[str, Any] = defaultdict(dict) + self.package_dates: Dict[str, Any] = defaultdict(list) + + def get_pages(self) -> Iterator[CondaListerPage]: + """Yield an iterator which returns 'page'""" + + for arch in self.archs: + repodata_url = self.REPO_URL_PATTERN.format( + url=self.url, channel=self.channel, arch=arch + ) + response = self.http_request(url=repodata_url) + packages = json.loads(bz2.decompress(response.content))["packages"] + yield (arch, packages) + + def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]: + """Iterate on all pages and yield ListedOrigin instances.""" + assert self.lister_obj.id is not None + arch, packages = page + + for filename, package_metadata in packages.items(): + artifact = { + "filename": filename, + "url": self.ARCHIVE_URL_PATTERN.format( + url=self.url, + channel=self.channel, + filename=filename, + arch=arch, + ), + "version": package_metadata["version"], + "checksums": {}, + } + + for checksum in ("md5", "sha256"): + if checksum in package_metadata: + artifact["checksums"][checksum] = package_metadata[checksum] + + version_key = ( + 
f"{arch}/{package_metadata['version']}-{package_metadata['build']}" + ) + self.packages[package_metadata["name"]][version_key] = artifact + + package_date = None + if "timestamp" in package_metadata: + package_date = datetime.datetime.fromtimestamp( + package_metadata["timestamp"] / 1e3, datetime.timezone.utc + ) + elif "date" in package_metadata: + package_date = iso8601.parse_date(package_metadata["date"]) + + last_update = None + if package_date: + artifact["date"] = package_date.isoformat() + self.package_dates[package_metadata["name"]].append(package_date) + last_update = max(self.package_dates[package_metadata["name"]]) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=self.ORIGIN_URL_PATTERN.format( + channel=self.channel, pkgname=package_metadata["name"] + ), + last_update=last_update, + extra_loader_arguments={ + "artifacts": self.packages[package_metadata["name"]], + }, + ) diff --git a/swh/lister/conda/tasks.py b/swh/lister/conda/tasks.py new file mode 100644 index 0000000..667a998 --- /dev/null +++ b/swh/lister/conda/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.lister.conda.lister import CondaLister + + +@shared_task(name=__name__ + ".CondaListerTask") +def list_conda(**lister_args): + """Lister task for Anaconda registry""" + return CondaLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/conda/tests/__init__.py b/swh/lister/conda/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 
b/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..253d20094c0d8e30c8afbe89ee5bdde64206e85a GIT binary patch literal 955 zcmV;s14R5nT4*^jL0KkKSrqJ$DgXqqUw~8)P#1sq03rkmzwh7rPy^oW6zqn;iPJkUj*kmOJ2p{F~5)$Mho4_pw5r%`V zm+r^Bh2_Psg+>eSc1PwlbuRqWE;r!_H)1;95As7!uomf~unN8Ge z4}>=vWouc*&{U(CYjauV-M-N0>Un4$?NRO;vn<%L84C@HAXN-(LNEwKu+%~vOTRPd zo8K17<+s^QR|*ndU6>8i0jhB30)HhW@uMar7{n5Egu(@=at-xJ(NHgd79igEXR0j3 zkR;$b^O@t=9i)*X8xZV2C94}(fg$C$H&&dhQR&&7zlMSHG0e7b%ZehTXht$60KjPm z6lUfDOUjSQAk{73VXYbs883r^7i!Z)Av_}=b@LIsV;(MZ^8+hz$z4*>uvpmpbhdeV zFtB1;u-SVaPhi8ET~ZC5w@vF8A;qHMRtsqLlyeMN=`CVzw2l(XIg?(&JY+SUSACU9 zq0Kv8UuQMq7h)Y&t)BaUq+q6l6{R%+FIT`Nsf-Jw*pN1WG6OKJoMVP=yzedd8d|Us z)+~nE+XbRWM6A|#qKC*vPL9bFlWC^ZLUgn`iqT9qn4W#IM)ZW0MMIxsudiWmcpSlWdLsn1=x@r5$ShLu`#im?RY zjA8M@*hSD)*9LXy0|9|sO@LLH5%w}4W5wzFq(ZsI!* zr)M)vK8(dRZ$OXM4zWp dNr1wqL2$U2f*mHjYcO+AV2?0*Y0-#=9sf`zwiJ0Kmwe*cSCJAK_sS` zO{#e`(W&Zek>xZr(?*&!85sjWnvF7QV3?o)0000IDrx`$pa1{>004?4Mx#wM(@e<# z(?HM~0A#2HHdsUj^punmQ=uiKZK?{uQQQR!Q6>n0PG44F*{XonXSXqSK+RFIQmBCo zbetIMd~m`GZd!UXRD{UMnG7&Nq&{O5d%X}tN0fJQqIj^pLK_g+n8Y}mF`@``iAofP z8cgNNX`XYOAvn?##UKLeXSk`ofoZo*(Na$lu=m*<+6crhv@oFvFhJ@J0GSO6L~8X! 
zEJYgYHH^&6&_?Gw5+NbHb$jI{cKmlVwp|N`Rn?>CCIZvNL5amFN+H!~kMRW*VOSEp zhHkN(_XC&{i(n+9HyF48w{1Uv|Vp^qkyarVuZ5iP03e-Kw2Y=wMpH6M@|ES7z^ZFNNWvq?W`uW z(wtk^rU)iXc$0q>oP2n3TTuO9sLMJbrWZf2C0E3&mgGz$wH?Kn>2!3z>=g09pb-cj zw;U)!jT(q41j2UmK!RpLb0Mh-BVsfvYo2PIOkbbbMH34W9YUoH9MQ@+OxVllzTJ|| zWf~_51#qwtZP-dbIy$Koh>taG3jq^>s0p)uDoV#A?4=S?^-?p$kZ9#e9)_3fQqd8) zjBU1xiUBQwITf5_KMHI}u7_zXX@GMPV9IVL0lhnY66-N&68h0C8%aX3<1of`Y)mP6 U?_b}DnF#+Eaz!{$ke^jZMLT2;CIA2c literal 0 HcmV?d00001 diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..e096fceea1e86b2639610f66778d0cca5b8dfac9 GIT binary patch literal 513 zcmV+c0{;C%T4*^jL0KkKSvurIiU0zDUw~8)P#1sm00h7qzwiHgKmq(1*ado&^h}JH zX*Aj<3502r15*K_2{ug`1obok02%-WG&D2-WHbN(003Yf=!WF|yCN2v80 z4LqetNbQ+6s6haTH;^el^))mIO2i1*1T@+fLpq(g$t5+83kj`@w2Wlte-0H+irtiI z?*x<2Sqdv8N432Uvm`V6e%}(*Zr+gzq%s-6kmTT$4M{iYbKJKkU^D78jQ+HctG^{= z)ikWO*6YPqQW0&;qJ@E+Aob{Ny!q?-O!ysh?n=R+Jx_20o#vR5g0SIt;+=Gp2*agm zzD3!6k)>r~UuY@dp(`r_0s-oTT}VMn)r2`|tR+E5ddZ_(*aT3Sn9yYRNlVc?6>Ugr z=GcZs6-0$4g`|qB`O(&s6yz|OwdT9 zGn$+VjMP66&{&n})}Wy=KIh!=rQV34B1!C9N3cq>Q86xJ_x(78|KjdQrwS4WT!@iC DkSOYm literal 0 HcmV?d00001 diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..868512bb72152573dd5592b6c76a34b733644c9f GIT binary patch literal 1529 zcmVxD5SNSg#hn2?BR4T49%!4N1uAFJV&b2yh)k>*#M;(2rP8O4u3RZCgfwZ78 z8yaJ5Vq$?YQURsiuY|>*sH%zcLxpccIVAX3i3*g5l%y=O(n(8RqN3fPBlE8$!7@EVewlSs|gtJ`hOz!H{asj*+je7$FJaX_vkyTJQ_`ZbbwF-$N zfSmainLzjhV3@fRJ3mkZi;y7j_o?y_1OXfjNEoD_GJE+aXBdl+4YR-n`j_+!S&NOx z4PFDHtiT;lj<9HnyI|5BYwkl%9I)%0a{$GAmE7wINVz_TjIkyR%*$d8p3+{AuFP>LmpI?>v}Y{{7Ia|A@FVt%|17d 
zQ+2vg&*7&?gMHYUz)%^2fVS36MZ|rEY?&rUG|CsRKNA}w2q2*61X8LD`@5a2VN=0w z%I%E$H6h1uC9vJcEuDyR%a-01mG*7wzPz^~){(uQ9I!)IPI&X)o#=LY+*~^M6hJg+ zqi}+R7YIHA4?@$Y$+|M;-CNvW5tkhzZ!+b0ywMHtIH7+;)P)~ZGJK7~SiHPlWQ$kJ z%v|J&wrN1NKDCs5JnjyYVQl{3oxD8OG`|FtRG~9^$<6XwNjou*X%6_F{i^EJd$O_` zIdvt-?_ssY(N)k*4^n@yDw)`oR0;G3jexgAJW{jSR?Qb~+%XU!u~8rgIfq@o)PJyt5i!` fL3Kzq#^g1jl>Es5zq+-cul!xf6yZWbY9jVve2dXD literal 0 HcmV?d00001 diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..42cb71a5732d4fa7eb8fb89e49dfeafadb257a60 GIT binary patch literal 1445 zcmV;W1zP$-T4*^jL0KkKS=r7C-2e#G-+)vQPzQhaJSNNm*7$i|S`Oi28yNe8!Q znxy6y7D$T=tg4k1Me1GlbL&Y7Gl3nlLe&bXDNugYPq{~v==sU;lMu_g$gK3)+;aQo{GW zt#2P0D0B~qvO}k34YX>a2cu~59iBx2BuW;>;jN+hJ9SRFg7_fb@NUU5(rR^1_VyDL zs-;<`o9%1mt7|UC3c^-3GM4OE#Iq`?uA;89!utk`ev{|nK*)iDqX+>EpMYr!0F zM#RourwZ9if`xk{X{))X+zqC;Kp@nFVz}jg;mOWgDrjJ|{Y+Y~spl71Y)b3M>YHai33rq#}RZ&=!Z%c2+|16ovc2Xo=XL=>t`|scDI}i*g#)2X9WaY?$RGraYoLo0 zDwL^bMz#@-=7Kl?A!x|~zY|S@tPv{r&vp)3zM$s&7St&gWGWQU0YfBHu?HnBcZ?RJ zSB3;O%z*Gfpx|ysw#iNt5JsHX5E-7&OA#uaW@;tF0Pm{>L$a{r!_q3he-4?vP@Z#W@ zcaAjv6ab^06zx&eJ4gZ7*h~V_Z$=mFfHv*^Ye{&e$Gy&3+biDF{mk^4fpCn0u|*&; z?KHaN*86j3Dy}(YfucxSiG?`4D|dscjg=P$v|*b8CzfCd$~m%P0q0WzBf|2e_%Je~ z@2SF0c1I}2vkM}zv4(`_lN$}n_4^3ir&CvD%wYUPMIt>D&B2-nG+eg>CNZ(mdodPt zmhsOqTlE7t(;YHfX3I*<0MZTuzm78-l%%B_ax2IU3g+YuOTg{!PNo-VFb)IY1QGEp z-t2W+N!7jl-GRi0q;0ut+l)lxo1W|8{;o2phQ=M(zUxm6!0#d=VgvG+8<&nOK!tmO zO9@k=?~f+l4x_Wpnl{zA`YWHM@xbaP*V{g9tQIWTh7T*)EI8^XI)T^-y<}M3jLg9f z;v4`pChuPn<0UC2+C72AH4PfQjG^A^Q*=F0$*C_R(ZODP12mIba+2wwz`K}WQf#5A zfZEnQ>%&?yoic7+CKz`NmWib~Z%upLq}2;Z>h{WRZFWP=EmLSsePY*`r)o`*$`vHkGig0|hUt@d;Z3SGu={Y6Cc){l#OHLb z(>&qm4AGM$$Szui=pf8y>)rwS4qIl)_?K*y#B literal 0 HcmV?d00001 diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 
b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..94bc5403f75662a12f41a6a33246737af379be86 GIT binary patch literal 770 zcmV+d1O5C$T4*^jL0KkKS++z?MgRoDUw~8)P#1sj6e3Itzwh7rPy=35rro5jpb0_} zc>-?8(;F@!3@z@*M`cUkgex zS^MPNc^DL@o#|w~Ju96xCuMg#gtZk(N=qzIh%ksafe2=JA;3f(h&YCZ+m_oCo0-`= zRHWYBQpw~|N{G#lj4h)wO&j9&))sr+Oj{YcRo6zl_%*M|QBGzI3#odkN(hMdP~pkz z^pG<`3_a-%pB!vJ-mVsmNz86jp`kSKNi4xL6{o065MLaoCX7iEArYYxvyrhFz@peD z!Vqbq*yv(*SduuAk*?Wqf(48YIF>_++6$l`F9^(hG?rRfxm4=rk#Kd-Lsz9Ae=A0om z(~ec`*JOV932Epij54m3S2Av|h?m4f;}e$q$In@ocYX=m;&D()vAt#` zo4RkES=J_b2K7~2m`dNQObu|OT6dbn