From a6f796b26867a078f44825a82ca71942f12eed7b Mon Sep 17 00:00:00 2001 From: Franck Bret Date: Fri, 8 Jul 2022 12:46:11 +0200 Subject: [PATCH] crates.lister: Implement incremental mode: Add incremental mode support based on a 'last_commit' state, used to get new package versions from git diff range of commits. --- mypy.ini | 2 + requirements.txt | 1 + swh/lister/crates/lister.py | 173 ++++++++++++++---- .../tests/data/fake-crates-repository.tar.gz | Bin 4467 -> 9134 bytes .../tests/data/fake_crates_repository_init.sh | 47 ++++- swh/lister/crates/tests/test_lister.py | 122 +++++++++++- 6 files changed, 296 insertions(+), 49 deletions(-) diff --git a/mypy.ini b/mypy.ini index eb2343b..51c1c65 100644 --- a/mypy.ini +++ b/mypy.ini @@ -43,3 +43,5 @@ ignore_missing_imports = True [mypy-xmltodict.*] ignore_missing_imports = True +[mypy-dulwich.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index ea5ee0f..5021815 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ launchpadlib tenacity >= 6.2 xmltodict lxml +dulwich diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index 63604a1..fbe3003 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -2,20 +2,24 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from dataclasses import asdict, dataclass +import datetime +import io import json import logging from pathlib import Path -import subprocess -from typing import Any, Dict, Iterator, List +import shutil +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse -import iso8601 +from dulwich import porcelain +from dulwich.patch import write_tree_diff +from dulwich.repo import Repo from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import 
CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -23,11 +27,25 @@ logger = logging.getLogger(__name__) CratesListerPage = List[Dict[str, Any]] -class CratesLister(StatelessLister[CratesListerPage]): +@dataclass +class CratesListerState: + """Store lister state for incremental mode operations. + 'last_commit' represents a git commit hash + """ + + last_commit: str = "" + + +class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It basically fetches https://github.com/rust-lang/crates.io-index.git to a - temp directory and then walks through each file to get the crate's info. + temp directory and then walks through each file to get the crate's info on + the first run. + + In incremental mode, it relies on the same Git repository but instead of reading + each file of the repo, it gets the differences through ``git log last_commit..HEAD``. + Resulting output string is parsed to build page entries. 
""" # Part of the lister API, that identifies this lister @@ -55,17 +73,24 @@ class CratesLister(StatelessLister[CratesListerPage]): instance=self.INSTANCE, ) + def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: + if "last_commit" not in d: + d["last_commit"] = "" + return CratesListerState(**d) + + def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: + return asdict(state) + def get_index_repository(self) -> None: """Get crates.io-index repository up to date running git command.""" - - subprocess.check_call( - [ - "git", - "clone", - self.INDEX_REPOSITORY_URL, - self.DESTINATION_PATH, - ] - ) + if self.DESTINATION_PATH.exists(): + porcelain.pull( + self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL + ) + else: + porcelain.clone( + source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH + ) def get_crates_index(self) -> List[Path]: """Build a sorted list of file paths excluding dotted directories and @@ -74,7 +99,6 @@ class CratesLister(StatelessLister[CratesListerPage]): Each file path corresponds to a crate that lists all available versions. 
""" - crates_index = sorted( path for path in self.DESTINATION_PATH.rglob("*") @@ -85,6 +109,51 @@ class CratesLister(StatelessLister[CratesListerPage]): return crates_index + def get_last_commit_hash(self, repository_path: Path) -> str: + """Returns the last commit hash of a git repository""" + assert repository_path.exists() + + repo = Repo(str(repository_path)) + head = repo.head() + last_commit = repo[head] + + return last_commit.id.decode() + + def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: + """Given a file path within a Git repository, returns its last commit + date as iso8601 + """ + repo = Repo(str(self.DESTINATION_PATH)) + # compute relative path otherwise it fails + relative_path = filepath.relative_to(self.DESTINATION_PATH) + walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) + try: + commit = next(iter(walker)).commit + except StopIteration: + logger.error( + "Can not find %s related commits in repository %s", relative_path, repo + ) + return None + else: + last_update = datetime.datetime.fromtimestamp( + commit.author_time, datetime.timezone.utc + ) + return last_update + + def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: + """Transform package version definition dict to a suitable + page entry dict + """ + return dict( + name=entry["name"], + version=entry["vers"], + checksum=entry["cksum"], + yanked=entry["yanked"], + crate_file=self.CRATE_FILE_URL_PATTERN.format( + crate=entry["name"], version=entry["vers"] + ), + ) + def get_pages(self) -> Iterator[CratesListerPage]: """Yield an iterator sorted by name in ascending order of pages. 
@@ -98,34 +167,41 @@ class CratesLister(StatelessLister[CratesListerPage]): """ # Fetch crates.io index repository self.get_index_repository() - # Get a list of all crates files from the index repository - crates_index = self.get_crates_index() - logger.debug("found %s crates in crates_index", len(crates_index)) + if not self.state.last_commit: + # First discovery + # List all crates files from the index repository + crates_index = self.get_crates_index() + else: + # Incremental case + # Get new package version by parsing a range of commits from index repository + repo = Repo(str(self.DESTINATION_PATH)) + head = repo[repo.head()] + last = repo[self.state.last_commit.encode()] + outstream = io.BytesIO() + write_tree_diff(outstream, repo.object_store, last.tree, head.tree) + raw_diff = outstream.getvalue() + crates_index = [] + for line in raw_diff.splitlines(): + if line.startswith(b"+++ b/"): + filepath = line.split(b"+++ b/", 1)[1] + crates_index.append(self.DESTINATION_PATH / filepath.decode()) + crates_index = sorted(crates_index) + + logger.debug("Found %s crates in crates_index", len(crates_index)) + + # Each line of a crate file is a json entry describing released versions + # for a package for crate in crates_index: page = [] - # %cI is for strict iso8601 date formatting - last_update_str = subprocess.check_output( - ["git", "log", "-1", "--pretty=format:%cI", str(crate)], - cwd=self.DESTINATION_PATH, - ) - last_update = iso8601.parse_date(last_update_str.decode().strip()) + last_update = self.get_last_update_by_file(crate) with crate.open("rb") as current_file: for line in current_file: data = json.loads(line) - # pick only the data we need - page.append( - dict( - name=data["name"], - version=data["vers"], - checksum=data["cksum"], - crate_file=self.CRATE_FILE_URL_PATTERN.format( - crate=data["name"], version=data["vers"] - ), - last_update=last_update, - ) - ) + entry = self.page_entry_dict(data) + entry["last_update"] = last_update + page.append(entry) 
yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: @@ -136,6 +212,7 @@ class CratesLister(StatelessLister[CratesListerPage]): url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] + crates_metadata = [] for version in page: filename = urlparse(version["crate_file"]).path.split("/")[-1] @@ -150,6 +227,8 @@ class CratesLister(StatelessLister[CratesListerPage]): "version": version["version"], } artifacts.append(artifact) + data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} + crates_metadata.append(data) yield ListedOrigin( lister_id=self.lister_obj.id, @@ -158,5 +237,23 @@ class CratesLister(StatelessLister[CratesListerPage]): last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, + "crates_metadata": crates_metadata, }, ) + + def finalize(self) -> None: + last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) + if self.state.last_commit == last: + self.updated = False + else: + self.state.last_commit = last + self.updated = True + + logger.debug("Listing crates origin completed with last commit id %s", last) + + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz index 8b384b4a12a9ba15ba133fd50a0ab08b6c1468bf..498b10590d39b5bc909006831ea2f7ee2c739498 100644 GIT binary patch literal 9134 zcmZ{JRan$t)HNtA9g@-w0t$#yL$`#aba$7MLkiN}-Q8V7BV7uTBi$klIRo>1`M>Y; ze0SfzI9KQ5tiAVIYwzdKMPs4-Z#+NE_3 zgU`0e5}2oY!U{19A@@v~inm&7kHmw0E9|KALS2*9dU)Tk z`{m?4o zWGm~yNv~jjugy@}HG?OKvLjJgMa(VjdO8bphlS=-EpWT>dE2<9ea-J*5f^6`A4VUD zCimQ`a3n9>Uo#f(_n$w1urlFhc>QL+=c`&BbkA+)6sm+B7}M{wf*j+eJ-pq|JkNGw z2t|;bdmLEV2q{m!bsBuoYah$D$P+198Tlke3E?F*9+CplXBXQ$bx! 
zK=Z`FjkTpA{1{wSGLc{^X&I|P!-sr3v#3JO8{QfqMNK3m42|Vvl(nVAHjxzh|<|}@g zas2s$f_l_f_0hY2WyZJMc404uB*Mmpmn`iu4209kQYCQ`>JDX4DI9XBq6P~8s9-tE zxk_oW^D_x}3VLbi3(!n=C6HzEEjFMYlo4?$q!KbIu#<9dYEjCogi37-msWAVXfAbi zw3XV=DmqEQ+Oa>D>YF(p6wQ=qp65!cQpb%f#Hk}07a$)NeD(I*^r$4NnUwsuK{SCN z)@;Edi4GFK?ac%;59 z#Y&tlXHcA|6E!jCtqFyrb65wHjvVEcP$;GW zn2X&EpyUCU@Jm=70@Aj(!0SS-N5y~L*6t=@RD-D}E^=Ys$XQ~gU!cXE@u-x$xVO_l z;`Ca0Ed)KK3Nraxl&A+gMG{=F3O5+syFNh8${k~zC%_^H(AoT6!hot=p!FMERnyg` z?c6Yb_CQ7T=8k{54st){0%vtZE6+^HT;WvK(A)} z&V$&QHh1fSuRI=+OKTQ5=u#@|(FmRGbh=&@X1M?Vbpe*F~Xqz6_}Y6bI>6X61r zq`=AvsAlUDZsR`1xQH_`2Sj)Svs8me`Jn;U{a#*HVbeg>$T^Tzw0du%++S8J8IBq! zh_0MMf}zF3@Ej|cSqit1UA@Av`TSgnHN71Snu$~=RL1}af}Ro`_URhjAf0_f#LZL# zMCe~!_z?=WCFV)( zaTP!ibqy3cjsY);B9EG!o!TQx*=XWAZq4!K5$=xMtR;_J^!iAXh5s;xEt(V)7j#)y z=;?f)@%oG|G1?WS&8Zul&!kksuHGuW<7LZ7FdeZFRj55VsO+O_Cg(&9n0z~LZ zGWDz_>eXb~^UBtJDmA=`2^V4Oi=f`zu2;_I5@J=nu{UIEGD!(HyvY?KK-C%Wnjr^? zD~0c=9fCuwu0V=|5ou|Q#f8E0%^8c~IZaoLE0T`g-VNY=Zi)bNy*~%Y!WH{9{-%}g zfNL+V?*@l)9FA-H0;Fz_Uc-Bdf!`r1X`Kwq2kA_1UPKB6q#FyMffXZ zzDYLdu=t#uE>q*yMKeD2{9TXY0UmWXspp3NoaNA{cC)jG{{v4B(HXCu$7iWkNR|9F zuyz8}JUvn(`GeFqC9m)FQ4PVkHBxv)WeltioW;a%EUpv6V1Rm}(l8*=`UoLIGcSu# zjRjG3${E= z94W4Et^#FV&~W+gn2@lFX3kybau^s3_tn^p>6V?U=(DJ;`q=~}hO=3ml$>Q{&DoAm z;yIi;8r@&9ui6fJz}DDeZMOoiReJ3`5Zroj+tR*@la>xT2miHewfsQ7+HZa3_h9&~ zUE<~Dfb1eQuG1goTJke^=RCF@p!MAt38e*^!(8E*x0gxbgM9m(Kego=UY-)E7ff~$ zev`o2LHow@L=Q}s2MEsDLp%$b`CmO~=>*$?)2pufJR@gUXx?-0cBFe;^I$}GcV0}U z4u9Rj@@v8v7ZkIt4E5Rv=M5_r%^zXIJn2KJ5^T4}7=y~tKRT(61Grq0u(dJX> z6?(T#i*RTCK%4ALuR&7@YB0Pn;Jo|hrhu-ctJSQ#Xvt>G66o5wf8DKW6R4@4Kq=Jp z4~BG5)pE>CSQlKqc5Nn}L0Xx5pI0)YjFnA$u)9q(23g;Yb%t5upb23cjiw%e$;OQV zGdz|{NUTV?cQ$Nzkbl~3;34M7XY9b)7J5FHDZjb#U>whhZ{c{IUK}v;E|Iz9vG|85 z?r0oPXEJy+TT%2(LaWHsXcIF10JI?vH=VL_h|bgiPh5?T+5=9x&PnAvZ6#+?tpy>0??d z61qd!ex0%B7wx)5CsUCW7m;ineutf6=Y&-AO(pGp8 zLG~@Q)$c}#fF_`KJwXV=@q=&E_Mu7An0z;@v+5IVFp7nc7XRUQg}*40M9-ed!@S*{ z6Zl*zY))a;Zy8JU*$QVz>G18~D7fm=I*<)|(%KN%1r%}c4UqRY^w>Rpc|ThkARq!W 
znKV-Djd=O70-`5+9KB+o$HK9?Y)|4NH4DV+19THvXFx4|4w877G@F+86SGzsbrUO* zGrcW^`tyVeiC=_W6Bl63TjC2)(+*gh^IToOt#3``No!N;zx3ilOPf`Ov%CTyo5?M)4q;#4ty?$xjsFq1C~67&i%kb3{f!C(3(D<*ToO9t=rFRdzKMM73H zhKkXO0ox}S`(Q1tb-4Qo7i^?dYov^#es59tju-J2lI?$C;QeJ(9`ONt-W(> zs@|$HCPOS$XoHtxcvf75GWbrg=fM+G`qi+5tq!J$^n;q;@7ru(L$6<46tL+!=Vf&} z!Q{utz5Q-wdZ}(LRZ4RSpj!y}>@l`ZvGz3Oe z@NBi~0?Q6ywm$(*AA#45IY>ftvdN?p_mCiZoKNWHxD`_9VGNqx58%fQ z3{t~D$Zz8~AYENz|B6=3SAL&-<97Y}S5G=^{?U8XOPkk!TOeDZ{7bK)xrZtyAKYg{ z1pDSag!-;k=GU$-OT(|%v>%}pzfRD+{cOA^W?q$fJGNSF5w`O)>>c*JioJ~oiC15K zS+Vd7NSt$Dx4FLi{SM|}wd6nWx!C+zEC6RA{%2qHfek_j^wktzw0>2-c!Gve>)VEG zZ1rgc77x;eI@+s;zrX`FI{G+bl#tq2;wY`oOy|JNi1{3nkd&$^cW`Y2gD< zn$d39?*GAlVwOqh*V#Wk>+)B(@4$^qpsKM_&|CnT2>*%7DcsuyLgDnfNb>MbtqBGk zbV{Cqa*#Nq7;{8(UX&b>gxJSFuJ&Yfw1Pe?Vu#2;Chj3D2VjYhZBMPs9)KKcqoli& z>^?h`xtz)ciGe%^-(JnEzayg|8`J3(S|xWR(U4jN#jM`d`}F)DE50$PkjHcAg#Xdf zMd5$HSM>~=RSVx4RjRf624EQjCyG4{P)c#Y=(U5dOYF<=HOz?@FVR_3XHDMndMV~u zSj?^so?e>6|iG zN}%sZ(T>rcF^MKzRJ3uQZ?@X#r?c0li)f!~?|jcq488-fX{F^M~!g@TEkMj`L&_ykgD{k0t7d<2%qrhCqc_mV{fAK|}|A>&lv>-}z%YLmfK5Ux>;+K)Cxi_&(s$=iYWMsCf~9Rn+DmLg*EK}C+UZyfLg1trzf%l5OBx%TCX2r zUl(|+{wp-uf<~V9(4Bsj%p;l57f>T=y)$V%0P8);_?txSkMG3Zrdbx*OA$(uelErd zqoMrCmte-O+YRt@HUh9tWd0dYFBY7^LH%jA7FbUOG51YRGOgzLaFkzABf(2z`(MG{ z<=zh6`%CKxkE}0WoA*w>)W7Pob>C~NcKDbjc+=XMYv2VBT#HGLmKPT_zdP!+9}89H zfv&nu(PvvRcpq3L}oaI($Qoq!5_-6r{~yv;7Rwda7_xu~H3?Uzbv$i=k2&+%?l zQ8Q@PZ|l!dp`ERyqv=NPPQL}#4F$1K(&?I_YD_LNCvg1e1!Q(Y<{ zgl9KtP0rDMvQ#Ij2(Y{l@D>tTbk4j7d(Ym&jyy(FZbc6Q_5(I;@@@n2BQz{3eyxBz zj*uYIy+?2?VF?M5!FI}X3OO1cI+||qt?ALoRYW%%iVW`Q-KdH^b(;I2cTHVo&6SE9 zQQ}SM*^Bgo7HXzIMzwIK??2MRxEE5LQfho#R%~fB+YqO-sKV(lXrps-FqP#MpCvZ^ zL@N5>Q?P-=`HNwVDfN@welA%aCuU1L%RFMD>l=o%M(?k8@?>uApG__pcfNUCSX*YH zotTdC-MSj|o;hgL{&D%S)tRxLq*c(_1RGydzGg{Lt@oOzlWqxVk9EQ9*N;5HM z(}(W^!#ZSZ+Nif=atpI%<56FJ@6M(r9YEii1#Hp0;PS)Vl2IShP!%!ID2cym#DwUj z(CZ}?E27y*rr6sd$I1|(lzJaTJvP!37{eDTl0mW_N?KL~RPFV_=aROs2I{@g=j@Vg zos_2*vnpr-{c2$H0Q{5i_-d9xAlB@?k|fPI1KRY0(jE$FWRQ7ONh%Z%D*;141K6`T 
zfv`i6)kIl!Kvt%KS|td+BQ(1VH&I@}_v>~0LhC7GQsAU|)4vnWc@6OB5oYipq3<#jnU8xqEeTPaPCx%pd<5VO1Disic%D5~v z0my5$FB2GIeq4MFUsj9Hknr+C_lKcX5diz&HB=x(_)(e>d}b~}jqw_nkB%gM6^MJ!0M0Tn$@1K`O;J?=wrj~PM8|MBJvYSvZFcmk7=|Sm zPp!Y>{-k7yHReH7>i2rpJMX(;4#){c9X(qdIUrp$!vZzppHDUtOFf1?U=;oq>F_|_ z?QZK{vXyh=t$kKUCuil16XJgCVm=<8j;U{Q%(+nofBo$|98$Gr5kNkh^RhQ!GyN5B zm>j0xND{$0h=b51wgFW>ofMB_rcL{fe3b^X#Iv+SH`Xb~S#=2Mj>2P$g2u!C;IOV-i z4qwgD9|qfoBhxykU>|+5U-v-EJ^;Hhk*-(-MS2KG5t-u*xCWuHr#&Nj_FMsr=sArH z3Zaba>ssI<@eULOyEhS#O1=ZH6;16V2BNC1)r(%P-(GKrV`rydH5qdeYRw4Z%(pd& z*2Wz_Kx=NcfhTHU9bd)M<4SJ{)V4t1zmVXLea06e!HwIPv4Fs|C6(-anGe7f9`a{_ zI-(Xoz9C<6n4M73*@jI(WWR{Pc!A7^wFm%X=RPu>K?I&m%fU z$JjJ0;YOYp+EQH#n|)pD)y}Yq@vlZo zu`@T-zl3?@JsvLBgz^=fQ{Z0M4X$zJ?TTG#vwBAv*_Gt)%z=w#n4@m(*Om`89+C46 zi}-AQAiXov>a)M_u|i)U|0N6pJAnL0IS+0Ob*$({ikNNgsYd3H#b}|?m7}Zmgv=NU zq>hD;@>{I;;M6ZWXAoAb1&+u`>gCayxy|C4pZ6EP|>p_tkRWROh?>tIM$k17-C$=ZeEe7U8^=t{*z5--LLkljg7F zX9sX|<%UlUayGkzJ2&=R*OzQg8P;li3dxOk>su-~zt|-h8$YN0@R%m*@5e|;{OVND zPOjcd$mp*GB7)^Gea&~@L(}6iINJ-qteJBslJFCobYq)dU%*1zJ6>UZ8P=SS6VlPs zezD8}pESGi1k!qqseggLe%>dTr7k)vZW~6=!#EMvP|cY1A>nJUX$DVhEV?7G1Bx6O z+cMhnRnL0U9#{vFsg^&imXkIJl4)X_tCx9h#08+7ZvNL9(|b(y!c$4L!r^MzLMtnk z%H30ssM;XvEo~?1Chm!rTBGX<8X#!t0o;eI{-$0IPQswXv9=46*1%$UgSCsLALB}c zlE@e3iXv-f`2q$3mmo=^E^kbj1U3Mu%Q*45?rju=)_eIT!Dhq3V5-LZFaJPlz^i>b z1D(93Cu1CIIRIeyP=U}V$)z9ICX5K-lTUsh?v_Hqtnkeo^<}7Sn5MnThvyZ2W6(%| z>4|W-tFZt9%=+Jyh@3MmM}6Su^*QGX9=&CLGQDymX(GJ(HcEa;S233V@OCi_6j9>HQjxjHE)s-W>H`-6OEB~2Yd-#b( zWHN@H$2>FE)>Eta`|7C8^n9Z0Doj1sl97J@*m38_54vV> zBz2&IVTv;$)n_q{sb z@*t8a544k7_@z}jLDta9$Ivr0!fNPg*0#>P@I7FA1&+#Q2F2ZMkmNH@cYO8_$e;X~ zIYU08+eSkp7;PApo2lRArCJzzrVJHUtUTXbu#(O7gZPr@8aeQt9EE^4eQKBFydKpg z{E#5X6YeKF82WC9^GuQ$ePEP6Aw%aFU*dC`ED@CjgHoknM>qnItgu6ZV{QS#!H7G8 zr&ZGg)Yr-i!&mt9%;YaG&CdO4?d0D{P=~iCLGpcb9)3#TOxf=>O}BavafFE9Dc1UL zb?wAfwk3JK{t~syHoC|I-v3I}`rzV^q{Ayz1q22XDU=y$l(x(hh%DZJm&6Sv#!?DG z3vXnfT6ld0fZC8R{#~00N-bWS6la;%^rlvt!TH@4v|NRJWQq1&SUX%c3f0$|E?dN_ 
z9zka#z?-N4tS7_BFLNy|CLk7gUTB`tuR%ea9F`G|ofgayLT`)9s1HH%V!`g|1VX%* z!Sw_kE@McmgnKr~>EOLJJ->#@?*{%>ZL_j)5wldMcwG$dbrKzQK~B zNMLabL$ggYmqdk3NDR0KqpFDH;m&jD4Y==Jk2^X0+FxG>j|UuB%|qjI{aj226gGWXP5$QVp^Xl-flA|5@?QzvXC*_ik>CrZvj*P;T#e$4UW&$-% z`u($@f*-^|CM{J-UP3;bEO69z0155B(# zbt>G8$OhSZ7Z~32a5=UWgk@Mu5SqJXSgPy9%%jC@v&Viyk$;0SE=cQIhW*WXg=9q_ z(@c>P=4_1~KAjdMO^;Qqha>BT@r_UAo{)WTP(T_5P1T(l!wjX?e?g6fli9HiO<=hv zX}AS^9zf z@b}{3H2}fGDJUP<<6c%${R90DNIu-V#1_Z_{GXHTF3SJZpU~;9QWyhS1)t%!EYC@6 z9$cV1ya<3W=-)j=;Q2WKLxTR_0Ra2J7hZ#NTKj!)!}z@kp_2;s#4JE%8WRe$-ILY5 zUrd__=-%RIzqo;CCp%I+#!J`Z^7p3$+hmcE z4z{rVo4}~T7%k^%_>4}^B-vG6?rWPJkRh}^eYNIh72*(|3~zw6c~sk8M4TYJ7qWiE zp`Nkrln_^cMh$Gbon8MOH+T3Nz6I~=y@a-xJ7la1eNg<9IWLkM@Y!77NvG$HwV7J5 zx~l#p#mQ);#=q;#?k(r*^MUGeuh{>#dh&$)$|%E;hcBUd ziFgT~D|YwM?-j;mDP7(98shTy?1=keyme+9UYr*prP8jqOJ%gm=aW|5-oGn!*4xfY zx9jQE)~;*(2`vNbrOrOO>Y)ao>AY|{ulFdSnj?wOCx^Wf6w=_qzVBW?^LD>Qjr8{M zdbY0TTKwi0GQ4joz#<%tTU*JE862hkW%Sls^sNuH6?fA016NQAFo{@qSZOuL1r6&2Gk?-fwhD_;#`Wp+JJ(!=YF=^4QGfCA$M7|nm zck1N742RA6(%1?l8npVg+Fy{aGchsihQog3S?*?5u|N`Cyq6db}8ZuXWJKKlBtBhl-@c?W8XBuzS4ivEAn$+<`PRRNo-e zUcAG^MA-Gb&C#I~rV>TECMC6`{IptjE6BmfN?p2f55g6Dkh25tl`Mhbvw)#+Z?c^w zfRdu*DocwVk7^*X30K8J8MMlfH>WC)$My{!>&yKBxz3>kUWl6M9B`FY9g=?Wxn#`z z>EJh2y?6nWFE#yr5w^~I?!uFEe5t?X9vKG&-i`BqXTFit^^AR}VCYkJ4MLoxR-ZI` zs5D0GW0Sgzm9?luM41Fk_cwbUb4Pg$W+R{V&%O_f(DKBy>zK*=+q$tk6~l_0H(57U zBbEwj@#|t)R_5`EIgq9r`KFUvnRQt)iJ_)<4~tMc)x$qxo8P5OJWm|eSo8&V>)6B%C>$tq*7vdBxzS6l-y~#FPITSnn|BcbU!GJP_k-8Ws2~m=}^Lo zVr^Fx_#SC4#j!9y9}{Ef`pQ{KI0EFV%>5gA!(2PiSedH(L`luRfYOKI(e;A021Rl! 
zwM`@!d%%06oyUp6P5os>THcGUpJkU%ena&3oN%w&)?RdQ8B05vgfwYX;1_5TFP$j;gQP_cR zy#7x0D8w-5Y*RVmzrpT!H|!>mD3ZPX#eZZ8wJ33fCKvuyeo}u4uKV3d%|%$0|Ns7jgm9UwZ+|o14c!Ya&aYIBHhcrC_OoAwaK5~d{kZhqDMEl+!~7S z3h{xm`owyaRi~GKadG;{Gc4#Y|665LwCB%SywF>ZnUmRAG8T-i*%=Xwj=m`#41O&O z;w;5p=8p9ZPSiy5`ob{q8m&WbV5Ve(sIc-lX9nhzs=2m2P9!C<+J|GtfT9cN)wVZ= zq1r7j1)}O;-Po-nk#H+>+QZ74bM-BaQLB?xwExr9lU#F`rY|P2!jI>K?|Os)7LBlF z=)wwE6}aY|S#bQo6KD6%>D|Oz`@q*o@RdB|3-fD7dr)XU)O`BFX({l!X*4a`pNm{_ z9tv72e^}K`5@Ip083;rQvRG4C3#xjefs83Hz)M9zQ%jTY670NOyny3@-9(q2oUiWt8s`^)%+yN&Y)VFL`(LUi_i?BX86TkVL949W;Bl9> z4E_4nB30u2BF@P=I%j*R+9iL{C66vzFHYldsgVHE);bTqZK{0rHe^Jk?8( zrsEmsyK5fBPQjO);{k4FfoU||<+h9)#UmJ*LY_=v{$d9S@p*7-s*=Gz-nJ4Jq_so`Vo!mPX${PCSjp=9K}DXQCG68g*ro~*YDw1L@Ni*2!Pieu^@PTGis!6@$7K0&eZ7dqF|`=lderqHDeD& z1;Kmk;i0Szr?F$RgeJnJTjz`*7q%X1oOcYp4K*WRijgL=fd#nf&ep=;G*DC@p&{bmyUDTqZ*sM%_2 zEa$Z408Z#k+!gdCC~)i%l7$4FLf*bvaYO_68uz#4yAKxYUL6i2c-2~!+RpwUM2%GI zgq}Nhzc^e$+oL4r7q_-_vRTk4#JxS9qhpk3uZ$lL=H*XEcw?MpkZEwSnCrq!vKBPg zGQHj{bVWB%(|s_#@B)>gk_xSt;A971ZSNM)g z`LrJ&W%HOWa9@oK^N&bs->y8^6G5fl@$e6bx{Cgq0?eN;;Tsd3GPuEkCJCGG%@%C~ zsYN|wDLznT(K#Kh6UMOjD@6lxm5bfBR)>-`0>Q>{;HmjH?gOe-iJrH&-8f($C1e&ABW~dRIs`%ITtK~quXlN>>k&YZ~pKWOmUcwSPuT*HX1R@>SHy%i4x z>+4-2D;w(KFxh**&d%K^Yh=E+EsNVvLHnAf3XZe5A!C2r>zbtOy%dCJiHyOktP68q zZ(57!LdAb7dxXr*%!A`&(8%4=wl>8F1kqOpD8grZmF(Kx=1-n5lo#Al#^d-`uwmt& zM#j;$D^m|Rv)T{P?DOu5>XQvMr^w~u+|YNI;2tEs22XCNs<@=;rmoxA6sFO7p()fhxgbZ2L_k9l?Jj%9-=f8w9@?r}Dgv?)4+W#j%r4W_Ee7h$skyxFE9% z1&L^^zMe!mnH7_t)Yj}4U`JPuhd%p)hsqDoY`-=|^bn=s{s(n3;!6Wowy!(C`^ z8N*jUEV(uAY9PB$J)-$dwu0NDDMcT$Q4nC_;XTDyhOd5IA~iRGjO)n++NFoZgp2vk zQJ=AMz~+wDY5Ii(SdA@ZEp@ll7y%-3=oGAX$oOj$YP`$|ARNws{a|)XI;O3;i@xQs zd-Goo5cZjM07&Q~8>VMPw|YL_T4L}NT7_Zam#o{f z&UfN*yXm*9Xd7Pb@*fm`d5F_knIxX)^#Wo2Q$SWY_)5XilASs%jzlv|{(^4bRySti zMoESyC3$#j9}Y}n9S^8%IRcT_`ea%NaGp7>06p!7AFEW*2GUP*TRPsjZqI-s6LUb+x=Ls+*UM14a_)E5U9+nXy6x# zX-lI({vn`nU1@o;^fSRz)yEXfF9kUubPDta>d9f>ogu6cEBVl?|X>iCEw9igT$&%NhSNC840J27nnlQ?kEk}1@RIZl&b 
zzwAcRfC1S`o|vKFavN=uM6z%x*Q2=r&^7z5mGuH7zc-Oa`M-hFVM*?=EE->E+&tW# z*9v)*aL3s7%$`XJyIvhG`K;9yG|qh*`7QWokVA?{3eaBg&idrL<*Xw=^x*z6W?jhU zOO*!BHTK0*wYSgE#ovNG7iL;Z#t#a%Dp86uG&Ya%4~*x`Lj-QU-@y5(eOpP;xQ`o2 z8g+DDE=M4nWJA1LeWL#ztGuH;aZ1$$^?7WGns&75Q>c%sGNfB0M2j`7qKJ_kfKEam za9tW88LXCfsD6ouwr2tlGZOwMkb8SQ)yA(@KZ&E!KicdLvn>+5G(gB^tE@kaHO(1* z?WK9nW8_~qX}EcLQtTxjfE5cJEv_~UQcgHS2aN2`M{|UV?#PIR?2n~H?_x5X3g}d` zdlm0=uHXN4Hs3c`PGcGNoAzft zi_4wu7@O3wJJUAx<@zt&LQjYGd(XZPPqy9)v9nxMA6~Kws1$f{p`@`;0I|MExpT+G zq&dZLcg>Mxaz*Xu*q34(@9Be69h=njf*PpGV|8atDu2JO#@E>)U&C|uMh^p-gOfAu z-1#pKpr081;pt~tW1pm7)t^p>b|+8XNoqhWWnSuswAI)-chYhC7Ys}mm(tOt|+i0JgqH%v`))-=yq8p_`hOiA`F5N|#C+}L-Z(d;zDu3)^Fbp8A$z7mb~ib-#ax$fRLhZwML4Y$ukkEe%|I~7)ixFY}%n-z?r5yr5E%2X^U7y{rWk@lb zfC_c_PU03GxzX|T)<~K(TjyDYnAk?BsE@p3sC0vE@7(r=*#blFDy)GO^OLTlP2kqx z=dA4xvUK|MstwnB5B4kdA|Yv>7(dsSlMu276m*FrylnUitceExr3pP_91t|KK_E#e wf`D1JDt*Aifnui&1symEy8w*Be*kPWAGGt$#Qr~bLRf>kuf{Bs43Log9~Iv&DgXcg diff --git a/swh/lister/crates/tests/data/fake_crates_repository_init.sh b/swh/lister/crates/tests/data/fake_crates_repository_init.sh index 60680d6..6368601 100755 --- a/swh/lister/crates/tests/data/fake_crates_repository_init.sh +++ b/swh/lister/crates/tests/data/fake_crates_repository_init.sh @@ -18,20 +18,47 @@ mkdir -p re/ge touch .dot-file touch config.json -echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand -echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand - -echo 
'{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex -echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex -echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex -echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex - -echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax - # Init as a git repository git init git add . git commit -m "Init fake crates.io-index repository for tests purpose" +echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand +git add . 
+git commit -m " Updating crate rand#0.1.1" + +echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.2" + +echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.0" + +echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.1" + +echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.2" + +echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex +git add . 
+git commit -m " Updating crate regex#0.1.3" + +echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax +git add . +git commit -m " Updating crate regex-syntax#0.1.0" + # Save some space rm .git/hooks/*.sample + +# Compress git directory as a tar.gz archive +cd ../ +tar -cvzf fake-crates-repository.tar.gz crates.io-index +mv fake-crates-repository.tar.gz ../ + +# Clean up tmp_dir +cd ../ +rm -rf tmp_dir diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py index bbb1c7d..2c62449 100644 --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -5,7 +5,9 @@ from pathlib import Path -from swh.lister.crates.lister import CratesLister +from dulwich.repo import Repo + +from swh.lister.crates.lister import CratesLister, CratesListerState from swh.lister.crates.tests import prepare_repository_from_archive expected_origins = [ @@ -29,6 +31,16 @@ expected_origins = [ "version": "0.1.2", }, ], + "metadata": [ + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex", @@ -66,6 +78,24 @@ expected_origins = [ "version": "0.1.3", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + { + "version": "0.1.3", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex-syntax", @@ -79,10 +109,19 @@ expected_origins = [ "version": "0.1.0", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + ], }, ] 
+expected_origins_incremental = [expected_origins[1], expected_origins[2]] + + def test_crates_lister(datadir, tmp_path, swh_scheduler): archive_path = Path(datadir, "fake-crates-repository.tar.gz") repo_url = prepare_repository_from_archive( @@ -112,3 +151,84 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler): ) assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + # The lister has not run yet, get the index repository + lister.get_index_repository() + # Set a CratesListerState with a last commit value to force incremental case + repo = Repo(lister.DESTINATION_PATH) + # Lets set this last commit to third one from head + step = list(repo.get_walker(max_entries=3))[-1] + last_commit_state = CratesListerState(last_commit=step.commit.id.decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 2 + assert res.origins == 2 + + expected_origins_sorted = sorted( + expected_origins_incremental, key=lambda x: x.get("url") + ) + scheduler_origins_sorted = sorted( + swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + + for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): + assert scheduled.visit_type == "crates" + assert scheduled.url == expected.get("url") + assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( + "artifacts" + ) + + assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + +def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler): + """Ensure incremental mode runs fine when the 
repository last commit is the same + as lister.state.last_commit""" + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + lister.get_index_repository() + + repo = Repo(lister.DESTINATION_PATH) + + # Set a CratesListerState with a last commit value to force incremental case + last_commit_state = CratesListerState(last_commit=repo.head().decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 0 + assert res.origins == 0 + + +def test_crates_lister_repository_cleanup(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + + lister.run() + # Repository directory should not exist after the lister runs + assert not lister.DESTINATION_PATH.exists()