From 108816f232d2397590240ffc369d5f4c4da32aca Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Thu, 6 Oct 2022 17:51:27 +0200 Subject: [PATCH] rubygems: Use gems database dump to improve listing output Instead of using an undocumented rubygems HTTP endpoint that only gives us the names of the gems, prefer to exploit the daily PostgreSQL dump of the rubygems.org database. This makes it possible to list not only all gems but also all versions of a gem and its release artifacts. For each release artifact, the following information is extracted: version, download URL, sha256 checksum, release date plus a few extra metadata fields. The lister will now set a list of artifacts and a list of metadata as extra loader arguments when sending a listed origin to the scheduler database. A last_update date is also computed, which should ensure loading tasks for rubygems will be scheduled only when new releases are available since the last loading. Note that the lister spawns a temporary postgres instance, so this requires the initdb executable from a postgres server installation to be available in the execution environment. 
Related to T1777 --- mypy.ini | 6 + requirements.txt | 2 + swh/lister/rubygems/lister.py | 201 +++++++++++++++--- .../tests/data/https_rubygems.org/versions | 6 - .../rubygems/tests/data/rubygems_dumps.xml | 22 ++ .../tests/data/rubygems_pgsql_dump.tar | Bin 0 -> 2867 bytes .../tests/data/small_rubygems_dump.sh | 38 ++++ swh/lister/rubygems/tests/test_lister.py | 153 +++++++++++-- 8 files changed, 378 insertions(+), 50 deletions(-) delete mode 100644 swh/lister/rubygems/tests/data/https_rubygems.org/versions create mode 100644 swh/lister/rubygems/tests/data/rubygems_dumps.xml create mode 100644 swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar create mode 100644 swh/lister/rubygems/tests/data/small_rubygems_dump.sh diff --git a/mypy.ini b/mypy.ini index 42c58d8..286fec0 100644 --- a/mypy.ini +++ b/mypy.ini @@ -42,3 +42,9 @@ ignore_missing_imports = True [mypy-dulwich.*] ignore_missing_imports = True + +[mypy-testing.postgresql.*] +ignore_missing_imports = True + +[mypy-psycopg2.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index a909c6d..17a1e8f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ launchpadlib tenacity >= 6.2 lxml dulwich +testing.postgresql +psycopg2 diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index c4cb707..6961646 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -3,8 +3,20 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import base64 +from datetime import timezone +import gzip import logging -from typing import Iterator, List, Optional, Text +import os +import shutil +import subprocess +import tarfile +import tempfile +from typing import Any, Dict, Iterator, Optional, Tuple + +from bs4 import BeautifulSoup +import psycopg2 +from testing.postgresql import Postgresql from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import 
ListedOrigin @@ -13,18 +25,39 @@ from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) -# Aliasing the page results returned by `get_pages` method from the lister. -RubyGemsListerPage = Text +RubyGemsListerPage = Dict[str, Any] class RubyGemsLister(StatelessLister[RubyGemsListerPage]): - """Lister for RubyGems.org, the Ruby community’s gem hosting service.""" + """Lister for RubyGems.org, the Ruby community's gem hosting service. + + Instead of querying rubygems.org Web API, it uses gems data from the + daily PostreSQL database dump of rubygems. It enables to gather all + interesting info about a gem and its release artifacts (version number, + download URL, checksums, release date) in an efficient way and without + flooding rubygems Web API with numerous HTTP requests (as there is more + than 187000 gems available on 07/10/2022). + """ LISTER_NAME = "rubygems" VISIT_TYPE = "rubygems" INSTANCE = "rubygems" - INDEX_URL = "https://rubygems.org/versions" + RUBY_GEMS_POSTGRES_DUMP_BASE_URL = ( + "https://s3-us-west-2.amazonaws.com/rubygems-dumps" + ) + RUBY_GEMS_POSTGRES_DUMP_LIST_URL = ( + f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql" + ) + + RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem" + RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}" + RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = ( + "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json" + ) + + DB_NAME = "rubygems" + DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz" def __init__( self, @@ -35,41 +68,147 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, - url=self.INDEX_URL, + url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, ) + def get_latest_dump_file(self) -> str: + response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) + xml = BeautifulSoup(response.content, "xml") + contents = 
xml.find_all("Contents") + return contents[-1].find("Key").text + + def create_rubygems_db( + self, postgresql: Postgresql + ) -> Tuple[str, psycopg2._psycopg.connection]: + logger.debug("Creating rubygems database") + + db_dsn = postgresql.dsn() + db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) + db = psycopg2.connect(**db_dsn) + db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + with db.cursor() as cursor: + cursor.execute(f"CREATE DATABASE {self.DB_NAME}") + + db_dsn["database"] = self.DB_NAME + + db = psycopg2.connect(**db_dsn) + db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + with db.cursor() as cursor: + cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") + + return db_url, db + + def populate_rubygems_db(self, db_url: str): + dump_file = self.get_latest_dump_file() + dump_id = dump_file.split("/")[2] + + response = self.http_request(f"{self.url}/{dump_file}", stream=True) + + with tempfile.TemporaryDirectory() as temp_dir: + logger.debug( + "Downloading latest rubygems database dump: %s (%s bytes)", + dump_id, + response.headers["content-length"], + ) + dump_file = os.path.join(temp_dir, "rubygems_dump.tar") + with open(dump_file, "wb") as dump: + for chunk in response.iter_content(chunk_size=1024): + dump.write(chunk) + + with tarfile.open(dump_file) as dump_tar: + dump_tar.extractall(temp_dir) + + logger.debug("Populating rubygems database with dump %s", dump_id) + psql = subprocess.Popen( + ["psql", "-q", db_url], + stdin=subprocess.PIPE, + ) + + # passing value of gzip.open as stdin of subprocess.run makes the process + # read raw data instead of decompressed data so we have to use a pipe + with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql: + shutil.copyfileobj(sql, psql.stdin) # type: ignore + + # denote end of read file + psql.stdin.close() # type: ignore + psql.wait() + def get_pages(self) -> Iterator[RubyGemsListerPage]: - """Yield an iterator which returns 
'page' + # spawn a temporary postgres instance (require initdb executable in environment) + with Postgresql() as postgresql: + db_url, db = self.create_rubygems_db(postgresql) + self.populate_rubygems_db(db_url) - It uses the index file located at `https://rubygems.org/versions` - to get a list of package names. Each page returns an origin url based on - the following pattern:: - - https://rubygems.org/gems/{pkgname} - - """ - - package_names: List[str] = [] - response = self.http_request(url=self.url) - data = response.content.decode() - - # remove the first 3 lines (file headers + first package named '-') - for line in data.splitlines()[3:]: - package_names.append(line.split(" ")[0]) - - # Remove duplicates - package_names_set: List[str] = list(set(package_names)) - - for pkgname in package_names_set: - yield f"https://rubygems.org/gems/{pkgname}" + with db.cursor() as cursor: + cursor.execute("SELECT id, name from rubygems") + for gem_id, gem_name in cursor.fetchall(): + logger.debug("Processing gem named %s", gem_name[1]) + with db.cursor() as cursor_v: + cursor_v.execute( + "SELECT authors, built_at, number, sha256, size from versions " + "where rubygem_id = %s", + (gem_id,), + ) + versions = [ + { + "number": number, + "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format( + gem=gem_name, version=number + ), + "date": built_at.replace(tzinfo=timezone.utc), + "authors": authors, + "sha256": ( + base64.decodebytes(sha256.encode()).hex() + if sha256 + else None + ), + "size": size, + } + for authors, built_at, number, sha256, size in cursor_v.fetchall() + ] + if versions: + yield { + "name": gem_name, + "versions": versions, + } def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: - """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None + artifacts = [] + rubygem_metadata = [] + for version in page["versions"]: + artifacts.append( + { + "version": version["number"], + "filename": 
version["url"].split("/")[-1], + "url": version["url"], + "checksums": ( + {"sha256": version["sha256"]} if version["sha256"] else {} + ), + "length": version["size"], + } + ) + rubygem_metadata.append( + { + "version": version["number"], + "date": version["date"].isoformat(), + "authors": version["authors"], + "extrinsic_metadata_url": ( + self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format( + gem=page["name"], version=version["number"] + ) + ), + } + ) + yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=page, - last_update=None, + url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]), + last_update=max(version["date"] for version in page["versions"]), + extra_loader_arguments={ + "artifacts": artifacts, + "rubygem_metadata": rubygem_metadata, + }, ) diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions deleted file mode 100644 index 74d2703..0000000 --- a/swh/lister/rubygems/tests/data/https_rubygems.org/versions +++ /dev/null @@ -1,6 +0,0 @@ -created_at: 2022-09-01T00:00:05Z ---- -- 1 05d0116933ba44b0b5d0ee19bfd35ccc -mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260 -mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22 -mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd diff --git a/swh/lister/rubygems/tests/data/rubygems_dumps.xml b/swh/lister/rubygems/tests/data/rubygems_dumps.xml new file mode 100644 index 0000000..5506050 --- /dev/null +++ b/swh/lister/rubygems/tests/data/rubygems_dumps.xml @@ -0,0 +1,22 @@ + + + rubygems-dumps + production/public_postgresql + + 1000 + false + + production/public_postgresql/2022.10.05.06.10.11/public_postgresql.tar + 2022-10-05T06:11:15.000Z + 
"d1c447a2a490225c2d59061e60ed86e9-75" + 391653888 + STANDARD + + + production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar + 2022-10-06T06:11:11.000Z + "2ccd9340e4f802ec982e4cd00db2d168-75" + 390047744 + STANDARD + + \ No newline at end of file diff --git a/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar b/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar new file mode 100644 index 0000000000000000000000000000000000000000..971cdf99efc3e7b1c4c70463d7e74407dbdea581 GIT binary patch literal 2867 zcmV-33(WK%iwFP!000001MSv%G}P?^z;R2Kd1VPNqRA9Vh0JCwS(6ai5@Q+5G)=}b zjAiU3hKTSY46=l=FG(>J*@^5*uWe|tlq@5=+k4(~&$;Kk=e_ryd%E|Yd+vPxd4A{j zJm);m?|h#>e(v6mE>37Wcf6+;mVoi}a5>NP!^wY#fWvaPV?)bnwJ@ z{;-?yPxptyVPFL8dw&E34rTQJPxyQNjemb>W?~@seL@cF|1a+GTVMqk?9ckc<>ev2 z_lH9e5ZL#70)fE5FeZ@v5B<_V@qgZ5Jcjf42i9Jz&t~KL!Uy+4+T!e*!2K>LQ1g6d zPLxoLJ)3jmm`a*qwFXXiuz|8(5z_Y|HWc)RmRXo5yOfo&rm~DhlwId7HurkZKGRy* zQlWYyG5(WecIUmD$CrSWduE#khOmu^w_(!>+qd5%>JTLXmK0uVE`lwUx0zmYF(r~1ahhj~IN|^~(^KLzsqAt>`MN||m z*zI>Et{>%o_fX}5?A)b#*P|e!-q)~ZNchI|YW0wK8Ebr}(Z-Pr&$?N>BPp+F0ufC= z-G0nxjg@1IJd~|ke4$?{^7vigfW+L99^UT5BG6>L4$0~hFQeRsnk3jUx0`D1mT;}| zP1E*t_l0x~h5S})aH;tG8X%J`4V5=qh{EQWQCNs)?5UreI>%ux^y4l!os`Wi@2B zUQ%hcUHL_U@lGhQfP3lIYO#F}cph59QtoV~yD`*BtUH^Dv%#=+lBNH z{CvX7yOiUSVbgS0!NHI6NU)Rr=5kA26=0I906^0a4h7(Vv}Qik3< zC`j{ARU+)2<;OLHNvl~%H?e5#+^d5;T2XJdPcP)xolEIzyC`MxN!BJO*<(er6EiW#nQ6-_=fD@4 zH`2x2PPmw&Un@--^+oaf%9w-89RYwqvvNe$wHC|dg-VI9<~~A6eB}tg=VyxH<|&sn z&rW>1GQ*rAm|&rArWN=(a!2E6iGMQw8e2kD;+&M*urKl!#W&)CG|bON09-yM2B_E> zml7z?Gz}gLk2w;NCJ)qpM%!)9M{`)D%03IsvIG+#yzW~fl}q=Bw{3DTo)HSY`DJO= zHE6lzI-6fD>+)}Qmo4Ymj7!%wh!MzQVoyJzekBlTqZ=pO20We#yOPeqlHQAy#5mgW z=>vzW=b;%BIY;O+I1-=am|!WUvWO)Mli68t0(MrHz;PrgV`0`OUHK#>c~jI*K;P+C zA==MckbR_hktyrTs$n0Xv8qiak8P`#INR}4yifR{{iZzjQR4;Kqmp{&i;ibxVy}n< z4|x?ySr(5hG@3=oLF8rEad85YWrNvdrew)-=7>|9zXUx)MUUEM{+S(nRV;za)E`@k=_$bVi33Syfi2 zB9AV^Q&z0YN{2Dh^>_sWs*kN7R9jWa-{o5pi{C?6Rdka40~E$S7Oy^{S`_kje4NU6 
zSMv*wQX80VX;Z4uiQWlFxR(~|V0(RYP$?oTx^JRCHO1{smqdo?aF$Lx>jL#To-K-g zRQm!Kt^r^!2M;kIEN91t3>Km-8G$4syV?`jmCLP%FrEXa@Q+GM$3b@h-hByB)Y zZ4go@r-b-8mtJV~6)S-{84(mNr^UnJ@WjdJptR0Do7A3bH9|%96F8)ih7r z=i8C%HGMZ%Dq5UG{598FB6qc#>8B&>S`33^qAh0@>kp@m7z3^sfdo*9TNj<8c5(Bj zH<0he4%B%%uJ9P$Oy%YgPLd*R7~?x}==(f&X=K4tX>KLFdJj*BHSmkh>QFD*i$1g9 z_tooE)U1EzdfuX42ljEtnzco$TAjt6wfOL&h6%S0d%-D>ChLH22jdBMSfM2i(u9>K-RE!yOB^<%bA>-U0yOSB=v;(AS!9}-eu{{>ZBzTE$y4@fRTh|drVr`X)vgMibg zx~!SWlBqhST^yuEGue!m2A)a3J`i!IFmoAC9gPxy+nL&U38^|gahSU2`6yB?pTbgT zu)XBu^tnVk>++s{1W-6G-Owl2vo&W`-HJX%SWS+pXdE>9Wqo#l>(cB{7cFpF*~%Za zg*HSGCW~=CsEN4bq5}>M{I;#`hgjd)k^xtHG_Q-Ms2{smc#t)0rvB&YbX|wB+ogL| z7CHS)z+O5EMU@;}t#PQ(9-3Xw5=3^?XB(oRDK%Z3#89eY_zTwQm#+H?om_)JjgJqQ zmP7qKre1IpuGX>A_C{mi8BcSA8d0S)lxD%A_XyB(WFWGy++R=*>tlR3)@7ec>WvOB z!?ms_YaMQ!42aH6Tw7836#ty>ExYQ8cg(kZg1+$>Lr1Nf<-YI{nKSIvo^#B#^>n&s z3qPWU_Q?UKSUvZDay>V9kHFCy^NvOgwejt3 public_postgresql/databases/PostgreSQL.sql.gz +tar -cvf rubygems_pgsql_dump.tar public_postgresql diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py index 8a5f355..122c8c7 100644 --- a/swh/lister/rubygems/tests/test_lister.py +++ b/swh/lister/rubygems/tests/test_lister.py @@ -2,26 +2,153 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +# flake8: noqa: B950 + +from pathlib import Path + +import iso8601 +import pytest + from swh.lister.rubygems.lister import RubyGemsLister +from swh.scheduler.model import ListedOrigin -expected_origins = [ - "https://rubygems.org/gems/mercurial-ruby", - "https://rubygems.org/gems/mercurial-wrapper", - "https://rubygems.org/gems/mercurius", -] +DUMP_FILEPATH = "production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar" -def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler): +@pytest.fixture +def expected_listed_origins(): + return [ + { + 
"url": "https://rubygems.org/gems/haar_joke", + "visit_type": "rubygems", + "last_update": iso8601.parse_date("2016-11-05T00:00:00+00:00"), + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://rubygems.org/downloads/haar_joke-0.0.2.gem", + "length": 8704, + "version": "0.0.2", + "filename": "haar_joke-0.0.2.gem", + "checksums": { + "sha256": "85a8cf5f41890e9605265eeebfe9e99aa0350a01a3c799f9f55a0615a31a2f5f" + }, + }, + { + "url": "https://rubygems.org/downloads/haar_joke-0.0.1.gem", + "length": 8704, + "version": "0.0.1", + "filename": "haar_joke-0.0.1.gem", + "checksums": { + "sha256": "a2ee7052fb8ffcfc4ec0fdb77fae9a36e473f859af196a36870a0f386b5ab55e" + }, + }, + ], + "rubygem_metadata": [ + { + "date": "2016-11-05T00:00:00+00:00", + "authors": "Gemma Gotch", + "version": "0.0.2", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.2.json", + }, + { + "date": "2016-07-23T00:00:00+00:00", + "authors": "Gemma Gotch", + "version": "0.0.1", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.1.json", + }, + ], + }, + }, + { + "url": "https://rubygems.org/gems/l33tify", + "visit_type": "rubygems", + "last_update": iso8601.parse_date("2014-11-14T00:00:00+00:00"), + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://rubygems.org/downloads/l33tify-0.0.2.gem", + "length": 6144, + "version": "0.0.2", + "filename": "l33tify-0.0.2.gem", + "checksums": { + "sha256": "0087a21fb6161bba8892df40de3b5e27404f941658084413b8fde49db2bc7c9f" + }, + }, + { + "url": "https://rubygems.org/downloads/l33tify-0.0.3.gem", + "length": 6144, + "version": "0.0.3", + "filename": "l33tify-0.0.3.gem", + "checksums": { + "sha256": "4502097ddf2657d561ce0f527ef1f49f1658c8a0968ab8cc853273138f8382a2" + }, + }, + { + "url": "https://rubygems.org/downloads/l33tify-0.0.1.gem", + "length": 6144, + "version": "0.0.1", + "filename": "l33tify-0.0.1.gem", + "checksums": { + "sha256": 
"5abfb737ce5cf561726f2f7cc1ba0f0e4f865f8b7283192e05eb3f246d3dbbca" + }, + }, + ], + "rubygem_metadata": [ + { + "date": "2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.2", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.2.json", + }, + { + "date": "2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.3", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.3.json", + }, + { + "date": "2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.1", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.1.json", + }, + ], + }, + }, + ] + + +@pytest.fixture(autouse=True) +def network_requests_mock(datadir, requests_mock): + requests_mock.get( + RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_LIST_URL, + content=Path(datadir, "rubygems_dumps.xml").read_bytes(), + ) + content = Path(datadir, "rubygems_pgsql_dump.tar").read_bytes() + requests_mock.get( + f"{RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_BASE_URL}/{DUMP_FILEPATH}", + content=content, + headers={"content-length": str(len(content))}, + ) + + +@pytest.mark.db +def test_rubygems_lister(swh_scheduler, expected_listed_origins): lister = RubyGemsLister(scheduler=swh_scheduler) res = lister.run() - assert res.pages == 3 - assert res.origins == 1 + 1 + 1 + assert res.pages == 2 + assert res.origins == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == len(expected_origins) - - for origin in scheduler_origins: - assert origin.visit_type == "rubygems" - assert origin.url in expected_origins + assert [ + { + "url": origin.url, + "visit_type": origin.visit_type, + "last_update": origin.last_update, + "extra_loader_arguments": origin.extra_loader_arguments, + } + for origin in scheduler_origins + ] == expected_listed_origins