From f57b8f3a2c49080ae9bc11217b8d6ef4ed8c564e Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Tue, 27 Sep 2022 16:34:38 +0200 Subject: [PATCH] cpan: Improve listing process by querying the metacpan release endpoint Instead of querying the metacpan distribution endpoint to list origins, use the release endpoint, which makes it possible to list all artifacts associated with CPAN packages by scrolling through the results. Compared to the previous implementation, this makes it possible to compute a last_update date for all CPAN packages and also to obtain artifact sha256 checksums that will be used by the CPAN loader to check download integrity. As the multiple versions of a module are spread across multiple pages from the CPAN API, origins are sent to the scheduler only once all pages have been processed; it is also faster to proceed that way. Related to T2833 --- swh/lister/cpan/__init__.py | 8 +- swh/lister/cpan/lister.py | 142 ++++++++-- .../v1__search_scroll | 247 ++++++++++++++++++ ...Q1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== | 50 ---- ...XptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 | 16 -- ...on__search,fields=name,size=1000,scroll=1m | 52 ---- .../v1_release__search | 246 +++++++++++++++++ swh/lister/cpan/tests/test_lister.py | 99 ++++++- 8 files changed, 701 insertions(+), 159 deletions(-) create mode 100644 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll delete mode 100644 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== delete mode 100644 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 delete mode 100644 
swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m create mode 100644 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py index bdc7a17..2624112 100644 --- a/swh/lister/cpan/__init__.py +++ b/swh/lister/cpan/__init__.py @@ -16,9 +16,9 @@ As of September 2022 `cpan.org`_ list 43675 package names. Origins retrieving strategy --------------------------- -To get a list of all package names we call a first `http api endpoint`_ that -retrieve results and a ``_scroll_id`` that will be used to scroll pages through -`search`_ endpoint. +To get a list of all package names and their associated release artifacts we call +a first `http api endpoint`_ that retrieve results and a ``_scroll_id`` that will +be used to scroll pages through `search`_ endpoint. Page listing ------------ @@ -57,7 +57,7 @@ You can follow lister execution by displaying logs of swh-lister service:: .. _cpan.org: https://cpan.org/ .. _metacpan.org: https://metacpan.org/ -.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/ +.. _http api endpoint: https://explorer.metacpan.org/?url=/release/ .. 
_search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950 diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 5777331..23b821d 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -3,8 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict +from datetime import datetime import logging -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Set + +import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -14,7 +18,33 @@ from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. -CpanListerPage = List[Dict[str, Any]] +CpanListerPage = Set[str] + + +def get_field_value(entry, field_name): + """ + Splits ``field_name`` on ``.``, and use it as path in the nested ``entry`` + dictionary. If a value does not exist, returns None. 
+ + >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}} + >>> get_field_value(entry, "foo") + 1 + >>> get_field_value(entry, "bar") + {'baz': 2, 'qux': [3]} + >>> get_field_value(entry, "bar.baz") + 2 + >>> get_field_value(entry, "bar.qux") + 3 + """ + fields = field_name.split(".") + field_value = entry["_source"] + for field in fields[:-1]: + field_value = field_value.get(field, {}) + field_value = field_value.get(fields[-1]) + # scrolled results might have field value in a list + if isinstance(field_value, list): + field_value = field_value[0] + return field_value class CpanLister(StatelessLister[CpanListerPage]): @@ -25,7 +55,15 @@ class CpanLister(StatelessLister[CpanListerPage]): VISIT_TYPE = "cpan" INSTANCE = "cpan" - BASE_URL = "https://fastapi.metacpan.org/v1/" + API_BASE_URL = "https://fastapi.metacpan.org/v1" + REQUIRED_DOC_FIELDS = [ + "download_url", + "checksum_sha256", + "distribution", + "version", + ] + OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] + ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" def __init__( self, @@ -36,26 +74,82 @@ class CpanLister(StatelessLister[CpanListerPage]): scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, - url=self.BASE_URL, + url=self.API_BASE_URL, ) + self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + self.release_dates: Dict[str, List[datetime]] = defaultdict(list) + self.module_names: Set[str] = set() + + def process_release_page(self, page: List[Dict[str, Any]]): + for entry in page: + + if "_source" not in entry or not all( + k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS + ): + logger.warning( + "Skipping release entry %s as some required fields are missing", + entry.get("_source"), + ) + continue + + module_name = get_field_value(entry, "distribution") + module_version = get_field_value(entry, "version") + 
module_download_url = get_field_value(entry, "download_url") + module_sha256_checksum = get_field_value(entry, "checksum_sha256") + module_date = get_field_value(entry, "date") + module_size = get_field_value(entry, "stat.size") + module_author = get_field_value(entry, "author") + module_author_fullname = get_field_value(entry, "metadata.author") + release_name = get_field_value(entry, "name") + + self.artifacts[module_name].append( + { + "url": module_download_url, + "filename": module_download_url.split("/")[-1], + "checksums": {"sha256": module_sha256_checksum}, + "version": module_version, + "length": module_size, + } + ) + + self.module_metadata[module_name].append( + { + "name": module_name, + "version": module_version, + "cpan_author": module_author, + "author": ( + module_author_fullname + if module_author_fullname not in (None, "", "unknown") + else module_author + ), + "date": module_date, + "release_name": release_name, + } + ) + + self.release_dates[module_name].append(iso8601.parse_date(module_date)) + + self.module_names.add(module_name) + def get_pages(self) -> Iterator[CpanListerPage]: """Yield an iterator which returns 'page'""" - endpoint = f"{self.BASE_URL}distribution/_search" - scrollendpoint = f"{self.BASE_URL}_search/scroll" - size: int = 1000 + endpoint = f"{self.API_BASE_URL}/release/_search" + scrollendpoint = f"{self.API_BASE_URL}/_search/scroll" + size = 1000 res = self.http_request( endpoint, params={ - "fields": ["name"], + "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS, "size": size, "scroll": "1m", }, ) data = res.json()["hits"]["hits"] - yield data + self.process_release_page(data) _scroll_id = res.json()["_scroll_id"] @@ -65,27 +159,25 @@ class CpanLister(StatelessLister[CpanListerPage]): ) data = scroll_res.json()["hits"]["hits"] _scroll_id = scroll_res.json()["_scroll_id"] - yield data + self.process_release_page(data) - def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]: + yield 
self.module_names + + def get_origins_from_page( + self, module_names: CpanListerPage + ) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None - for entry in page: - # Skip the entry if 'fields' or 'name' keys are missing - if "fields" not in entry or "name" not in entry["fields"]: - continue - - pkgname = entry["fields"]["name"] - # TODO: Check why sometimes its a one value list - if type(pkgname) != str: - pkgname = pkgname[0] - - url = f"https://metacpan.org/dist/{pkgname}" - + for module_name in module_names: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=url, - last_update=None, + url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), + last_update=max(self.release_dates[module_name]), + extra_loader_arguments={ + "api_base_url": self.API_BASE_URL, + "artifacts": self.artifacts[module_name], + "module_metadata": self.module_metadata[module_name], + }, ) diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll new file mode 100644 index 0000000..2eac909 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll @@ -0,0 +1,247 @@ +{ + "_shards": { + "failed": 0, + "total": 3, + "successful": 3 + }, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", + "terminated_early": true, + "took": 3, + "hits": { + "max_score": 1.0, + "hits": [ + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1210", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz", + "version": "0.1210", + "distribution": "Validator-Custom", + "date": "2010-08-14T01:41:56", + "stat": { + "size": 17608 + }, + "checksum_sha256": 
"f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_type": "release", + "_index": "cpan_v1_01", + "_id": "VGApYqMT4TCxUzHcITn8ZhGHlxE" + }, + { + "_type": "release", + "_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8", + "_index": "cpan_v1_01", + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1208", + "date": "2010-07-28T23:00:52", + "distribution": "Validator-Custom", + "version": "0.1208", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b", + "stat": { + "size": 17489 + } + } + }, + { + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1619", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz", + "version": "0.1619", + "distribution": "DBIx-Custom", + "date": "2010-10-20T15:01:35", + "stat": { + "size": 27195 + }, + "checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5", + "metadata": { + "author": [ + "unknown" + ] + } + }, + "_score": 1.0, + "_id": "g7562_4h9d693lxvc_cgEOTJAZk", + "_index": "cpan_v1_01", + "_type": "release" + }, + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1401", + "version": "0.1401", + "distribution": "DBIx-Custom", + "date": "2010-05-01T23:29:22", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz", + "checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "stat": { + "size": 22711 + } + }, + "_type": "release", + "_id": "bLRsOH2sevNQ6Q93exgkvCZONo0", + "_index": "cpan_v1_01" + }, + { + "_type": "release", + "_index": "cpan_v1_01", + "_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY", + "_score": 1.0, + "_source": 
{ + "author": "KIMOTO", + "name": "DBIx-Custom-0.1301", + "stat": { + "size": 22655 + }, + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz", + "distribution": "DBIx-Custom", + "version": "0.1301", + "date": "2010-05-01T13:02:19" + } + }, + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1602", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a", + "stat": { + "size": 18999 + }, + "date": "2010-06-25T12:11:33", + "distribution": "DBIx-Custom", + "version": "0.1602", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz" + }, + "_type": "release", + "_index": "cpan_v1_01", + "_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o" + }, + { + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1204", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz", + "version": "0.1204", + "distribution": "Validator-Custom", + "date": "2010-07-08T13:14:23", + "stat": { + "size": 13256 + }, + "checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_score": 1.0, + "_index": "cpan_v1_01", + "_id": "M_lLALu56mb_cDK_jAXwUB2PUlw", + "_type": "release" + }, + { + "_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E", + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1203", + "stat": { + "size": 12572 + }, + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz", + "date": 
"2010-07-07T13:29:41", + "distribution": "Validator-Custom", + "version": "0.1203" + }, + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_id": "ZaT8bwXejVTHmrzZCqNJPRFImBY", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1641", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6", + "stat": { + "size": 40480 + }, + "distribution": "DBIx-Custom", + "version": "0.1641", + "date": "2011-01-27T05:19:14", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz" + }, + "_score": 1.0 + }, + { + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1646", + "version": "0.1646", + "distribution": "DBIx-Custom", + "date": "2011-02-18T17:48:52", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24", + "stat": { + "size": 46577 + } + }, + "_score": 1.0, + "_index": "cpan_v1_01", + "_id": "j21QIzHRYZKz1vobyGAPa2BuO50", + "_type": "release" + } + ], + "total": 359941 + }, + "timed_out": false +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== deleted file mode 100644 index 9c9e8e0..0000000 --- 
a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== +++ /dev/null @@ -1,50 +0,0 @@ -{ - "_shards" : { - "successful" : 3, - "total" : 3, - "failed" : 0 - }, - "timed_out" : false, - "hits" : { - "max_score" : 1.0, - "hits" : [ - { - "_type" : "distribution", - "fields" : { - "name" : [ - "EventSource-Server" - ] - }, - "_id" : "EventSource-Server", - "_index" : "cpan_v1_01", - "_score" : 1.0 - }, - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "Interchange6", - "fields" : { - "name" : [ - "Interchange6" - ] - }, - "_type" : "distribution" - }, - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "Internals-CountObjects", - "fields" : { - "name" : [ - "Internals-CountObjects" - ] - }, - "_type" : "distribution" - } - ], - "total" : 43675 - }, - "took" : 72, - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "terminated_early" : true -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 deleted file mode 100644 index 4f88f01..0000000 --- 
a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 +++ /dev/null @@ -1,16 +0,0 @@ -{ - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "took" : 1, - "hits" : { - "hits" : [], - "total" : 43675, - "max_score" : 1.0 - }, - "terminated_early" : true, - "timed_out" : false, - "_shards" : { - "failed" : 0, - "total" : 3, - "successful" : 3 - } -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m deleted file mode 100644 index e476506..0000000 --- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m +++ /dev/null @@ -1,52 +0,0 @@ -{ - "_shards" : { - "successful" : 3, - "failed" : 0, - "total" : 3 - }, - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "took" : 61, - "hits" : { - "max_score" : 1.0, - "hits" : [ - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "openerserver_perl-master", - "fields" : { - "name" : "openerserver_perl-master" - }, - "_type" : "distribution" - }, - { - "_score" : 1.0, - "_type" : "distribution", - "fields" : { - "name" : "Getopt_Auto" - }, - "_id" : "Getopt_Auto", - "_index" : "cpan_v1_01" - }, - { - "_id" : "App-Booklist", - "_index" : "cpan_v1_01", - "_type" : "distribution", - "fields" : { - "name" : "App-Booklist" - }, - "_score" : 1.0 - }, - { - "fields" : { - "name" : "EuclideanRhythm" - }, - "_type" : "distribution", - 
"_index" : "cpan_v1_01", - "_id" : "EuclideanRhythm", - "_score" : 1.0 - } - ], - "total" : 43675 - }, - "timed_out" : false -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search new file mode 100644 index 0000000..cb3dabf --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search @@ -0,0 +1,246 @@ +{ + "timed_out": false, + "_shards": { + "total": 3, + "failed": 0, + "successful": 3 + }, + "hits": { + "hits": [ + { + "_index": "cpan_v1_01", + "_id": "40MmOvf_SQx_mr8Kj9Eush14a3E", + "_source": { + "author": "KRYDE", + "name": "math-image-46", + "date": "2011-03-02T00:46:14", + "download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz", + "checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa", + "version": "46", + "stat": { + "size": 533502 + }, + "distribution": "math-image", + "metadata": { + "author": [ + "Kevin Ryde " + ] + } + }, + "_type": "release", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_source": { + "author": "MITHALDU", + "name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL", + "distribution": "Dist-Zilla-Plugin-ProgCriticTests", + "metadata": { + "author": [ + "Christian Walde " + ] + }, + "stat": { + "size": 16918 + }, + "checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz", + "date": "2010-06-07T14:43:36", + "version": "1.101580" + }, + "_id": "6df77_MLO_BG8YC_vQKsay7OFYM", + "_type": "release", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.04", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz", + "date": "2009-07-28T05:57:26", + 
"checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f", + "version": "v0.04", + "distribution": "Net-Rapidshare", + "metadata": { + "author": [ + "unknown" + ] + }, + "stat": { + "size": 15068 + } + }, + "_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.05", + "version": "v0.05", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz", + "date": "2009-12-21T00:29:48", + "checksum_sha256": "e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe", + "metadata": { + "author": [ + "unknown" + ] + }, + "distribution": "Net-Rapidshare", + "stat": { + "size": 15971 + } + }, + "_id": "pExMIwabhz_0S1rX7xAY_lq0GTY", + "_type": "release", + "_score": 1.0 + }, + { + "_type": "release", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.0.1", + "version": "v0.0.1", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz", + "checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905", + "date": "2009-07-18T22:56:38", + "stat": { + "size": 15161 + }, + "metadata": { + "author": [ + "unknown" + ] + }, + "distribution": "Net-Rapidshare" + }, + "_id": "eqkhDnj0efXHisWRrMZZ1EHFgug", + "_index": "cpan_v1_01", + "_score": 1.0 + }, + { + "_score": 1.0, + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-Basic-0.0101", + "stat": { + "size": 3409 + }, + "distribution": "DBIx-Custom-Basic", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz", + "date": "2009-11-08T04:18:30", + "checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f", + "version": "0.0101" + }, + "_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg" + }, + { + "_score": 1.0, + "_index": 
"cpan_v1_01", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-SQLite-0.0101", + "version": "0.0101", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz", + "date": "2009-11-08T04:20:31", + "checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "distribution": "DBIx-Custom-SQLite", + "stat": { + "size": 3927 + } + }, + "_type": "release", + "_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8" + }, + { + "_index": "cpan_v1_01", + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-Ext-Mojolicious-0.0103", + "checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz", + "date": "2010-01-16T14:51:11", + "version": "0.0103", + "stat": { + "size": 4190 + }, + "distribution": "Validator-Custom-Ext-Mojolicious", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic", + "_type": "release", + "_score": 1.0 + }, + { + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-Ext-Mojolicious-0.0102", + "stat": { + "size": 4257 + }, + "distribution": "Validator-Custom-Ext-Mojolicious", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "date": "2010-01-15T14:07:24", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz", + "checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed", + "version": "0.0102" + }, + "_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0", + "_type": "release", + "_index": "cpan_v1_01", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1207", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz", + "date": 
"2010-07-28T13:42:23", + "checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0", + "version": "0.1207", + "stat": { + "size": 16985 + }, + "distribution": "Validator-Custom", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_id": "NWJOqmjEinjfJqawfpkEpEhu4d0", + "_score": 1.0 + } + ], + "total": 359941, + "max_score": 1.0 + }, + "took": 14, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==" +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py index 716feca..9e7950c 100644 --- a/swh/lister/cpan/tests/test_lister.py +++ b/swh/lister/cpan/tests/test_lister.py @@ -3,25 +3,95 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict +from itertools import chain +import json +from pathlib import Path + +import pytest + from swh.lister.cpan.lister import CpanLister -expected_origins = [ - "https://metacpan.org/dist/App-Booklist", - "https://metacpan.org/dist/EuclideanRhythm", - "https://metacpan.org/dist/EventSource-Server", - "https://metacpan.org/dist/Getopt_Auto", - "https://metacpan.org/dist/Interchange6", - "https://metacpan.org/dist/Internals-CountObjects", - "https://metacpan.org/dist/openerserver_perl-master", -] + +@pytest.fixture +def release_search_response(datadir): + return json.loads( + Path(datadir, "https_fastapi.metacpan.org", "v1_release__search").read_bytes() + ) -def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler): +@pytest.fixture +def release_scroll_first_response(datadir): + return json.loads( + Path(datadir, "https_fastapi.metacpan.org", "v1__search_scroll").read_bytes() + ) + + +@pytest.fixture(autouse=True) +def mock_network_requests( + requests_mock, release_search_response, 
release_scroll_first_response +): + requests_mock.get( + "https://fastapi.metacpan.org/v1/release/_search", + json=release_search_response, + ) + requests_mock.get( + "https://fastapi.metacpan.org/v1/_search/scroll", + [ + { + "json": release_scroll_first_response, + }, + {"json": {"hits": {"hits": []}, "_scroll_id": ""}}, + ], + ) + + +def test_cpan_lister( + swh_scheduler, release_search_response, release_scroll_first_response +): lister = CpanLister(scheduler=swh_scheduler) res = lister.run() - assert res.pages == 3 - assert res.origins == 4 + 3 + 0 + expected_origins = set() + expected_artifacts = defaultdict(list) + expected_module_metadata = defaultdict(list) + for release in chain( + release_search_response["hits"]["hits"], + release_scroll_first_response["hits"]["hits"], + ): + distribution = release["_source"]["distribution"] + release_name = release["_source"]["name"] + checksum_sha256 = release["_source"]["checksum_sha256"] + download_url = release["_source"]["download_url"] + version = release["_source"]["version"] + size = release["_source"]["stat"]["size"] + author = release["_source"]["author"] + author_fullname = release["_source"]["metadata"]["author"][0] + date = release["_source"]["date"] + origin_url = f"https://metacpan.org/dist/{distribution}" + expected_origins.add(origin_url) + expected_artifacts[origin_url].append( + { + "url": download_url, + "filename": download_url.split("/")[-1], + "version": version, + "length": size, + "checksums": {"sha256": checksum_sha256}, + } + ) + expected_module_metadata[origin_url].append( + { + "name": distribution, + "version": version, + "cpan_author": author, + "author": author_fullname if author_fullname != "unknown" else author, + "date": date, + "release_name": release_name, + } + ) + + assert res.pages == 1 + assert res.origins == len(expected_origins) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results @@ -30,3 +100,8 @@ def test_cpan_lister(datadir, 
requests_mock_datadir_visits, swh_scheduler): for origin in scheduler_origins: assert origin.visit_type == "cpan" assert origin.url in expected_origins + assert origin.extra_loader_arguments == { + "api_base_url": "https://fastapi.metacpan.org/v1", + "artifacts": expected_artifacts[origin.url], + "module_metadata": expected_module_metadata[origin.url], + }