cpan: Improve listing process by querying the metacpan release endpoint

Instead of querying the metacpan distribution endpoint to list origins,
use the release endpoint, which enables listing all artifacts
associated with CPAN packages by scrolling through results.

Compared to the previous implementation, this makes it possible to compute a
last_update date for all CPAN packages and also to obtain artifact sha256
checksums that will be used by the CPAN loader to check download integrity.

As the multiple versions of a module are spread across multiple pages
from the CPAN API, origins are sent to the scheduler once all pages have
been processed; it is also faster to proceed that way.

Related to T2833
This commit is contained in:
Antoine Lambert 2022-09-27 16:34:38 +02:00
parent 108816f232
commit f57b8f3a2c
8 changed files with 701 additions and 159 deletions

View file

@ -16,9 +16,9 @@ As of September 2022 `cpan.org`_ list 43675 package names.
Origins retrieving strategy
---------------------------
To get a list of all package names we call a first `http api endpoint`_ that
retrieve results and a ``_scroll_id`` that will be used to scroll pages through
`search`_ endpoint.
To get a list of all package names and their associated release artifacts we call
a first `http api endpoint`_ that retrieves results and a ``_scroll_id`` that will
be used to scroll pages through the `search`_ endpoint.
Page listing
------------
@ -57,7 +57,7 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950

View file

@ -3,8 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from datetime import datetime
import logging
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Set
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -14,7 +18,33 @@ from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
CpanListerPage = List[Dict[str, Any]]
CpanListerPage = Set[str]
def get_field_value(entry: Dict[str, Any], field_name: str) -> Any:
    """
    Splits ``field_name`` on ``.``, and uses it as a path in the nested
    ``entry["_source"]`` dictionary. If a value does not exist, returns None.

    When the value found is a list, its first element is returned, or None
    when that list is empty. None is also returned when the path goes through
    a node that is not a dictionary.

    >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}}
    >>> get_field_value(entry, "foo")
    1
    >>> get_field_value(entry, "bar")
    {'baz': 2, 'qux': [3]}
    >>> get_field_value(entry, "bar.baz")
    2
    >>> get_field_value(entry, "bar.qux")
    3
    >>> get_field_value(entry, "bar.missing") is None
    True
    >>> get_field_value({"_source": {"foo": []}}, "foo") is None
    True
    """
    field_value = entry["_source"]
    for field in field_name.split("."):
        if not isinstance(field_value, dict):
            # the path crosses a missing, null or scalar node: no value there
            return None
        field_value = field_value.get(field)
    # scrolled results might have field value in a list
    if isinstance(field_value, list):
        # an empty list carries no value, do not fail on it
        field_value = field_value[0] if field_value else None
    return field_value
class CpanLister(StatelessLister[CpanListerPage]):
@ -25,7 +55,15 @@ class CpanLister(StatelessLister[CpanListerPage]):
VISIT_TYPE = "cpan"
INSTANCE = "cpan"
BASE_URL = "https://fastapi.metacpan.org/v1/"
API_BASE_URL = "https://fastapi.metacpan.org/v1"
REQUIRED_DOC_FIELDS = [
"download_url",
"checksum_sha256",
"distribution",
"version",
]
OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"]
ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}"
def __init__(
self,
@ -36,26 +74,82 @@ class CpanLister(StatelessLister[CpanListerPage]):
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.BASE_URL,
url=self.API_BASE_URL,
)
self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
self.module_names: Set[str] = set()
def process_release_page(self, page: List[Dict[str, Any]]) -> None:
    """Collect artifacts, metadata and release dates from a page of releases.

    Entries without a ``_source`` document, or whose ``_source`` lacks one of
    ``REQUIRED_DOC_FIELDS``, are logged and skipped. For every other entry
    the ``artifacts``, ``module_metadata``, ``release_dates`` and
    ``module_names`` attributes are updated, keyed by the value of the
    ``distribution`` field.

    Args:
        page: release entries, each expected to hold its fields under the
            ``_source`` key
    """
    for entry in page:
        source = entry.get("_source")
        if not isinstance(source, dict) or not all(
            k in source for k in self.REQUIRED_DOC_FIELDS
        ):
            logger.warning(
                "Skipping release entry %s as some required fields are missing",
                entry.get("_source"),
            )
            continue

        module_name = get_field_value(entry, "distribution")
        module_version = get_field_value(entry, "version")
        module_download_url = get_field_value(entry, "download_url")
        module_sha256_checksum = get_field_value(entry, "checksum_sha256")
        module_date = get_field_value(entry, "date")
        module_size = get_field_value(entry, "stat.size")
        module_author = get_field_value(entry, "author")
        module_author_fullname = get_field_value(entry, "metadata.author")
        release_name = get_field_value(entry, "name")

        self.artifacts[module_name].append(
            {
                "url": module_download_url,
                "filename": module_download_url.split("/")[-1],
                "checksums": {"sha256": module_sha256_checksum},
                "version": module_version,
                "length": module_size,
            }
        )

        self.module_metadata[module_name].append(
            {
                "name": module_name,
                "version": module_version,
                "cpan_author": module_author,
                # fall back to the CPAN author id when no usable full name
                # is provided in the release metadata
                "author": (
                    module_author_fullname
                    if module_author_fullname not in (None, "", "unknown")
                    else module_author
                ),
                "date": module_date,
                "release_name": release_name,
            }
        )

        # "date" is only an optional field: parsing a missing value would
        # raise and abort the processing of all remaining releases
        # NOTE(review): a module whose releases all lack a date gets no
        # release date recorded; code computing max() over its dates must
        # cope with an empty list
        if module_date is not None:
            self.release_dates[module_name].append(iso8601.parse_date(module_date))

        self.module_names.add(module_name)
def get_pages(self) -> Iterator[CpanListerPage]:
"""Yield an iterator which returns 'page'"""
endpoint = f"{self.BASE_URL}distribution/_search"
scrollendpoint = f"{self.BASE_URL}_search/scroll"
size: int = 1000
endpoint = f"{self.API_BASE_URL}/release/_search"
scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
size = 1000
res = self.http_request(
endpoint,
params={
"fields": ["name"],
"_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
"size": size,
"scroll": "1m",
},
)
data = res.json()["hits"]["hits"]
yield data
self.process_release_page(data)
_scroll_id = res.json()["_scroll_id"]
@ -65,27 +159,25 @@ class CpanLister(StatelessLister[CpanListerPage]):
)
data = scroll_res.json()["hits"]["hits"]
_scroll_id = scroll_res.json()["_scroll_id"]
yield data
self.process_release_page(data)
def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
yield self.module_names
def get_origins_from_page(
self, module_names: CpanListerPage
) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
for entry in page:
# Skip the entry if 'fields' or 'name' keys are missing
if "fields" not in entry or "name" not in entry["fields"]:
continue
pkgname = entry["fields"]["name"]
# TODO: Check why sometimes its a one value list
if type(pkgname) != str:
pkgname = pkgname[0]
url = f"https://metacpan.org/dist/{pkgname}"
for module_name in module_names:
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=None,
url=self.ORIGIN_URL_PATTERN.format(module_name=module_name),
last_update=max(self.release_dates[module_name]),
extra_loader_arguments={
"api_base_url": self.API_BASE_URL,
"artifacts": self.artifacts[module_name],
"module_metadata": self.module_metadata[module_name],
},
)

View file

@ -0,0 +1,247 @@
{
"_shards": {
"failed": 0,
"total": 3,
"successful": 3
},
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
"terminated_early": true,
"took": 3,
"hits": {
"max_score": 1.0,
"hits": [
{
"_score": 1.0,
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-0.1210",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz",
"version": "0.1210",
"distribution": "Validator-Custom",
"date": "2010-08-14T01:41:56",
"stat": {
"size": 17608
},
"checksum_sha256": "f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
}
},
"_type": "release",
"_index": "cpan_v1_01",
"_id": "VGApYqMT4TCxUzHcITn8ZhGHlxE"
},
{
"_type": "release",
"_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8",
"_index": "cpan_v1_01",
"_score": 1.0,
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-0.1208",
"date": "2010-07-28T23:00:52",
"distribution": "Validator-Custom",
"version": "0.1208",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b",
"stat": {
"size": 17489
}
}
},
{
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1619",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz",
"version": "0.1619",
"distribution": "DBIx-Custom",
"date": "2010-10-20T15:01:35",
"stat": {
"size": 27195
},
"checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5",
"metadata": {
"author": [
"unknown"
]
}
},
"_score": 1.0,
"_id": "g7562_4h9d693lxvc_cgEOTJAZk",
"_index": "cpan_v1_01",
"_type": "release"
},
{
"_score": 1.0,
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1401",
"version": "0.1401",
"distribution": "DBIx-Custom",
"date": "2010-05-01T23:29:22",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz",
"checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"stat": {
"size": 22711
}
},
"_type": "release",
"_id": "bLRsOH2sevNQ6Q93exgkvCZONo0",
"_index": "cpan_v1_01"
},
{
"_type": "release",
"_index": "cpan_v1_01",
"_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY",
"_score": 1.0,
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1301",
"stat": {
"size": 22655
},
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz",
"distribution": "DBIx-Custom",
"version": "0.1301",
"date": "2010-05-01T13:02:19"
}
},
{
"_score": 1.0,
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1602",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a",
"stat": {
"size": 18999
},
"date": "2010-06-25T12:11:33",
"distribution": "DBIx-Custom",
"version": "0.1602",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz"
},
"_type": "release",
"_index": "cpan_v1_01",
"_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o"
},
{
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-0.1204",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz",
"version": "0.1204",
"distribution": "Validator-Custom",
"date": "2010-07-08T13:14:23",
"stat": {
"size": 13256
},
"checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
}
},
"_score": 1.0,
"_index": "cpan_v1_01",
"_id": "M_lLALu56mb_cDK_jAXwUB2PUlw",
"_type": "release"
},
{
"_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E",
"_index": "cpan_v1_01",
"_type": "release",
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-0.1203",
"stat": {
"size": 12572
},
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz",
"date": "2010-07-07T13:29:41",
"distribution": "Validator-Custom",
"version": "0.1203"
},
"_score": 1.0
},
{
"_index": "cpan_v1_01",
"_id": "ZaT8bwXejVTHmrzZCqNJPRFImBY",
"_type": "release",
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1641",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6",
"stat": {
"size": 40480
},
"distribution": "DBIx-Custom",
"version": "0.1641",
"date": "2011-01-27T05:19:14",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz"
},
"_score": 1.0
},
{
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-0.1646",
"version": "0.1646",
"distribution": "DBIx-Custom",
"date": "2011-02-18T17:48:52",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24",
"stat": {
"size": 46577
}
},
"_score": 1.0,
"_index": "cpan_v1_01",
"_id": "j21QIzHRYZKz1vobyGAPa2BuO50",
"_type": "release"
}
],
"total": 359941
},
"timed_out": false
}

View file

@ -1,50 +0,0 @@
{
"_shards" : {
"successful" : 3,
"total" : 3,
"failed" : 0
},
"timed_out" : false,
"hits" : {
"max_score" : 1.0,
"hits" : [
{
"_type" : "distribution",
"fields" : {
"name" : [
"EventSource-Server"
]
},
"_id" : "EventSource-Server",
"_index" : "cpan_v1_01",
"_score" : 1.0
},
{
"_score" : 1.0,
"_index" : "cpan_v1_01",
"_id" : "Interchange6",
"fields" : {
"name" : [
"Interchange6"
]
},
"_type" : "distribution"
},
{
"_score" : 1.0,
"_index" : "cpan_v1_01",
"_id" : "Internals-CountObjects",
"fields" : {
"name" : [
"Internals-CountObjects"
]
},
"_type" : "distribution"
}
],
"total" : 43675
},
"took" : 72,
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
"terminated_early" : true
}

View file

@ -1,16 +0,0 @@
{
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
"took" : 1,
"hits" : {
"hits" : [],
"total" : 43675,
"max_score" : 1.0
},
"terminated_early" : true,
"timed_out" : false,
"_shards" : {
"failed" : 0,
"total" : 3,
"successful" : 3
}
}

View file

@ -1,52 +0,0 @@
{
"_shards" : {
"successful" : 3,
"failed" : 0,
"total" : 3
},
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
"took" : 61,
"hits" : {
"max_score" : 1.0,
"hits" : [
{
"_score" : 1.0,
"_index" : "cpan_v1_01",
"_id" : "openerserver_perl-master",
"fields" : {
"name" : "openerserver_perl-master"
},
"_type" : "distribution"
},
{
"_score" : 1.0,
"_type" : "distribution",
"fields" : {
"name" : "Getopt_Auto"
},
"_id" : "Getopt_Auto",
"_index" : "cpan_v1_01"
},
{
"_id" : "App-Booklist",
"_index" : "cpan_v1_01",
"_type" : "distribution",
"fields" : {
"name" : "App-Booklist"
},
"_score" : 1.0
},
{
"fields" : {
"name" : "EuclideanRhythm"
},
"_type" : "distribution",
"_index" : "cpan_v1_01",
"_id" : "EuclideanRhythm",
"_score" : 1.0
}
],
"total" : 43675
},
"timed_out" : false
}

View file

@ -0,0 +1,246 @@
{
"timed_out": false,
"_shards": {
"total": 3,
"failed": 0,
"successful": 3
},
"hits": {
"hits": [
{
"_index": "cpan_v1_01",
"_id": "40MmOvf_SQx_mr8Kj9Eush14a3E",
"_source": {
"author": "KRYDE",
"name": "math-image-46",
"date": "2011-03-02T00:46:14",
"download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz",
"checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa",
"version": "46",
"stat": {
"size": 533502
},
"distribution": "math-image",
"metadata": {
"author": [
"Kevin Ryde <user42@zip.com.au>"
]
}
},
"_type": "release",
"_score": 1.0
},
{
"_index": "cpan_v1_01",
"_source": {
"author": "MITHALDU",
"name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL",
"distribution": "Dist-Zilla-Plugin-ProgCriticTests",
"metadata": {
"author": [
"Christian Walde <mithaldu@yahoo.de>"
]
},
"stat": {
"size": 16918
},
"checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401",
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz",
"date": "2010-06-07T14:43:36",
"version": "1.101580"
},
"_id": "6df77_MLO_BG8YC_vQKsay7OFYM",
"_type": "release",
"_score": 1.0
},
{
"_index": "cpan_v1_01",
"_type": "release",
"_source": {
"author": "MITHUN",
"name": "Net-Rapidshare-v0.04",
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz",
"date": "2009-07-28T05:57:26",
"checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f",
"version": "v0.04",
"distribution": "Net-Rapidshare",
"metadata": {
"author": [
"unknown"
]
},
"stat": {
"size": 15068
}
},
"_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM",
"_score": 1.0
},
{
"_index": "cpan_v1_01",
"_source": {
"author": "MITHUN",
"name": "Net-Rapidshare-v0.05",
"version": "v0.05",
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz",
"date": "2009-12-21T00:29:48",
"checksum_sha256": "e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe",
"metadata": {
"author": [
"unknown"
]
},
"distribution": "Net-Rapidshare",
"stat": {
"size": 15971
}
},
"_id": "pExMIwabhz_0S1rX7xAY_lq0GTY",
"_type": "release",
"_score": 1.0
},
{
"_type": "release",
"_source": {
"author": "MITHUN",
"name": "Net-Rapidshare-v0.0.1",
"version": "v0.0.1",
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz",
"checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905",
"date": "2009-07-18T22:56:38",
"stat": {
"size": 15161
},
"metadata": {
"author": [
"unknown"
]
},
"distribution": "Net-Rapidshare"
},
"_id": "eqkhDnj0efXHisWRrMZZ1EHFgug",
"_index": "cpan_v1_01",
"_score": 1.0
},
{
"_score": 1.0,
"_index": "cpan_v1_01",
"_type": "release",
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-Basic-0.0101",
"stat": {
"size": 3409
},
"distribution": "DBIx-Custom-Basic",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz",
"date": "2009-11-08T04:18:30",
"checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f",
"version": "0.0101"
},
"_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg"
},
{
"_score": 1.0,
"_index": "cpan_v1_01",
"_source": {
"author": "KIMOTO",
"name": "DBIx-Custom-SQLite-0.0101",
"version": "0.0101",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz",
"date": "2009-11-08T04:20:31",
"checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"distribution": "DBIx-Custom-SQLite",
"stat": {
"size": 3927
}
},
"_type": "release",
"_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8"
},
{
"_index": "cpan_v1_01",
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-Ext-Mojolicious-0.0103",
"checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz",
"date": "2010-01-16T14:51:11",
"version": "0.0103",
"stat": {
"size": 4190
},
"distribution": "Validator-Custom-Ext-Mojolicious",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
}
},
"_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic",
"_type": "release",
"_score": 1.0
},
{
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-Ext-Mojolicious-0.0102",
"stat": {
"size": 4257
},
"distribution": "Validator-Custom-Ext-Mojolicious",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
},
"date": "2010-01-15T14:07:24",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz",
"checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed",
"version": "0.0102"
},
"_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0",
"_type": "release",
"_index": "cpan_v1_01",
"_score": 1.0
},
{
"_index": "cpan_v1_01",
"_type": "release",
"_source": {
"author": "KIMOTO",
"name": "Validator-Custom-0.1207",
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz",
"date": "2010-07-28T13:42:23",
"checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0",
"version": "0.1207",
"stat": {
"size": 16985
},
"distribution": "Validator-Custom",
"metadata": {
"author": [
"Yuki Kimoto <kimoto.yuki@gmail.com>"
]
}
},
"_id": "NWJOqmjEinjfJqawfpkEpEhu4d0",
"_score": 1.0
}
],
"total": 359941,
"max_score": 1.0
},
"took": 14,
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
}

View file

@ -3,25 +3,95 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from itertools import chain
import json
from pathlib import Path
import pytest
from swh.lister.cpan.lister import CpanLister
expected_origins = [
"https://metacpan.org/dist/App-Booklist",
"https://metacpan.org/dist/EuclideanRhythm",
"https://metacpan.org/dist/EventSource-Server",
"https://metacpan.org/dist/Getopt_Auto",
"https://metacpan.org/dist/Interchange6",
"https://metacpan.org/dist/Internals-CountObjects",
"https://metacpan.org/dist/openerserver_perl-master",
]
@pytest.fixture
def release_search_response(datadir):
    """Return the ``v1_release__search`` data file of ``datadir`` decoded from JSON."""
    data_file = Path(datadir) / "https_fastapi.metacpan.org" / "v1_release__search"
    raw_content = data_file.read_bytes()
    return json.loads(raw_content)
def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
@pytest.fixture
def release_scroll_first_response(datadir):
    """Return the ``v1__search_scroll`` data file of ``datadir`` decoded from JSON."""
    data_file = Path(datadir) / "https_fastapi.metacpan.org" / "v1__search_scroll"
    raw_content = data_file.read_bytes()
    return json.loads(raw_content)
@pytest.fixture(autouse=True)
def mock_network_requests(
    requests_mock, release_search_response, release_scroll_first_response
):
    """Register mocked responses for the metacpan release search and scroll URLs.

    The release search URL answers with ``release_search_response``. The
    scroll URL is given a list of two responses: ``release_scroll_first_response``
    followed by a payload whose hits list is empty.
    """
    search_url = "https://fastapi.metacpan.org/v1/release/_search"
    scroll_url = "https://fastapi.metacpan.org/v1/_search/scroll"

    # payload without any hit, registered after the scroll data
    no_more_hits = {"hits": {"hits": []}, "_scroll_id": ""}
    scroll_responses = [
        {"json": release_scroll_first_response},
        {"json": no_more_hits},
    ]

    requests_mock.get(search_url, json=release_search_response)
    requests_mock.get(scroll_url, scroll_responses)
def test_cpan_lister(
swh_scheduler, release_search_response, release_scroll_first_response
):
lister = CpanLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 3
assert res.origins == 4 + 3 + 0
expected_origins = set()
expected_artifacts = defaultdict(list)
expected_module_metadata = defaultdict(list)
for release in chain(
release_search_response["hits"]["hits"],
release_scroll_first_response["hits"]["hits"],
):
distribution = release["_source"]["distribution"]
release_name = release["_source"]["name"]
checksum_sha256 = release["_source"]["checksum_sha256"]
download_url = release["_source"]["download_url"]
version = release["_source"]["version"]
size = release["_source"]["stat"]["size"]
author = release["_source"]["author"]
author_fullname = release["_source"]["metadata"]["author"][0]
date = release["_source"]["date"]
origin_url = f"https://metacpan.org/dist/{distribution}"
expected_origins.add(origin_url)
expected_artifacts[origin_url].append(
{
"url": download_url,
"filename": download_url.split("/")[-1],
"version": version,
"length": size,
"checksums": {"sha256": checksum_sha256},
}
)
expected_module_metadata[origin_url].append(
{
"name": distribution,
"version": version,
"cpan_author": author,
"author": author_fullname if author_fullname != "unknown" else author,
"date": date,
"release_name": release_name,
}
)
assert res.pages == 1
assert res.origins == len(expected_origins)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
@ -30,3 +100,8 @@ def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
for origin in scheduler_origins:
assert origin.visit_type == "cpan"
assert origin.url in expected_origins
assert origin.extra_loader_arguments == {
"api_base_url": "https://fastapi.metacpan.org/v1",
"artifacts": expected_artifacts[origin.url],
"module_metadata": expected_module_metadata[origin.url],
}