cpan: Improve listing process by querying the metacpan release endpoint
Instead of querying the metacpan distribution endpoint to list origins, use the release endpoint, which makes it possible to list all artifacts associated with CPAN packages by scrolling through the results. Compared to the previous implementation, this makes it possible to compute a last_update date for all CPAN packages and also to obtain artifact sha256 checksums that will be used by the CPAN loader to check download integrity. As the multiple versions of a module are spread across multiple pages of the CPAN API, origins are sent to the scheduler once all pages have been processed; it is also faster to proceed that way. Related to T2833
This commit is contained in:
parent
108816f232
commit
f57b8f3a2c
8 changed files with 701 additions and 159 deletions
|
@ -16,9 +16,9 @@ As of September 2022 `cpan.org`_ list 43675 package names.
|
|||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
To get a list of all package names we call a first `http api endpoint`_ that
|
||||
retrieve results and a ``_scroll_id`` that will be used to scroll pages through
|
||||
`search`_ endpoint.
|
||||
To get a list of all package names and their associated release artifacts we call
|
||||
a first `http api endpoint`_ that retrieves results and a ``_scroll_id`` that will
|
||||
be used to scroll pages through the `search`_ endpoint.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
@ -57,7 +57,7 @@ You can follow lister execution by displaying logs of swh-lister service::
|
|||
|
||||
.. _cpan.org: https://cpan.org/
|
||||
.. _metacpan.org: https://metacpan.org/
|
||||
.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
|
||||
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
|
||||
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
|
||||
|
||||
|
||||
|
|
|
@ -3,8 +3,12 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from typing import Any, Dict, Iterator, List, Optional, Set
|
||||
|
||||
import iso8601
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
@ -14,7 +18,33 @@ from ..pattern import CredentialsType, StatelessLister
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
CpanListerPage = List[Dict[str, Any]]
|
||||
CpanListerPage = Set[str]
|
||||
|
||||
|
||||
def get_field_value(entry, field_name):
    """
    Splits ``field_name`` on ``.``, and use it as path in the nested ``entry``
    dictionary (starting from ``entry["_source"]``). If a value does not exist,
    returns None.

    A path is considered non-existent when a key is missing, when an
    intermediate value is not a dictionary, or when the final value is an
    empty list. A non-empty list is collapsed to its first element.

    >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}}
    >>> get_field_value(entry, "foo")
    1
    >>> get_field_value(entry, "bar")
    {'baz': 2, 'qux': [3]}
    >>> get_field_value(entry, "bar.baz")
    2
    >>> get_field_value(entry, "bar.qux")
    3
    >>> get_field_value(entry, "bar.missing") is None
    True
    >>> get_field_value({"_source": {"foo": []}}, "foo") is None
    True
    """
    field_value = entry["_source"]
    for field in field_name.split("."):
        # A scalar, a list or None cannot be descended into: the requested
        # path does not exist, so report it as missing instead of raising.
        if not isinstance(field_value, dict):
            return None
        field_value = field_value.get(field)
    # scrolled results might have field value in a list
    if isinstance(field_value, list):
        # An empty list carries no value; indexing it would raise IndexError.
        field_value = field_value[0] if field_value else None
    return field_value
|
||||
|
||||
|
||||
class CpanLister(StatelessLister[CpanListerPage]):
|
||||
|
@ -25,7 +55,15 @@ class CpanLister(StatelessLister[CpanListerPage]):
|
|||
VISIT_TYPE = "cpan"
|
||||
INSTANCE = "cpan"
|
||||
|
||||
BASE_URL = "https://fastapi.metacpan.org/v1/"
|
||||
API_BASE_URL = "https://fastapi.metacpan.org/v1"
|
||||
REQUIRED_DOC_FIELDS = [
|
||||
"download_url",
|
||||
"checksum_sha256",
|
||||
"distribution",
|
||||
"version",
|
||||
]
|
||||
OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"]
|
||||
ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -36,26 +74,82 @@ class CpanLister(StatelessLister[CpanListerPage]):
|
|||
scheduler=scheduler,
|
||||
credentials=credentials,
|
||||
instance=self.INSTANCE,
|
||||
url=self.BASE_URL,
|
||||
url=self.API_BASE_URL,
|
||||
)
|
||||
|
||||
self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||||
self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||||
self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
|
||||
self.module_names: Set[str] = set()
|
||||
|
||||
def process_release_page(self, page: List[Dict[str, Any]]):
    """Accumulate release information found in a page of search results.

    For every entry of ``page`` holding all ``REQUIRED_DOC_FIELDS``, the
    following instance attributes are updated, keyed by distribution name:

    - ``self.artifacts``: url, filename, sha256 checksum, version and length
      of the release archive
    - ``self.module_metadata``: name, version, author info, date and
      release name
    - ``self.release_dates``: parsed release dates
    - ``self.module_names``: set of distribution names seen so far

    Entries with a missing (or null) required field are skipped and a
    warning is logged.

    Args:
        page: list of hits, each expected to hold a ``_source`` dictionary
    """
    for entry in page:
        source = entry.get("_source")

        # A required key holding a null value is as unusable as an absent
        # key (e.g. ``download_url`` is split below), so check values.
        if not isinstance(source, dict) or any(
            get_field_value(entry, k) is None for k in self.REQUIRED_DOC_FIELDS
        ):
            logger.warning(
                "Skipping release entry %s as some required fields are missing",
                source,
            )
            continue

        module_name = get_field_value(entry, "distribution")
        module_version = get_field_value(entry, "version")
        module_download_url = get_field_value(entry, "download_url")
        module_sha256_checksum = get_field_value(entry, "checksum_sha256")
        module_date = get_field_value(entry, "date")
        module_size = get_field_value(entry, "stat.size")
        module_author = get_field_value(entry, "author")
        module_author_fullname = get_field_value(entry, "metadata.author")
        release_name = get_field_value(entry, "name")

        self.artifacts[module_name].append(
            {
                "url": module_download_url,
                "filename": module_download_url.split("/")[-1],
                "checksums": {"sha256": module_sha256_checksum},
                "version": module_version,
                "length": module_size,
            }
        )

        self.module_metadata[module_name].append(
            {
                "name": module_name,
                "version": module_version,
                "cpan_author": module_author,
                # Fall back on the CPAN author id when no usable full name
                # is provided in the release metadata.
                "author": (
                    module_author_fullname
                    if module_author_fullname not in (None, "", "unknown")
                    else module_author
                ),
                "date": module_date,
                "release_name": release_name,
            }
        )

        # "date" is an optional field: only parse it when present so that a
        # single undated release does not abort the whole listing.
        # NOTE(review): a distribution for which no release carries a date
        # ends up without any entry in ``self.release_dates`` -- confirm
        # consumers of that mapping cope with it.
        if module_date:
            self.release_dates[module_name].append(iso8601.parse_date(module_date))

        self.module_names.add(module_name)
|
||||
|
||||
def get_pages(self) -> Iterator[CpanListerPage]:
|
||||
"""Yield an iterator which returns 'page'"""
|
||||
|
||||
endpoint = f"{self.BASE_URL}distribution/_search"
|
||||
scrollendpoint = f"{self.BASE_URL}_search/scroll"
|
||||
size: int = 1000
|
||||
endpoint = f"{self.API_BASE_URL}/release/_search"
|
||||
scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
|
||||
size = 1000
|
||||
|
||||
res = self.http_request(
|
||||
endpoint,
|
||||
params={
|
||||
"fields": ["name"],
|
||||
"_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
|
||||
"size": size,
|
||||
"scroll": "1m",
|
||||
},
|
||||
)
|
||||
data = res.json()["hits"]["hits"]
|
||||
yield data
|
||||
self.process_release_page(data)
|
||||
|
||||
_scroll_id = res.json()["_scroll_id"]
|
||||
|
||||
|
@ -65,27 +159,25 @@ class CpanLister(StatelessLister[CpanListerPage]):
|
|||
)
|
||||
data = scroll_res.json()["hits"]["hits"]
|
||||
_scroll_id = scroll_res.json()["_scroll_id"]
|
||||
yield data
|
||||
self.process_release_page(data)
|
||||
|
||||
def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
|
||||
yield self.module_names
|
||||
|
||||
def get_origins_from_page(
|
||||
self, module_names: CpanListerPage
|
||||
) -> Iterator[ListedOrigin]:
|
||||
"""Iterate on all pages and yield ListedOrigin instances."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for entry in page:
|
||||
# Skip the entry if 'fields' or 'name' keys are missing
|
||||
if "fields" not in entry or "name" not in entry["fields"]:
|
||||
continue
|
||||
|
||||
pkgname = entry["fields"]["name"]
|
||||
# TODO: Check why sometimes its a one value list
|
||||
if type(pkgname) != str:
|
||||
pkgname = pkgname[0]
|
||||
|
||||
url = f"https://metacpan.org/dist/{pkgname}"
|
||||
|
||||
for module_name in module_names:
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
visit_type=self.VISIT_TYPE,
|
||||
url=url,
|
||||
last_update=None,
|
||||
url=self.ORIGIN_URL_PATTERN.format(module_name=module_name),
|
||||
last_update=max(self.release_dates[module_name]),
|
||||
extra_loader_arguments={
|
||||
"api_base_url": self.API_BASE_URL,
|
||||
"artifacts": self.artifacts[module_name],
|
||||
"module_metadata": self.module_metadata[module_name],
|
||||
},
|
||||
)
|
||||
|
|
|
@ -0,0 +1,247 @@
|
|||
{
|
||||
"_shards": {
|
||||
"failed": 0,
|
||||
"total": 3,
|
||||
"successful": 3
|
||||
},
|
||||
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"terminated_early": true,
|
||||
"took": 3,
|
||||
"hits": {
|
||||
"max_score": 1.0,
|
||||
"hits": [
|
||||
{
|
||||
"_score": 1.0,
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-0.1210",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz",
|
||||
"version": "0.1210",
|
||||
"distribution": "Validator-Custom",
|
||||
"date": "2010-08-14T01:41:56",
|
||||
"stat": {
|
||||
"size": 17608
|
||||
},
|
||||
"checksum_sha256": "f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_type": "release",
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "VGApYqMT4TCxUzHcITn8ZhGHlxE"
|
||||
},
|
||||
{
|
||||
"_type": "release",
|
||||
"_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8",
|
||||
"_index": "cpan_v1_01",
|
||||
"_score": 1.0,
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-0.1208",
|
||||
"date": "2010-07-28T23:00:52",
|
||||
"distribution": "Validator-Custom",
|
||||
"version": "0.1208",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b",
|
||||
"stat": {
|
||||
"size": 17489
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1619",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz",
|
||||
"version": "0.1619",
|
||||
"distribution": "DBIx-Custom",
|
||||
"date": "2010-10-20T15:01:35",
|
||||
"stat": {
|
||||
"size": 27195
|
||||
},
|
||||
"checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"unknown"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_score": 1.0,
|
||||
"_id": "g7562_4h9d693lxvc_cgEOTJAZk",
|
||||
"_index": "cpan_v1_01",
|
||||
"_type": "release"
|
||||
},
|
||||
{
|
||||
"_score": 1.0,
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1401",
|
||||
"version": "0.1401",
|
||||
"distribution": "DBIx-Custom",
|
||||
"date": "2010-05-01T23:29:22",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz",
|
||||
"checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"stat": {
|
||||
"size": 22711
|
||||
}
|
||||
},
|
||||
"_type": "release",
|
||||
"_id": "bLRsOH2sevNQ6Q93exgkvCZONo0",
|
||||
"_index": "cpan_v1_01"
|
||||
},
|
||||
{
|
||||
"_type": "release",
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY",
|
||||
"_score": 1.0,
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1301",
|
||||
"stat": {
|
||||
"size": 22655
|
||||
},
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz",
|
||||
"distribution": "DBIx-Custom",
|
||||
"version": "0.1301",
|
||||
"date": "2010-05-01T13:02:19"
|
||||
}
|
||||
},
|
||||
{
|
||||
"_score": 1.0,
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1602",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a",
|
||||
"stat": {
|
||||
"size": 18999
|
||||
},
|
||||
"date": "2010-06-25T12:11:33",
|
||||
"distribution": "DBIx-Custom",
|
||||
"version": "0.1602",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz"
|
||||
},
|
||||
"_type": "release",
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o"
|
||||
},
|
||||
{
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-0.1204",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz",
|
||||
"version": "0.1204",
|
||||
"distribution": "Validator-Custom",
|
||||
"date": "2010-07-08T13:14:23",
|
||||
"stat": {
|
||||
"size": 13256
|
||||
},
|
||||
"checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_score": 1.0,
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "M_lLALu56mb_cDK_jAXwUB2PUlw",
|
||||
"_type": "release"
|
||||
},
|
||||
{
|
||||
"_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E",
|
||||
"_index": "cpan_v1_01",
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-0.1203",
|
||||
"stat": {
|
||||
"size": 12572
|
||||
},
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz",
|
||||
"date": "2010-07-07T13:29:41",
|
||||
"distribution": "Validator-Custom",
|
||||
"version": "0.1203"
|
||||
},
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "ZaT8bwXejVTHmrzZCqNJPRFImBY",
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1641",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6",
|
||||
"stat": {
|
||||
"size": 40480
|
||||
},
|
||||
"distribution": "DBIx-Custom",
|
||||
"version": "0.1641",
|
||||
"date": "2011-01-27T05:19:14",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz"
|
||||
},
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-0.1646",
|
||||
"version": "0.1646",
|
||||
"distribution": "DBIx-Custom",
|
||||
"date": "2011-02-18T17:48:52",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24",
|
||||
"stat": {
|
||||
"size": 46577
|
||||
}
|
||||
},
|
||||
"_score": 1.0,
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "j21QIzHRYZKz1vobyGAPa2BuO50",
|
||||
"_type": "release"
|
||||
}
|
||||
],
|
||||
"total": 359941
|
||||
},
|
||||
"timed_out": false
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
{
|
||||
"_shards" : {
|
||||
"successful" : 3,
|
||||
"total" : 3,
|
||||
"failed" : 0
|
||||
},
|
||||
"timed_out" : false,
|
||||
"hits" : {
|
||||
"max_score" : 1.0,
|
||||
"hits" : [
|
||||
{
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"EventSource-Server"
|
||||
]
|
||||
},
|
||||
"_id" : "EventSource-Server",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_score" : 1.0
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "Interchange6",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"Interchange6"
|
||||
]
|
||||
},
|
||||
"_type" : "distribution"
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "Internals-CountObjects",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"Internals-CountObjects"
|
||||
]
|
||||
},
|
||||
"_type" : "distribution"
|
||||
}
|
||||
],
|
||||
"total" : 43675
|
||||
},
|
||||
"took" : 72,
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"terminated_early" : true
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
{
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"took" : 1,
|
||||
"hits" : {
|
||||
"hits" : [],
|
||||
"total" : 43675,
|
||||
"max_score" : 1.0
|
||||
},
|
||||
"terminated_early" : true,
|
||||
"timed_out" : false,
|
||||
"_shards" : {
|
||||
"failed" : 0,
|
||||
"total" : 3,
|
||||
"successful" : 3
|
||||
}
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
{
|
||||
"_shards" : {
|
||||
"successful" : 3,
|
||||
"failed" : 0,
|
||||
"total" : 3
|
||||
},
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"took" : 61,
|
||||
"hits" : {
|
||||
"max_score" : 1.0,
|
||||
"hits" : [
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "openerserver_perl-master",
|
||||
"fields" : {
|
||||
"name" : "openerserver_perl-master"
|
||||
},
|
||||
"_type" : "distribution"
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : "Getopt_Auto"
|
||||
},
|
||||
"_id" : "Getopt_Auto",
|
||||
"_index" : "cpan_v1_01"
|
||||
},
|
||||
{
|
||||
"_id" : "App-Booklist",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : "App-Booklist"
|
||||
},
|
||||
"_score" : 1.0
|
||||
},
|
||||
{
|
||||
"fields" : {
|
||||
"name" : "EuclideanRhythm"
|
||||
},
|
||||
"_type" : "distribution",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "EuclideanRhythm",
|
||||
"_score" : 1.0
|
||||
}
|
||||
],
|
||||
"total" : 43675
|
||||
},
|
||||
"timed_out" : false
|
||||
}
|
|
@ -0,0 +1,246 @@
|
|||
{
|
||||
"timed_out": false,
|
||||
"_shards": {
|
||||
"total": 3,
|
||||
"failed": 0,
|
||||
"successful": 3
|
||||
},
|
||||
"hits": {
|
||||
"hits": [
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_id": "40MmOvf_SQx_mr8Kj9Eush14a3E",
|
||||
"_source": {
|
||||
"author": "KRYDE",
|
||||
"name": "math-image-46",
|
||||
"date": "2011-03-02T00:46:14",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz",
|
||||
"checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa",
|
||||
"version": "46",
|
||||
"stat": {
|
||||
"size": 533502
|
||||
},
|
||||
"distribution": "math-image",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Kevin Ryde <user42@zip.com.au>"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_type": "release",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_source": {
|
||||
"author": "MITHALDU",
|
||||
"name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL",
|
||||
"distribution": "Dist-Zilla-Plugin-ProgCriticTests",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Christian Walde <mithaldu@yahoo.de>"
|
||||
]
|
||||
},
|
||||
"stat": {
|
||||
"size": 16918
|
||||
},
|
||||
"checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz",
|
||||
"date": "2010-06-07T14:43:36",
|
||||
"version": "1.101580"
|
||||
},
|
||||
"_id": "6df77_MLO_BG8YC_vQKsay7OFYM",
|
||||
"_type": "release",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "MITHUN",
|
||||
"name": "Net-Rapidshare-v0.04",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz",
|
||||
"date": "2009-07-28T05:57:26",
|
||||
"checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f",
|
||||
"version": "v0.04",
|
||||
"distribution": "Net-Rapidshare",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"unknown"
|
||||
]
|
||||
},
|
||||
"stat": {
|
||||
"size": 15068
|
||||
}
|
||||
},
|
||||
"_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_source": {
|
||||
"author": "MITHUN",
|
||||
"name": "Net-Rapidshare-v0.05",
|
||||
"version": "v0.05",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz",
|
||||
"date": "2009-12-21T00:29:48",
|
||||
"checksum_sha256": "e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"unknown"
|
||||
]
|
||||
},
|
||||
"distribution": "Net-Rapidshare",
|
||||
"stat": {
|
||||
"size": 15971
|
||||
}
|
||||
},
|
||||
"_id": "pExMIwabhz_0S1rX7xAY_lq0GTY",
|
||||
"_type": "release",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "MITHUN",
|
||||
"name": "Net-Rapidshare-v0.0.1",
|
||||
"version": "v0.0.1",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz",
|
||||
"checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905",
|
||||
"date": "2009-07-18T22:56:38",
|
||||
"stat": {
|
||||
"size": 15161
|
||||
},
|
||||
"metadata": {
|
||||
"author": [
|
||||
"unknown"
|
||||
]
|
||||
},
|
||||
"distribution": "Net-Rapidshare"
|
||||
},
|
||||
"_id": "eqkhDnj0efXHisWRrMZZ1EHFgug",
|
||||
"_index": "cpan_v1_01",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_score": 1.0,
|
||||
"_index": "cpan_v1_01",
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-Basic-0.0101",
|
||||
"stat": {
|
||||
"size": 3409
|
||||
},
|
||||
"distribution": "DBIx-Custom-Basic",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz",
|
||||
"date": "2009-11-08T04:18:30",
|
||||
"checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f",
|
||||
"version": "0.0101"
|
||||
},
|
||||
"_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg"
|
||||
},
|
||||
{
|
||||
"_score": 1.0,
|
||||
"_index": "cpan_v1_01",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "DBIx-Custom-SQLite-0.0101",
|
||||
"version": "0.0101",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz",
|
||||
"date": "2009-11-08T04:20:31",
|
||||
"checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"distribution": "DBIx-Custom-SQLite",
|
||||
"stat": {
|
||||
"size": 3927
|
||||
}
|
||||
},
|
||||
"_type": "release",
|
||||
"_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8"
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-Ext-Mojolicious-0.0103",
|
||||
"checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz",
|
||||
"date": "2010-01-16T14:51:11",
|
||||
"version": "0.0103",
|
||||
"stat": {
|
||||
"size": 4190
|
||||
},
|
||||
"distribution": "Validator-Custom-Ext-Mojolicious",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic",
|
||||
"_type": "release",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-Ext-Mojolicious-0.0102",
|
||||
"stat": {
|
||||
"size": 4257
|
||||
},
|
||||
"distribution": "Validator-Custom-Ext-Mojolicious",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
},
|
||||
"date": "2010-01-15T14:07:24",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz",
|
||||
"checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed",
|
||||
"version": "0.0102"
|
||||
},
|
||||
"_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0",
|
||||
"_type": "release",
|
||||
"_index": "cpan_v1_01",
|
||||
"_score": 1.0
|
||||
},
|
||||
{
|
||||
"_index": "cpan_v1_01",
|
||||
"_type": "release",
|
||||
"_source": {
|
||||
"author": "KIMOTO",
|
||||
"name": "Validator-Custom-0.1207",
|
||||
"download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz",
|
||||
"date": "2010-07-28T13:42:23",
|
||||
"checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0",
|
||||
"version": "0.1207",
|
||||
"stat": {
|
||||
"size": 16985
|
||||
},
|
||||
"distribution": "Validator-Custom",
|
||||
"metadata": {
|
||||
"author": [
|
||||
"Yuki Kimoto <kimoto.yuki@gmail.com>"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_id": "NWJOqmjEinjfJqawfpkEpEhu4d0",
|
||||
"_score": 1.0
|
||||
}
|
||||
],
|
||||
"total": 359941,
|
||||
"max_score": 1.0
|
||||
},
|
||||
"took": 14,
|
||||
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
|
||||
}
|
|
@ -3,25 +3,95 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from swh.lister.cpan.lister import CpanLister
|
||||
|
||||
expected_origins = [
|
||||
"https://metacpan.org/dist/App-Booklist",
|
||||
"https://metacpan.org/dist/EuclideanRhythm",
|
||||
"https://metacpan.org/dist/EventSource-Server",
|
||||
"https://metacpan.org/dist/Getopt_Auto",
|
||||
"https://metacpan.org/dist/Interchange6",
|
||||
"https://metacpan.org/dist/Internals-CountObjects",
|
||||
"https://metacpan.org/dist/openerserver_perl-master",
|
||||
]
|
||||
|
||||
@pytest.fixture
def release_search_response(datadir):
    """Parsed JSON content of the ``v1_release__search`` file stored under
    the ``https_fastapi.metacpan.org`` directory of ``datadir``."""
    response_file = Path(datadir) / "https_fastapi.metacpan.org" / "v1_release__search"
    raw_content = response_file.read_bytes()
    return json.loads(raw_content)
|
||||
|
||||
|
||||
def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
|
||||
@pytest.fixture
def release_scroll_first_response(datadir):
    """Parsed JSON content of the ``v1__search_scroll`` file stored under
    the ``https_fastapi.metacpan.org`` directory of ``datadir``."""
    response_file = Path(datadir) / "https_fastapi.metacpan.org" / "v1__search_scroll"
    raw_content = response_file.read_bytes()
    return json.loads(raw_content)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_network_requests(
    requests_mock, release_search_response, release_scroll_first_response
):
    """Register canned responses for the two metacpan API URLs.

    The release search URL always answers with ``release_search_response``.
    The scroll URL is given a list of two responses: first
    ``release_scroll_first_response``, then a payload with an empty list
    of hits.
    """
    # Payload used for the second scroll response: no hits
    empty_scroll_page = {"hits": {"hits": []}, "_scroll_id": ""}
    scroll_responses = [
        {"json": release_scroll_first_response},
        {"json": empty_scroll_page},
    ]

    requests_mock.get(
        "https://fastapi.metacpan.org/v1/release/_search",
        json=release_search_response,
    )
    requests_mock.get(
        "https://fastapi.metacpan.org/v1/_search/scroll",
        scroll_responses,
    )
|
||||
|
||||
|
||||
def test_cpan_lister(
|
||||
swh_scheduler, release_search_response, release_scroll_first_response
|
||||
):
|
||||
lister = CpanLister(scheduler=swh_scheduler)
|
||||
res = lister.run()
|
||||
|
||||
assert res.pages == 3
|
||||
assert res.origins == 4 + 3 + 0
|
||||
expected_origins = set()
|
||||
expected_artifacts = defaultdict(list)
|
||||
expected_module_metadata = defaultdict(list)
|
||||
for release in chain(
|
||||
release_search_response["hits"]["hits"],
|
||||
release_scroll_first_response["hits"]["hits"],
|
||||
):
|
||||
distribution = release["_source"]["distribution"]
|
||||
release_name = release["_source"]["name"]
|
||||
checksum_sha256 = release["_source"]["checksum_sha256"]
|
||||
download_url = release["_source"]["download_url"]
|
||||
version = release["_source"]["version"]
|
||||
size = release["_source"]["stat"]["size"]
|
||||
author = release["_source"]["author"]
|
||||
author_fullname = release["_source"]["metadata"]["author"][0]
|
||||
date = release["_source"]["date"]
|
||||
origin_url = f"https://metacpan.org/dist/{distribution}"
|
||||
expected_origins.add(origin_url)
|
||||
expected_artifacts[origin_url].append(
|
||||
{
|
||||
"url": download_url,
|
||||
"filename": download_url.split("/")[-1],
|
||||
"version": version,
|
||||
"length": size,
|
||||
"checksums": {"sha256": checksum_sha256},
|
||||
}
|
||||
)
|
||||
expected_module_metadata[origin_url].append(
|
||||
{
|
||||
"name": distribution,
|
||||
"version": version,
|
||||
"cpan_author": author,
|
||||
"author": author_fullname if author_fullname != "unknown" else author,
|
||||
"date": date,
|
||||
"release_name": release_name,
|
||||
}
|
||||
)
|
||||
|
||||
assert res.pages == 1
|
||||
assert res.origins == len(expected_origins)
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
|
||||
|
@ -30,3 +100,8 @@ def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
|
|||
for origin in scheduler_origins:
|
||||
assert origin.visit_type == "cpan"
|
||||
assert origin.url in expected_origins
|
||||
assert origin.extra_loader_arguments == {
|
||||
"api_base_url": "https://fastapi.metacpan.org/v1",
|
||||
"artifacts": expected_artifacts[origin.url],
|
||||
"module_metadata": expected_module_metadata[origin.url],
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue