Cpan: List Perl module origins from cpan.org
Related T2833
This commit is contained in:
parent
6696a8424a
commit
a4aec3894e
10 changed files with 364 additions and 0 deletions
1
setup.py
1
setup.py
|
@ -61,6 +61,7 @@ setup(
|
|||
lister.bower=swh.lister.bower:register
|
||||
lister.cgit=swh.lister.cgit:register
|
||||
lister.conda=swh.lister.conda:register
|
||||
lister.cpan=swh.lister.cpan:register
|
||||
lister.cran=swh.lister.cran:register
|
||||
lister.crates=swh.lister.crates:register
|
||||
lister.debian=swh.lister.debian:register
|
||||
|
|
73
swh/lister/cpan/__init__.py
Normal file
73
swh/lister/cpan/__init__.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
"""
|
||||
Cpan lister
|
||||
=============
|
||||
|
||||
The Cpan lister lists origins from `cpan.org`_, the Comprehensive Perl Archive
|
||||
Network. It provides search features via `metacpan.org`_.
|
||||
|
||||
As of September 2022, `cpan.org`_ lists 43675 package names.
|
||||
|
||||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
To get the list of all package names, we first call an `http api endpoint`_ that
|
||||
retrieves results and a ``_scroll_id`` that will be used to scroll pages through
|
||||
the `search`_ endpoint.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
||||
Each page is a list of ``results``, which are the raw data from the API response.
|
||||
|
||||
Origins from page
|
||||
-----------------
|
||||
|
||||
The origin URL is the HTML page corresponding to a package name on `metacpan.org`_, following
|
||||
this pattern::
|
||||
|
||||
"https://metacpan.org/dist/{pkgname}"
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
||||
Activate the virtualenv and run from within swh-lister directory::
|
||||
|
||||
pytest -s -vv --log-cli-level=DEBUG swh/lister/cpan/tests
|
||||
|
||||
Testing with Docker
|
||||
-------------------
|
||||
|
||||
Change directory to swh/docker then launch the docker environment::
|
||||
|
||||
docker compose up -d
|
||||
|
||||
Then schedule a Cpan listing task::
|
||||
|
||||
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-cpan
|
||||
|
||||
You can follow lister execution by displaying logs of swh-lister service::
|
||||
|
||||
docker compose logs -f swh-lister
|
||||
|
||||
.. _cpan.org: https://cpan.org/
|
||||
.. _metacpan.org: https://metacpan.org/
|
||||
.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
|
||||
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
|
||||
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def register():
    """Describe the Cpan lister plugin.

    Returns:
        A dict with two keys: ``"lister"``, the ``CpanLister`` class, and
        ``"task_modules"``, a one-element list holding the dotted path of
        this package's ``tasks`` module.
    """
    # Imported inside the function so that importing this package does not
    # import the lister module as a side effect.
    from .lister import CpanLister

    tasks_module = "%s.tasks" % __name__
    return {"lister": CpanLister, "task_modules": [tasks_module]}
|
91
swh/lister/cpan/lister.py
Normal file
91
swh/lister/cpan/lister.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Alias for the type of the pages yielded by the lister's `get_pages` method.
|
||||
CpanListerPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class CpanLister(StatelessLister[CpanListerPage]):
    """Lister for CPAN, the Comprehensive Perl Archive Network.

    Distribution names are fetched from the MetaCPAN API rooted at
    ``BASE_URL`` and turned into ``https://metacpan.org/dist/{pkgname}``
    origins.
    """

    LISTER_NAME = "cpan"
    VISIT_TYPE = "cpan"
    INSTANCE = "cpan"

    BASE_URL = "https://fastapi.metacpan.org/v1/"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister with the fixed Cpan instance name and API url.

        Args:
            scheduler: scheduler interface forwarded to the parent lister.
            credentials: optional credentials forwarded to the parent lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )

    def get_pages(self) -> Iterator[CpanListerPage]:
        """Yield pages of raw search hits from the API.

        A first request to ``distribution/_search`` returns the first page
        together with a ``_scroll_id``; the following pages are fetched from
        ``_search/scroll`` with that id until a page comes back empty.

        The final empty page is yielded as well, so consumers always see one
        trailing page without entries.
        """
        endpoint = f"{self.BASE_URL}distribution/_search"
        scrollendpoint = f"{self.BASE_URL}_search/scroll"
        size: int = 1000

        res = self.http_request(
            endpoint,
            params={
                "fields": ["name"],
                "size": size,
                "scroll": "1m",
            },
        )
        # Decode the response body once and reuse it for both lookups.
        body = res.json()
        data = body["hits"]["hits"]
        yield data

        _scroll_id = body["_scroll_id"]

        while data:
            scroll_res = self.http_request(
                scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id}
            )
            body = scroll_res.json()
            data = body["hits"]["hits"]
            # Each scroll response carries the id to use for the next request.
            _scroll_id = body["_scroll_id"]
            yield data

    def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
        """Yield a ``ListedOrigin`` for each named entry of ``page``.

        Entries without a ``fields.name`` value, or whose name is an empty
        list, are skipped.

        Args:
            page: list of raw hits as yielded by ``get_pages``.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            # Skip the entry if 'fields' or 'name' keys are missing
            if "fields" not in entry or "name" not in entry["fields"]:
                continue

            pkgname = entry["fields"]["name"]
            # The API returns the name either as a plain string or wrapped in
            # a one-element list.
            # TODO: Check why sometimes its a one value list
            if not isinstance(pkgname, str):
                if not pkgname:
                    # Nothing usable to build an origin url from; skipping is
                    # better than aborting the whole listing on an IndexError.
                    logger.debug("Skipping Cpan entry without a name: %r", entry)
                    continue
                pkgname = pkgname[0]

            url = f"https://metacpan.org/dist/{pkgname}"

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=url,
                last_update=None,
            )
|
19
swh/lister/cpan/tasks.py
Normal file
19
swh/lister/cpan/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from swh.lister.cpan.lister import CpanLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".CpanListerTask")
def list_cpan(**lister_args):
    """Run a Cpan listing and return its statistics as a dict.

    Args:
        **lister_args: keyword arguments forwarded to
            ``CpanLister.from_configfile``.
    """
    lister = CpanLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Liveness-check task: always answer with the string "OK"."""
    return "OK"
|
0
swh/lister/cpan/tests/__init__.py
Normal file
0
swh/lister/cpan/tests/__init__.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
{
|
||||
"_shards" : {
|
||||
"successful" : 3,
|
||||
"total" : 3,
|
||||
"failed" : 0
|
||||
},
|
||||
"timed_out" : false,
|
||||
"hits" : {
|
||||
"max_score" : 1.0,
|
||||
"hits" : [
|
||||
{
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"EventSource-Server"
|
||||
]
|
||||
},
|
||||
"_id" : "EventSource-Server",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_score" : 1.0
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "Interchange6",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"Interchange6"
|
||||
]
|
||||
},
|
||||
"_type" : "distribution"
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "Internals-CountObjects",
|
||||
"fields" : {
|
||||
"name" : [
|
||||
"Internals-CountObjects"
|
||||
]
|
||||
},
|
||||
"_type" : "distribution"
|
||||
}
|
||||
],
|
||||
"total" : 43675
|
||||
},
|
||||
"took" : 72,
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"terminated_early" : true
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"took" : 1,
|
||||
"hits" : {
|
||||
"hits" : [],
|
||||
"total" : 43675,
|
||||
"max_score" : 1.0
|
||||
},
|
||||
"terminated_early" : true,
|
||||
"timed_out" : false,
|
||||
"_shards" : {
|
||||
"failed" : 0,
|
||||
"total" : 3,
|
||||
"successful" : 3
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"_shards" : {
|
||||
"successful" : 3,
|
||||
"failed" : 0,
|
||||
"total" : 3
|
||||
},
|
||||
"_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
|
||||
"took" : 61,
|
||||
"hits" : {
|
||||
"max_score" : 1.0,
|
||||
"hits" : [
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "openerserver_perl-master",
|
||||
"fields" : {
|
||||
"name" : "openerserver_perl-master"
|
||||
},
|
||||
"_type" : "distribution"
|
||||
},
|
||||
{
|
||||
"_score" : 1.0,
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : "Getopt_Auto"
|
||||
},
|
||||
"_id" : "Getopt_Auto",
|
||||
"_index" : "cpan_v1_01"
|
||||
},
|
||||
{
|
||||
"_id" : "App-Booklist",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_type" : "distribution",
|
||||
"fields" : {
|
||||
"name" : "App-Booklist"
|
||||
},
|
||||
"_score" : 1.0
|
||||
},
|
||||
{
|
||||
"fields" : {
|
||||
"name" : "EuclideanRhythm"
|
||||
},
|
||||
"_type" : "distribution",
|
||||
"_index" : "cpan_v1_01",
|
||||
"_id" : "EuclideanRhythm",
|
||||
"_score" : 1.0
|
||||
}
|
||||
],
|
||||
"total" : 43675
|
||||
},
|
||||
"timed_out" : false
|
||||
}
|
31
swh/lister/cpan/tests/test_lister.py
Normal file
31
swh/lister/cpan/tests/test_lister.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
from swh.lister.cpan.lister import CpanLister
|
||||
|
||||
expected_origins = [
|
||||
"https://metacpan.org/dist/App-Booklist",
|
||||
"https://metacpan.org/dist/EuclideanRhythm",
|
||||
"https://metacpan.org/dist/EventSource-Server",
|
||||
"https://metacpan.org/dist/Getopt_Auto",
|
||||
"https://metacpan.org/dist/Interchange6",
|
||||
"https://metacpan.org/dist/Internals-CountObjects",
|
||||
"https://metacpan.org/dist/openerserver_perl-master",
|
||||
]
|
||||
|
||||
|
||||
def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
    """Run the lister against the mocked API and check the recorded origins."""
    cpan_lister = CpanLister(scheduler=swh_scheduler)
    stats = cpan_lister.run()

    # Two pages with entries (4 and 3 hits) followed by the final empty page.
    assert stats.pages == 3
    assert stats.origins == 4 + 3 + 0

    listed = swh_scheduler.get_listed_origins(cpan_lister.lister_obj.id).results
    assert len(listed) == len(expected_origins)

    for listed_origin in listed:
        assert listed_origin.visit_type == "cpan"
        assert listed_origin.url in expected_origins
|
31
swh/lister/cpan/tests/test_tasks.py
Normal file
31
swh/lister/cpan/tests/test_tasks.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_cpan_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task must complete successfully and answer "OK"."""
    task_name = "swh.lister.cpan.tasks.ping"
    async_result = swh_scheduler_celery_app.send_task(task_name)
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"
|
||||
|
||||
|
||||
def test_cpan_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task must build the lister, run it and return its stats."""
    # Replace CpanLister in the tasks module by a mock that returns itself
    # from `from_configfile` and canned stats from `run`.
    expected_stats = ListerStats(pages=42, origins=42)
    mocked_lister = mocker.patch("swh.lister.cpan.tasks.CpanLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.cpan.tasks.CpanListerTask"
    )
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue