Pub.dev lister for Dart and Flutter packages

Stateless lister for https://pub.dev based on http api to list package names
This commit is contained in:
Franck Bret 2022-08-23 10:41:23 +02:00
parent ce72969de5
commit 5410b6e3f3
8 changed files with 263 additions and 0 deletions

View file

@ -71,6 +71,7 @@ setup(
lister.opam=swh.lister.opam:register
lister.packagist=swh.lister.packagist:register
lister.phabricator=swh.lister.phabricator:register
lister.pubdev=swh.lister.pubdev:register
lister.pypi=swh.lister.pypi:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register

View file

@ -0,0 +1,71 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Pub.dev lister
==============
The Pubdev lister lists origins from `pub.dev`_, the `Dart`_ and `Flutter`_ package registry.
The registry provides an `http api`_ from which the lister retrieves package names.
As of August 2022 `pub.dev`_ lists 33535 package names.
Origins retrieving strategy
---------------------------
To get a list of all package names we call the `https://pub.dev/api/package-names` endpoint.
There is no other way for discovery (no archive index, no database dump, no dvcs repository).
Page listing
------------
There is only one page, which lists all package names; each origin url is built
from a package name, based on `https://pub.dev/api/packages/{pkgname}`.
The origin url corresponds to the http api endpoint that returns complete information
about the package versions (name, version, author, description, release date).
Origins from page
-----------------
The lister yields one origin url for each package name found in the page.
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/pubdev/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker-compose up -d
Then connect to the lister::
docker exec -it docker_swh-lister_1 bash
And run the lister (The output of this listing results in oneshot tasks in the scheduler)::
swh lister run -l pubdev
.. _pub.dev: https://pub.dev
.. _Dart: https://dart.dev
.. _Flutter: https://flutter.dev
.. _http api: https://pub.dev/help/api
"""
def register():
    """Return the registration data of the pub.dev lister.

    The result is a dict with the lister class under ``"lister"`` and, under
    ``"task_modules"``, a one-element list naming this package's ``tasks``
    module.
    """
    # Imported here rather than at module level, so that importing the package
    # alone does not import the lister module.
    from .lister import PubDevLister

    tasks_module = f"{__name__}.tasks"
    return {
        "lister": PubDevLister,
        "task_modules": [tasks_module],
    }

100
swh/lister/pubdev/lister.py Normal file
View file

@ -0,0 +1,100 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
# A page is a list of package names.
PubDevListerPage = List[str]
class PubDevLister(StatelessLister[PubDevListerPage]):
    """List pub.dev (Dart, Flutter) origins.

    Package names are fetched from the ``package-names`` endpoint of the
    pub.dev http api. Each package name becomes one origin whose url is the
    per-package api endpoint ``{base_url}packages/{pkgname}``.
    """

    LISTER_NAME = "pubdev"
    VISIT_TYPE = "pubdev"
    INSTANCE = "pubdev"

    BASE_URL = "https://pub.dev/api/"
    PACKAGE_NAMES_URL_PATTERN = "{base_url}package-names"
    PACKAGE_INFO_URL_PATTERN = "{base_url}packages/{pkgname}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister and the http session used for every request.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )
        # One shared session, so every request carries the same headers.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Issue a GET request on ``url`` and return the response.

        Any status code other than 200 is logged as a warning, then
        ``raise_for_status`` is called on the response. Retries are delegated
        to the ``throttling_retry`` decorator, which logs at WARNING level
        before each sleep.

        Args:
            url: the url to fetch.
            params: query parameters sent with the request.

        Returns:
            The http response.

        Raises:
            requests.HTTPError: when ``raise_for_status`` rejects the response.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[PubDevListerPage]:
        """Yield the pages of package names exposed by the registry.

        It uses the api provided by https://pub.dev/api/ to find Dart and
        Flutter package origins.

        The first request is a GET on "{base_url}package-names". Each response
        is a json object whose "packages" entry is a list of package names and
        whose "nextUrl" entry, when set, is the url of the following page.
        Pages are fetched until "nextUrl" is missing or empty, so a response
        with a null "nextUrl" results in exactly one page.
        """
        next_url: Optional[str] = self.PACKAGE_NAMES_URL_PATTERN.format(
            base_url=self.url
        )
        while next_url:
            payload = self.page_request(url=next_url, params={}).json()
            yield payload["packages"]
            # Follow the pagination cursor instead of dropping later pages.
            next_url = payload.get("nextUrl")

    def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin for each package name of ``page``.

        The origin url is "{base_url}packages/{pkgname}"; ``last_update`` is
        left unset.

        Args:
            page: list of package names, as yielded by ``get_pages``.
        """
        assert self.lister_obj.id is not None

        for pkgname in page:
            url = self.PACKAGE_INFO_URL_PATTERN.format(
                base_url=self.url, pkgname=pkgname
            )
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=url,
                last_update=None,
            )

View file

@ -0,0 +1,19 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.pubdev.lister import PubDevLister
@shared_task(name=__name__ + ".PubDevListerTask")
def list_pubdev(**lister_args):
    """Lister task for pub.dev (Dart, Flutter) registry.

    Builds a PubDevLister from the configuration file with ``lister_args``,
    runs it and returns the resulting stats as a dict.
    """
    lister = PubDevLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always returns the string "OK"."""
    return "OK"

View file

View file

@ -0,0 +1 @@
{"packages":["Autolinker","pdf"],"nextUrl":null}

View file

@ -0,0 +1,40 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pubdev.lister import PubDevLister
# Origins the lister is expected to record, one per package name.
# NOTE(review): presumably mirrors the package names of the test data file
# served through `requests_mock_datadir` -- confirm against the data directory.
expected_origins = [
    {
        "url": "https://pub.dev/api/packages/Autolinker",
    },
    {
        "url": "https://pub.dev/api/packages/pdf",
    },
]
def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister and check the stats and the origins it recorded."""
    lister = PubDevLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed) == len(expected_origins)

    # Compare as sets: the order in which origins come back is not checked.
    actual = {(origin.visit_type, origin.url) for origin in listed}
    wanted = {("pubdev", entry["url"]) for entry in expected_origins}
    assert actual == wanted

View file

@ -0,0 +1,31 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Send the ping task through celery and check that it answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping")
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"
def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Send the lister task with a mocked PubDevLister and check its result."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked PubDevLister: `from_configfile` hands back the mock
    # itself, so `run` can be configured and inspected on the same object.
    mocked_lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.pubdev.tasks.PubDevListerTask"
    )
    assert task_result

    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()