Bower: List origins from registry.bower.io

This commit is contained in:
Franck Bret 2022-08-29 15:55:00 +02:00
parent 5410b6e3f3
commit ceae8c42b5
8 changed files with 269 additions and 0 deletions

View file

@ -58,6 +58,7 @@ setup(
lister.arch=swh.lister.arch:register
lister.aur=swh.lister.aur:register
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register

View file

@ -0,0 +1,76 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Bower lister
============
The `Bower`_ lister lists origins from its package registry, `registry.bower.io`_.
Bower is a tool to manage Javascript packages.
The registry provides an `http api`_ from which the lister retrieves package names
and urls.
As of August 2022 `registry.bower.io`_ lists 71028 package names.
Note that even if the project is still maintained (security fixes, no new features), it has
been recommended since 2018 to not use it anymore and to prefer Yarn as a replacement.
Origins retrieving strategy
---------------------------
To get a list of all package names we call the `https://registry.bower.io/packages` endpoint.
There is no other way for discovery (no archive index, no database dump, no dvcs repository).
Page listing
------------
There is only one page, which lists all origin urls.
Origins from page
-----------------
The lister yields all origin urls from one page. It is a list of package names and urls.
Each origin url corresponds to a Git repository url.
Bower is supposed to support Svn repositories too, but out of about 71000 urls only 35
were found that may not be Git repositories.
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/bower/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker-compose up -d
Then connect to the lister::
docker exec -it docker_swh-lister_1 bash
And run the lister (The output of this listing results in oneshot tasks in the scheduler)::
swh lister run -l bower
.. _Bower: https://bower.io
.. _registry.bower.io: https://registry.bower.io
.. _http api: https://registry.bower.io/packages
"""
def register():
    """Describe the Bower lister plugin.

    This is the callable referenced by the ``lister.bower`` entry point. It
    returns a mapping with the lister class under ``"lister"`` and the list
    of task module names under ``"task_modules"``.
    """
    # Imported lazily so that loading the package does not pull in the lister
    # module until the plugin description is actually requested.
    from .lister import BowerLister

    tasks_module = "%s.tasks" % __name__
    return dict(lister=BowerLister, task_modules=[tasks_module])

View file

@ -0,0 +1,91 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
# A page is a list of entries; `get_origins_from_page` reads the "url" key of
# each entry.
BowerListerPage = List[Dict[str, str]]
class BowerLister(StatelessLister[BowerListerPage]):
    """List Bower (Javascript package manager) origins.

    The registry exposes every package through a single JSON document served
    by ``API_URL``, so a listing consists of exactly one page.
    """

    LISTER_NAME = "bower"
    VISIT_TYPE = "bower"
    INSTANCE = "bower"

    API_URL = "https://registry.bower.io/packages"

    # Timeout, in seconds, handed to ``requests``. It bounds the connection
    # attempt and each wait for data from the server (not the total download
    # time). ``requests`` applies no timeout by default, so without it a
    # stalled registry connection would block the lister indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister and its HTTP session.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.API_URL,
        )
        # One session is reused for all requests; it asks for JSON and
        # identifies the lister through the shared user agent.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Fetch ``url`` with the lister session and return the response.

        Args:
            url: address to query.
            params: query-string parameters sent along with the request.

        Returns:
            The HTTP response.

        Raises:
            requests.HTTPError: when the server answers with an error status.
            requests.Timeout: when the server does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(
            url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # No-op on success; raises on 4xx/5xx answers.
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[BowerListerPage]:
        """Yield an iterator which returns 'page'

        It uses the api endpoint provided by `https://registry.bower.io/packages`
        to get a list of package names with an origin url that corresponds to Git
        repository.

        There is only one page that lists all origin urls.
        """
        response = self.page_request(url=self.url, params={})
        yield response.json()

    def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all entries of a page and yield ListedOrigin instances.

        Only the ``"url"`` key of each entry is used; no last-update
        information is recorded for the origins.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=entry["url"],
                last_update=None,
            )

19
swh/lister/bower/tasks.py Normal file
View file

@ -0,0 +1,19 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.bower.lister import BowerLister
@shared_task(name=__name__ + ".BowerListerTask")
def list_bower(**lister_args):
    """Lister task for Bower (Javascript package manager) registry.

    Builds a ``BowerLister`` from the configuration file, forwarding
    ``lister_args`` to it, runs a full listing and returns the resulting
    statistics as a dictionary.
    """
    lister = BowerLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always answers with the string "OK"."""
    answer = "OK"
    return answer

View file

View file

@ -0,0 +1,14 @@
[
{
"name": "font-awesome",
"url": "https://github.com/FortAwesome/Font-Awesome.git"
},
{
"name": "redux",
"url": "https://github.com/reactjs/redux.git"
},
{
"name": "vue",
"url": "https://github.com/vuejs/vue.git"
}
]

View file

@ -0,0 +1,37 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.bower.lister import BowerLister
# Packages the lister is expected to report; only the "url" value of each
# entry is compared against the origins recorded in the scheduler.
# NOTE(review): presumably mirrors the JSON fixture served to the lister
# during the test -- keep both in sync.
expected_origins = [
    {"name": "font-awesome", "url": "https://github.com/FortAwesome/Font-Awesome.git"},
    {"name": "redux", "url": "https://github.com/reactjs/redux.git"},
    {"name": "vue", "url": "https://github.com/vuejs/vue.git"},
]
def test_bower_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister once and check the origins recorded in the scheduler.

    The run must report a single page holding three origins, and the
    (visit type, url) pairs stored in the scheduler must match
    ``expected_origins`` with the "bower" visit type.
    """
    lister = BowerLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 3

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed) == len(expected_origins)

    recorded = {(origin.visit_type, origin.url) for origin in listed}
    wanted = {("bower", package["url"]) for package in expected_origins}
    assert recorded == wanted

View file

@ -0,0 +1,31 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_bower_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Send the ping task through celery and expect the "OK" answer."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.ping")
    assert async_result

    # Block until the worker has processed the task.
    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"
def test_bower_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Check that the lister task builds, runs and reports a BowerLister.

    The lister class used by the task module is replaced with a mock, so no
    listing actually happens; the task must return the mocked statistics as
    a dictionary and call ``from_configfile`` and ``run`` once each, without
    arguments.
    """
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked BowerLister
    mocked_lister = mocker.patch("swh.lister.bower.tasks.BowerLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.bower.tasks.BowerListerTask"
    )
    assert async_result
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()