parent
dabb1a2ae5
commit
52ccf49e11
8 changed files with 227 additions and 0 deletions
1
setup.py
1
setup.py
|
@ -80,6 +80,7 @@ setup(
|
|||
lister.pubdev=swh.lister.pubdev:register
|
||||
lister.puppet=swh.lister.puppet:register
|
||||
lister.pypi=swh.lister.pypi:register
|
||||
lister.rubygems=swh.lister.rubygems:register
|
||||
lister.sourceforge=swh.lister.sourceforge:register
|
||||
lister.tuleap=swh.lister.tuleap:register
|
||||
lister.maven=swh.lister.maven:register
|
||||
|
|
66
swh/lister/rubygems/__init__.py
Normal file
66
swh/lister/rubygems/__init__.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
"""
|
||||
RubyGems lister
|
||||
===============
|
||||
|
||||
The RubyGems lister lists origins from `RubyGems.org`_, the Ruby community’s gem hosting service.
|
||||
|
||||
As of September 2022, `RubyGems.org`_ lists 173384 package names.
|
||||
|
||||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
To get a list of all package names we call an `http endpoint`_ which returns a list of gems
|
||||
as text.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
||||
Each page returns an origin url based on the following pattern::
|
||||
|
||||
https://rubygems.org/gems/{pkgname}
|
||||
|
||||
Origins from page
|
||||
-----------------
|
||||
|
||||
The lister yields one origin url per page.
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
||||
Activate the virtualenv and run from within swh-lister directory::
|
||||
|
||||
pytest -s -vv --log-cli-level=DEBUG swh/lister/rubygems/tests
|
||||
|
||||
Testing with Docker
|
||||
-------------------
|
||||
|
||||
Change directory to swh/docker then launch the docker environment::
|
||||
|
||||
docker compose up -d
|
||||
|
||||
Then schedule a RubyGems listing task::
|
||||
|
||||
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-rubygems
|
||||
|
||||
You can follow lister execution by displaying logs of swh-lister service::
|
||||
|
||||
docker compose logs -f swh-lister
|
||||
|
||||
.. _RubyGems.org: https://rubygems.org/
|
||||
.. _http endpoint: https://rubygems.org/versions
|
||||
"""
|
||||
|
||||
|
||||
def register():
    """Return the registration mapping for the RubyGems lister.

    The mapping holds the lister class under ``"lister"`` and the dotted
    path of this package's ``tasks`` module under ``"task_modules"``.
    """
    # Imported here rather than at module level, so that importing the
    # package alone does not import the lister module.
    from .lister import RubyGemsLister

    task_modules = ["%s.tasks" % __name__]
    registration = {
        "lister": RubyGemsLister,
        "task_modules": task_modules,
    }
    return registration
|
75
swh/lister/rubygems/lister.py
Normal file
75
swh/lister/rubygems/lister.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
from typing import Iterator, List, Optional, Text
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)

# Type alias for a single "page" produced by the lister's `get_pages` method.
RubyGemsListerPage = Text
|
||||
|
||||
|
||||
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
    """Lister for RubyGems.org, the Ruby community’s gem hosting service.

    The index located at ``INDEX_URL`` is fetched with a single request.
    Every distinct package name found in it becomes one page (an origin url),
    and every page yields exactly one origin.
    """

    LISTER_NAME = "rubygems"
    VISIT_TYPE = "rubygems"
    INSTANCE = "rubygems"

    # Text index from which the package names are read.
    INDEX_URL = "https://rubygems.org/versions"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Initialize the lister with the fixed RubyGems instance and index url.

        Args:
            scheduler: scheduler interface, forwarded to the base lister
            credentials: optional credentials, forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.INDEX_URL,
        )

    def get_pages(self) -> Iterator[RubyGemsListerPage]:
        """Yield one 'page' per distinct package name found in the index.

        It uses the index file located at `https://rubygems.org/versions`
        to get a list of package names. Each page is an origin url based on
        the following pattern::

            https://rubygems.org/gems/{pkgname}

        Pages are yielded in the order package names first appear in the index.
        """
        response = self.http_request(url=self.url)
        data = response.content.decode()

        # Skip the first 3 lines (file headers + first package named '-').
        # On the remaining lines the package name is the text before the
        # first space.
        candidates = (line.split(" ", 1)[0] for line in data.splitlines()[3:])

        # filter(None, ...) drops the empty names produced by blank lines, which
        # would otherwise become the bogus origin "https://rubygems.org/gems/".
        # dict.fromkeys() removes duplicates while keeping first-seen order, so
        # the listing order is the same from one run to the next.
        package_names: List[str] = list(dict.fromkeys(filter(None, candidates)))

        for pkgname in package_names:
            yield f"https://rubygems.org/gems/{pkgname}"

    def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
        """Yield the single ListedOrigin built from ``page``.

        Args:
            page: an origin url, as yielded by :meth:`get_pages`
        """
        assert self.lister_obj.id is not None

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=page,
            last_update=None,
        )
|
19
swh/lister/rubygems/tasks.py
Normal file
19
swh/lister/rubygems/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from swh.lister.rubygems.lister import RubyGemsLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".RubyGemsListerTask")
def list_rubygems(**lister_args):
    """Lister task for RubyGems.

    Instantiates the lister through ``RubyGemsLister.from_configfile`` with
    ``lister_args`` forwarded unchanged, runs it, and returns the ``dict()``
    form of the run result.
    """
    lister = RubyGemsLister.from_configfile(**lister_args)
    run_result = lister.run()
    return run_result.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Ping task: takes no argument and always returns the string "OK"."""
    return "OK"
|
0
swh/lister/rubygems/tests/__init__.py
Normal file
0
swh/lister/rubygems/tests/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
created_at: 2022-09-01T00:00:05Z
|
||||
---
|
||||
- 1 05d0116933ba44b0b5d0ee19bfd35ccc
|
||||
mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
|
||||
mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
|
||||
mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
|
27
swh/lister/rubygems/tests/test_lister.py
Normal file
27
swh/lister/rubygems/tests/test_lister.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
from swh.lister.rubygems.lister import RubyGemsLister
|
||||
|
||||
# Origin urls the lister test expects to find in the scheduler after a run.
expected_origins = [
    "https://rubygems.org/gems/mercurial-ruby",
    "https://rubygems.org/gems/mercurial-wrapper",
    "https://rubygems.org/gems/mercurius",
]
|
||||
|
||||
|
||||
def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister and check the origins it recorded in the scheduler."""
    rubygems_lister = RubyGemsLister(scheduler=swh_scheduler)
    stats = rubygems_lister.run()

    # Three pages are expected, each contributing exactly one origin.
    assert stats.pages == 3
    assert stats.origins == 1 + 1 + 1

    listed = swh_scheduler.get_listed_origins(rubygems_lister.lister_obj.id).results

    assert len(listed) == len(expected_origins)

    for listed_origin in listed:
        assert listed_origin.visit_type == "rubygems"
        assert listed_origin.url in expected_origins
|
33
swh/lister/rubygems/tests/test_tasks.py
Normal file
33
swh/lister/rubygems/tests/test_tasks.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_rubygems_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task completes successfully and its result is "OK"."""
    ping_result = swh_scheduler_celery_app.send_task("swh.lister.rubygems.tasks.ping")
    assert ping_result

    # Block until the task has completed before inspecting its outcome.
    ping_result.wait()
    assert ping_result.successful()
    assert ping_result.result == "OK"
|
||||
|
||||
|
||||
def test_rubygems_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task returns the ``dict()`` form of the stats of the lister run."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked RubyGemsLister: `from_configfile` hands back the mock
    # itself, so that `run` can be configured and inspected on the same object
    mocked_lister = mocker.patch("swh.lister.rubygems.tasks.RubyGemsLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.rubygems.tasks.RubyGemsListerTask"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    # The task was sent without arguments, so none must reach the lister.
    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue