swh.lister.cran
Add a lister to list all the CRAN packages . It uses the build-in API in R language to list the packages and get their metadata. Closes T1709
This commit is contained in:
parent
7c6245e663
commit
a9a37a85bf
12 changed files with 210 additions and 1 deletions
0
swh/lister/cran/__init__.py
Normal file
0
swh/lister/cran/__init__.py
Normal file
9
swh/lister/cran/list_all_packages.R
Executable file
9
swh/lister/cran/list_all_packages.R
Executable file
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/Rscript
|
||||
|
||||
# This R script calls the buildin API to get list of
|
||||
# all the packages of R and their description, then convert the API
|
||||
# response to JSON string and print it
|
||||
|
||||
db <- tools::CRAN_package_db()[, c("Package", "Version", "Title", "Description")]
|
||||
dbjson <- jsonlite::toJSON(db)
|
||||
print(dbjson)
|
119
swh/lister/cran/lister.py
Normal file
119
swh/lister/cran/lister.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
import subprocess
|
||||
import json
|
||||
import logging
|
||||
import pkg_resources
|
||||
|
||||
from swh.lister.cran.models import CRANModel
|
||||
|
||||
from swh.scheduler.utils import create_task_dict
|
||||
from swh.core import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
|
||||
|
||||
class CRANLister(SimpleLister):
|
||||
MODEL = CRANModel
|
||||
LISTER_NAME = 'cran'
|
||||
instance = 'cran'
|
||||
|
||||
def task_dict(self, origin_type, origin_url, **kwargs):
|
||||
"""Return task format dict
|
||||
|
||||
This is overridden from the lister_base as more information is
|
||||
needed for the ingestion task creation.
|
||||
"""
|
||||
return create_task_dict(
|
||||
'load-%s' % origin_type, 'recurring',
|
||||
kwargs.get('name'), origin_url, kwargs.get('version'),
|
||||
project_metadata=kwargs.get('description'))
|
||||
|
||||
def r_script_request(self):
|
||||
"""Runs R script which uses inbuilt API to return a json
|
||||
response containing data about all the R packages
|
||||
|
||||
Returns:
|
||||
List of dictionaries
|
||||
example
|
||||
[
|
||||
{'Package': 'A3',
|
||||
'Version': '1.0.0',
|
||||
'Title':
|
||||
'Accurate, Adaptable, and Accessible Error Metrics for
|
||||
Predictive\nModels',
|
||||
'Description':
|
||||
'Supplies tools for tabulating and analyzing the results
|
||||
of predictive models. The methods employed are ... '
|
||||
}
|
||||
{'Package': 'abbyyR',
|
||||
'Version': '0.5.4',
|
||||
'Title':
|
||||
'Access to Abbyy Optical Character Recognition (OCR) API',
|
||||
'Description': 'Get text from images of text using Abbyy
|
||||
Cloud Optical Character\n ...'
|
||||
}
|
||||
...
|
||||
]
|
||||
"""
|
||||
file_path = pkg_resources.resource_filename('swh.lister.cran',
|
||||
'list_all_packages.R')
|
||||
response = subprocess.run(file_path, stdout=subprocess.PIPE,
|
||||
shell=False)
|
||||
return json.loads(response.stdout)
|
||||
|
||||
def get_model_from_repo(self, repo):
|
||||
"""Transform from repository representation to model
|
||||
|
||||
"""
|
||||
project_url = 'https://cran.r-project.org/src/contrib' \
|
||||
'/%(Package)s_%(Version)s.tar.gz' % repo
|
||||
return {
|
||||
'uid': repo["Package"],
|
||||
'name': repo["Package"],
|
||||
'full_name': repo["Title"],
|
||||
'version': repo["Version"],
|
||||
'html_url': project_url,
|
||||
'origin_url': project_url,
|
||||
'origin_type': 'cran',
|
||||
'description': repo["Description"]
|
||||
}
|
||||
|
||||
def transport_response_simplified(self, response):
|
||||
"""Transform response to list for model manipulation
|
||||
|
||||
"""
|
||||
return [self.get_model_from_repo(repo) for repo in response]
|
||||
|
||||
def ingest_data(self, identifier, checks=False):
|
||||
"""Rework the base ingest_data.
|
||||
Request server endpoint which gives all in one go.
|
||||
|
||||
Simplify and filter response list of repositories. Inject
|
||||
repo information into local db. Queue loader tasks for
|
||||
linked repositories.
|
||||
|
||||
Args:
|
||||
identifier: Resource identifier (unused)
|
||||
checks (bool): Additional checks required (unused)
|
||||
|
||||
"""
|
||||
response = self.r_script_request()
|
||||
if not response:
|
||||
return response, []
|
||||
models_list = self.transport_response_simplified(response)
|
||||
models_list = self.filter_before_inject(models_list)
|
||||
all_injected = []
|
||||
for models in utils.grouper(models_list, n=10000):
|
||||
models = list(models)
|
||||
logging.debug('models: %s' % len(models))
|
||||
# inject into local db
|
||||
injected = self.inject_repo_data_into_db(models)
|
||||
# queue workers
|
||||
self.create_missing_origins_and_tasks(models, injected)
|
||||
all_injected.append(injected)
|
||||
# flush
|
||||
self.db_session.commit()
|
||||
self.db_session = self.mk_session()
|
||||
|
||||
return response, all_injected
|
17
swh/lister/cran/models.py
Normal file
17
swh/lister/cran/models.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String
|
||||
|
||||
from swh.lister.core.models import ModelBase
|
||||
|
||||
|
||||
class CRANModel(ModelBase):
|
||||
"""a CRAN repository representation
|
||||
|
||||
"""
|
||||
__tablename__ = 'cran_repo'
|
||||
|
||||
uid = Column(String, primary_key=True)
|
||||
version = Column(String)
|
17
swh/lister/cran/tasks.py
Normal file
17
swh/lister/cran/tasks.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.scheduler.celery_backend.config import app
|
||||
|
||||
from swh.lister.cran.lister import CRANLister
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.CRANListerTask')
|
||||
def cran_lister(**lister_args):
|
||||
CRANLister(**lister_args).run()
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.ping')
|
||||
def ping():
|
||||
return 'OK'
|
0
swh/lister/cran/tests/__init__.py
Normal file
0
swh/lister/cran/tests/__init__.py
Normal file
1
swh/lister/cran/tests/conftest.py
Normal file
1
swh/lister/cran/tests/conftest.py
Normal file
|
@ -0,0 +1 @@
|
|||
from swh.lister.core.tests.conftest import * # noqa
|
27
swh/lister/cran/tests/test_tasks.py
Normal file
27
swh/lister/cran/tests/test_tasks.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
from unittest.mock import patch
|
||||
|
||||
|
||||
def test_ping(swh_app, celery_session_worker):
|
||||
res = swh_app.send_task(
|
||||
'swh.lister.cran.tasks.ping')
|
||||
assert res
|
||||
res.wait()
|
||||
assert res.successful()
|
||||
assert res.result == 'OK'
|
||||
|
||||
|
||||
@patch('swh.lister.cran.tasks.CRANLister')
|
||||
def test_lister(lister, swh_app, celery_session_worker):
|
||||
# setup the mocked CRANLister
|
||||
lister.return_value = lister
|
||||
lister.run.return_value = None
|
||||
|
||||
res = swh_app.send_task(
|
||||
'swh.lister.cran.tasks.CRANListerTask')
|
||||
assert res
|
||||
res.wait()
|
||||
assert res.successful()
|
||||
|
||||
lister.assert_called_once_with()
|
||||
lister.db_last_index.assert_not_called()
|
||||
lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue