cran.lister: Fix cran lister and add proper integration test

The integration test checks the tasks that the cran lister writes to the scheduler.

Related d30d574dbe
Related 5ea9d5ed39

Related T2032
This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-10-10 20:52:41 +02:00
parent ef2c1847e4
commit 8d50e0d941
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
6 changed files with 205 additions and 67 deletions

View file

@ -6,4 +6,4 @@ include requirements-test.txt
include version.txt
include swh/lister/cran/list_all_packages.R
recursive-include swh/lister/*/tests/ *.json *.html *.txt *.* *
recursive-include swh/lister/cgit/tests/data/ *.* *
recursive-include swh/lister/*/tests/data/ *.* *

View file

@ -1,4 +1,5 @@
# Copyright (C) 2018 the Software Heritage developers
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

View file

@ -1,5 +1,11 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.tests.conftest import * # noqa
import logging
import pytest
from sqlalchemy import create_engine
@ -8,6 +14,9 @@ from swh.lister import get_lister, SUPPORTED_LISTERS
from swh.lister.core.models import initialize
logger = logging.getLogger(__name__)
@pytest.fixture
def swh_listers(request, postgresql_proc, postgresql, swh_scheduler):
db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format(

View file

@ -1,4 +1,5 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,8 +8,7 @@ import logging
import pkg_resources
import subprocess
from collections import defaultdict
from typing import List, Dict
from typing import List, Mapping
from swh.lister.cran.models import CRANModel
@ -19,77 +19,114 @@ from swh.scheduler.utils import create_task_dict
logger = logging.getLogger(__name__)
def read_cran_data() -> List[Mapping[str, str]]:
    """Execute the packaged R script and parse the CRAN listing it prints.

    The script ``list_all_packages.R`` shipped within ``swh.lister.cran`` is
    run as a subprocess (without a shell); its standard output is decoded as
    utf-8 and parsed as json.

    Returns:
        The parsed json document. The annotation declares a list of
        string-to-string mappings (one per package).

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status
        json.JSONDecodeError: if the script output is not valid json

    """
    filepath = pkg_resources.resource_filename('swh.lister.cran',
                                               'list_all_packages.R')
    logger.debug('script list-all-packages.R path: %s', filepath)
    # check=True: report the script's own failure instead of a misleading
    # json decoding error on its empty or truncated output.
    response = subprocess.run(
        filepath, stdout=subprocess.PIPE, shell=False, encoding='utf-8',
        check=True)
    return json.loads(response.stdout)
def compute_package_url(repo: Mapping[str, str]) -> str:
    """Build the url of a package's source tarball on cran.r-project.org.

    Args:
        repo: mapping holding at least the 'Package' and 'Version' keys

    Returns:
        The url, shaped as ``<base>/<Package>_<Version>.tar.gz``

    Raises:
        KeyError: if 'Package' or 'Version' is missing from repo

    """
    package = repo['Package']
    version = repo['Version']
    base_url = 'https://cran.r-project.org/src/contrib'
    return '%s/%s_%s.tar.gz' % (base_url, package, version)
class CRANLister(SimpleLister):
MODEL = CRANModel
LISTER_NAME = 'cran'
instance = 'cran'
descriptions = defaultdict(dict)
def task_dict(self, origin_type, origin_url, **kwargs):
"""Return task format dict
"""Return task format dict. This creates tasks with args and kwargs
set, for example::
args: ['package', 'https://cran.r-project.org/...', 'version']
kwargs: {}
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
"""
policy = kwargs.get('policy', 'oneshot')
package = kwargs.get('name')
version = kwargs.get('version')
return create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'recurring'),
kwargs.get('name'), origin_url, kwargs.get('version'),
project_metadata=self.descriptions[kwargs.get('name')])
policy, package, origin_url, version,
retries_left=3,
)
def safely_issue_request(self, identifier: str) -> List[Dict]:
"""Runs R script which uses inbuilt API to return a json response
containing data about all the R packages.
def safely_issue_request(self, identifier):
"""Bypass the implementation. It's now the `list_packages` which
returns data.
Returns:
List of Dict about r packages.
Sample:
[
{
'Package': 'A3',
'Version': '1.0.0',
'Title':
'Accurate, Adaptable, and Accessible Error Metrics for
Predictive\nModels',
'Description':
'Supplies tools for tabulating and analyzing the results
of predictive models. The methods employed are ... '
},
{
'Package': 'abbyyR',
'Version': '0.5.4',
'Title':
'Access to Abbyy Optical Character Recognition (OCR) API',
'Description': 'Get text from images of text using Abbyy
Cloud Optical Character\n ...'
},
...
]
As an implementation detail, we cannot change simply the base
SimpleLister yet as other implementation still uses it. This shall be
part of another refactoring pass.
"""
filepath = pkg_resources.resource_filename('swh.lister.cran',
'list_all_packages.R')
logger.debug('script list-all-packages.R path: %s', filepath)
response = subprocess.run(
filepath, stdout=subprocess.PIPE, shell=False)
data = json.loads(response.stdout)
logger.debug('r-script-request: %s', data)
return data
return None
def get_model_from_repo(self, repo):
def list_packages(self, *args) -> List[Mapping[str, str]]:
    """List the R packages by delegating to `read_cran_data`.

    Positional arguments are accepted but not used.

    Returns:
        Whatever `read_cran_data` returns: a list of dicts describing
        the R packages. For example:

        .. code-block:: python

            [
                {
                    'Package': 'A3',
                    'Version': '1.0.0',
                    'Title': 'Accurate, Adaptable, and Accessible ...',
                    'Description': 'Supplies tools for tabulating ...',
                },
                {
                    'Package': 'abbyyR',
                    'Version': '0.5.4',
                    'Title': 'Access to Abbyy OCR (OCR) API',
                    'Description': 'Get text from images of text ...',
                },
                ...
            ]

    """
    packages = read_cran_data()
    return packages
def get_model_from_repo(
self, repo: Mapping[str, str]) -> Mapping[str, str]:
"""Transform from repository representation to model
"""
self.descriptions[repo["Package"]] = repo['Description']
project_url = 'https://cran.r-project.org/src/contrib' \
'/%(Package)s_%(Version)s.tar.gz' % repo
logger.debug('repo: %s', repo)
project_url = compute_package_url(repo)
package = repo['Package']
return {
'uid': repo["Package"],
'name': repo["Package"],
'full_name': repo["Title"],
'version': repo["Version"],
'uid': package,
'name': package,
'full_name': repo['Title'],
'version': repo['Version'],
'html_url': project_url,
'origin_url': project_url,
'origin_type': 'cran',
'origin_type': 'tar',
}

View file

@ -0,0 +1,39 @@
[
{
"Package": "SeleMix",
"Version": "1.0.1",
"Title": "Selective Editing via Mixture Models",
"Description": "Detection of outliers and influential errors using a latent variable model. "
},
{
"Package": "plink",
"Version": "1.5-1",
"Title": "IRT Separate Calibration Linking Methods",
"Description": "Item response theory based methods are used to compute\n linking constants and conduct chain linking of unidimensional\n or multidimensional tests for multiple groups under a common\n item design. The unidimensional methods include the Mean/Mean,\n Mean/Sigma, Haebara, and Stocking-Lord methods for dichotomous\n (1PL, 2PL and 3PL) and/or polytomous (graded response, partial\n credit/generalized partial credit, nominal, and multiple-choice\n model) items. The multidimensional methods include the least\n squares method and extensions of the Haebara and Stocking-Lord\n method using single or multiple dilation parameters for\n multidimensional extensions of all the unidimensional\n dichotomous and polytomous item response models. The package\n also includes functions for importing item and/or ability\n parameters from common IRT software, conducting IRT true score\n and observed score equating, and plotting item response\n curves/surfaces, vector plots, information plots, and comparison \n plots for examining parameter drift."
},
{
"Package": "justifier",
"Version": "0.1.0",
"Title": "Human and Machine-Readable Justifications and Justified\nDecisions Based on 'YAML'",
"Description": "Leverages the 'yum' package to\n implement a 'YAML' ('YAML Ain't Markup Language', a human\n friendly standard for data serialization; see <https:yaml.org>)\n standard for documenting justifications, such as for decisions\n taken during the planning, execution and analysis of a study\n or during the development of a behavior change intervention\n as illustrated by Marques & Peters (2019)\n <doi:10.17605/osf.io/ndxha>. These justifications are both\n human- and machine-readable, facilitating efficient extraction\n and organisation."
},
{
"Package": "Records",
"Version": "1.0",
"Title": "Record Values and Record Times",
"Description": "Functions for generating k-record values and k-record\n times"
},
{
"Package": "scRNAtools",
"Version": "1.0",
"Title": "Single Cell RNA Sequencing Data Analysis Tools",
"Description": "We integrated the common analysis methods utilized in single cell RNA sequencing data, which included cluster method, principal components analysis (PCA), the filter of differentially expressed genes, pathway enrichment analysis and correlated analysis methods."
},
{
"Package": "Deriv",
"Version": "3.9.0",
"Title": "Symbolic Differentiation",
"Description": "R-based solution for symbolic differentiation. It admits\n user-defined function as well as function substitution\n in arguments of functions to be differentiated. Some symbolic\n simplification is part of the work."
}
]

View file

@ -1,13 +1,65 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import pytest
from os import path
from unittest.mock import patch
from swh.lister.cran.lister import CRANLister
from swh.lister.cran.lister import compute_package_url
def test_task_dict():
lister = CRANLister()
lister.descriptions['test_pack'] = 'Test Description'
with patch('swh.lister.cran.lister.create_task_dict') as mock_create_tasks:
lister.task_dict(origin_type='cran', origin_url='https://abc',
name='test_pack')
mock_create_tasks.assert_called_once_with(
'load-cran', 'recurring', 'test_pack', 'https://abc', None,
project_metadata='Test Description')
def test_cran_compute_package_url():
    """The computed url ends with <package>_<version>.tar.gz."""
    package, version = 'something', '0.0.1'
    expected_url = 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % (
        package, version)

    actual_url = compute_package_url({'Package': package, 'Version': version})

    assert actual_url == expected_url
def test_cran_compute_package_url_failure():
    """A repo dict lacking 'Package' or 'Version' raises KeyError."""
    incomplete_repos = (
        {'Version': '0.0.1'},
        {'Package': 'package'},
        {},
    )
    for repo in incomplete_repos:
        with pytest.raises(KeyError):
            compute_package_url(repo)
@patch('swh.lister.cran.lister.read_cran_data')
def test_cran_lister_cran(mock_cran, datadir, swh_listers):
    """Running the lister records one oneshot 'load-tar' task per package.

    `read_cran_data` is patched so that the listing comes from the
    'list-r-packages.json' file of the test data directory.

    """
    cran_lister = swh_listers['cran']

    listing_path = path.join(datadir, 'list-r-packages.json')
    with open(listing_path) as listing_file:
        packages = json.load(listing_file)

    mock_cran.return_value = packages
    assert len(packages) == 6

    cran_lister.run()

    tasks = cran_lister.scheduler.search_tasks(task_type='load-tar')
    assert len(tasks) == 6

    for task in tasks:
        assert task['type'] == 'load-tar'

        # positional arguments, for example:
        # ['SeleMix',
        #  'https://cran.r-project.org/src/contrib/SeleMix_1.0.1.tar.gz',
        #  '1.0.1']
        task_args = task['arguments']['args']
        assert len(task_args) == 3
        package, url, version = task_args
        expected_url = compute_package_url(
            {'Package': package, 'Version': version})
        assert url == expected_url

        # no keyword argument is expected on the task
        task_kwargs = task['arguments']['kwargs']
        assert task_kwargs == {}

        assert task['policy'] == 'oneshot'