cran.lister: Fix cran lister and add proper integration test

The integration test checks the tasks that the cran lister writes to the scheduler. Related: d30d574dbe, 5ea9d5ed39, T2032.
2019-10-10 20:52:41 +02:00 · 2019-10-10 20:52:41 +02:00 · 8d50e0d941
commit 8d50e0d941
parent ef2c1847e4
6 changed files with 205 additions and 67 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -6,4 +6,4 @@ include requirements-test.txt
 include version.txt
 include swh/lister/cran/list_all_packages.R
 recursive-include swh/lister/*/tests/ *.json *.html *.txt *.* *
-recursive-include swh/lister/cgit/tests/data/ *.* *
+recursive-include swh/lister/*/tests/data/ *.* *
--- a/swh/lister/core/simple_lister.py
+++ b/swh/lister/core/simple_lister.py
@ -1,4 +1,5 @@
-# Copyright (C) 2018 the Software Heritage developers
+# Copyright (C) 2018-2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@ -1,5 +1,11 @@
+# Copyright (C) 2019  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
 from swh.scheduler.tests.conftest import *  # noqa

+import logging
 import pytest

 from sqlalchemy import create_engine
@ -8,6 +14,9 @@ from swh.lister import get_lister, SUPPORTED_LISTERS
 from swh.lister.core.models import initialize


+logger = logging.getLogger(__name__)
+
+
@pytest.fixture
 def swh_listers(request, postgresql_proc, postgresql, swh_scheduler):
    db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format(
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@ -1,4 +1,5 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

@ -7,8 +8,7 @@ import logging
 import pkg_resources
 import subprocess

-from collections import defaultdict
-from typing import List, Dict
+from typing import List, Mapping

 from swh.lister.cran.models import CRANModel

@ -19,77 +19,114 @@ from swh.scheduler.utils import create_task_dict
 logger = logging.getLogger(__name__)


+def read_cran_data() -> List[Mapping[str, str]]:
+    """Execute the R script to read the CRAN listing.
+
+    """
+    filepath = pkg_resources.resource_filename('swh.lister.cran',
+                                               'list_all_packages.R')
+    logger.debug('script list-all-packages.R path: %s', filepath)
+    response = subprocess.run(
+        filepath, stdout=subprocess.PIPE, shell=False, encoding='utf-8')
+    return json.loads(response.stdout)
+
+
+def compute_package_url(repo: Mapping[str, str]) -> str:
+    """Compute the package url from the repo dict.
+
+    Args:
+        repo: dict with key 'Package', 'Version'
+
+    Returns:
+        the package url
+
+    """
+    return 'https://cran.r-project.org/src/contrib' \
+        '/%(Package)s_%(Version)s.tar.gz' % repo
+
+
 class CRANLister(SimpleLister):
    MODEL = CRANModel
    LISTER_NAME = 'cran'
    instance = 'cran'
-    descriptions = defaultdict(dict)

    def task_dict(self, origin_type, origin_url, **kwargs):
-        """Return task format dict
+        """Return task format dict. This creates tasks with args and kwargs
+        set, for example::
+
+            args: ['package', 'https://cran.r-project.org/...', 'version']
+            kwargs: {}

-        This is overridden from the lister_base as more information is
-        needed for the ingestion task creation.
        """
+        policy = kwargs.get('policy', 'oneshot')
+        package = kwargs.get('name')
+        version = kwargs.get('version')
        return create_task_dict(
            'load-%s' % origin_type,
-            kwargs.get('policy', 'recurring'),
-            kwargs.get('name'), origin_url, kwargs.get('version'),
-            project_metadata=self.descriptions[kwargs.get('name')])
+            policy, package, origin_url, version,
+            retries_left=3,
+        )

-    def safely_issue_request(self, identifier: str) -> List[Dict]:
-        """Runs R script which uses inbuilt API to return a json response
-           containing data about all the R packages.
+    def safely_issue_request(self, identifier):
+        """Bypass the implementation. It is now `list_packages` which
+        returns the data.

-        Returns:
-            List of Dict about r packages.
-
-        Sample:
-            [
-              {
-                'Package': 'A3',
-                'Version': '1.0.0',
-                'Title':
-                    'Accurate, Adaptable, and Accessible Error Metrics for
-                     Predictive\nModels',
-                'Description':
-                    'Supplies tools for tabulating and analyzing the results
-                     of predictive models. The methods employed are ... '
-              },
-              {
-                'Package': 'abbyyR',
-                'Version': '0.5.4',
-                'Title':
-                    'Access to Abbyy Optical Character Recognition (OCR) API',
-                'Description': 'Get text from images of text using Abbyy
-                                Cloud Optical Character\n ...'
-               },
-                ...
-            ]
+        As an implementation detail, we cannot simply change the base
+        SimpleLister yet, as other implementations still use it. This shall
+        be part of another refactoring pass.

        """
-        filepath = pkg_resources.resource_filename('swh.lister.cran',
-                                                   'list_all_packages.R')
-        logger.debug('script list-all-packages.R path: %s', filepath)
-        response = subprocess.run(
-            filepath, stdout=subprocess.PIPE, shell=False)
-        data = json.loads(response.stdout)
-        logger.debug('r-script-request: %s', data)
-        return data
+        return None

-    def get_model_from_repo(self, repo):
+    def list_packages(self, *args) -> List[Mapping[str, str]]:
+        """Runs an R script which uses the built-in API to return a JSON
+           response containing data about the R packages.
+
+        Returns:
+            List of dicts describing R packages. For example:
+
+            .. code-block:: python
+
+                [
+                    {
+                        'Package': 'A3',
+                        'Version': '1.0.0',
+                        'Title':
+                            'Accurate, Adaptable, and Accessible Error Metrics
+                             for Predictive\nModels',
+                        'Description':
+                            'Supplies tools for tabulating and analyzing the
+                             results of predictive models. The methods employed
+                             are ... '
+                    },
+                    {
+                        'Package': 'abbyyR',
+                        'Version': '0.5.4',
+                        'Title':
+                            'Access to Abbyy OCR (OCR) API',
+                        'Description': 'Get text from images of text using
+                                        Abbyy Cloud Optical Character\n ...'
+                    },
+                    ...
+                ]
+
+        """
+        return read_cran_data()
+
+    def get_model_from_repo(
+            self, repo: Mapping[str, str]) -> Mapping[str, str]:
        """Transform from repository representation to model

        """
-        self.descriptions[repo["Package"]] = repo['Description']
-        project_url = 'https://cran.r-project.org/src/contrib' \
-                      '/%(Package)s_%(Version)s.tar.gz' % repo
+        logger.debug('repo: %s', repo)
+        project_url = compute_package_url(repo)
+        package = repo['Package']
        return {
-            'uid': repo["Package"],
-            'name': repo["Package"],
-            'full_name': repo["Title"],
-            'version': repo["Version"],
+            'uid': package,
+            'name': package,
+            'full_name': repo['Title'],
+            'version': repo['Version'],
            'html_url': project_url,
            'origin_url': project_url,
-            'origin_type': 'cran',
+            'origin_type': 'tar',
        }
--- a/swh/lister/cran/tests/data/list-r-packages.json
+++ b/swh/lister/cran/tests/data/list-r-packages.json
@ -0,0 +1,39 @@
+[
+    {
+        "Package": "SeleMix",
+        "Version": "1.0.1",
+        "Title": "Selective Editing via Mixture Models",
+        "Description": "Detection of outliers and influential errors using a latent variable model. "
+    },
+    {
+        "Package": "plink",
+        "Version": "1.5-1",
+        "Title": "IRT Separate Calibration Linking Methods",
+        "Description": "Item response theory based methods are used to compute\n        linking constants and conduct chain linking of unidimensional\n        or multidimensional tests for multiple groups under a common\n        item design.  The unidimensional methods include the Mean/Mean,\n        Mean/Sigma, Haebara, and Stocking-Lord methods for dichotomous\n        (1PL, 2PL and 3PL) and/or polytomous (graded response, partial\n        credit/generalized partial credit, nominal, and multiple-choice\n        model) items.  The multidimensional methods include the least\n        squares method and extensions of the Haebara and Stocking-Lord\n        method using single or multiple dilation parameters for\n        multidimensional extensions of all the unidimensional\n        dichotomous and polytomous item response models.  The package\n        also includes functions for importing item and/or ability\n        parameters from common IRT software, conducting IRT true score\n        and observed score equating, and plotting item response\n        curves/surfaces, vector plots, information plots, and comparison \n        plots for examining parameter drift."
+    },
+    {
+        "Package": "justifier",
+        "Version": "0.1.0",
+        "Title": "Human and Machine-Readable Justifications and Justified\nDecisions Based on 'YAML'",
+        "Description": "Leverages the 'yum' package to\n             implement a 'YAML' ('YAML Ain't Markup Language', a human\n             friendly standard for data serialization; see <https:yaml.org>)\n             standard for documenting justifications, such as for decisions\n             taken during the planning, execution and analysis of a study\n             or during the development of a behavior change intervention\n             as illustrated by Marques & Peters (2019)\n             <doi:10.17605/osf.io/ndxha>. These justifications are both\n             human- and machine-readable, facilitating efficient extraction\n             and organisation."
+    },
+    {
+        "Package": "Records",
+        "Version": "1.0",
+        "Title": "Record Values and Record Times",
+        "Description": "Functions for generating k-record values and k-record\n        times"
+    },
+    {
+        "Package": "scRNAtools",
+        "Version": "1.0",
+        "Title": "Single Cell RNA Sequencing Data Analysis Tools",
+        "Description": "We integrated the common analysis methods utilized in single cell RNA sequencing data, which included cluster method, principal components analysis (PCA), the filter of differentially expressed genes, pathway enrichment analysis and correlated analysis methods."
+    },
+
+    {
+        "Package": "Deriv",
+        "Version": "3.9.0",
+        "Title": "Symbolic Differentiation",
+        "Description": "R-based solution for symbolic differentiation. It admits\n    user-defined function as well as function substitution\n    in arguments of functions to be differentiated. Some symbolic\n    simplification is part of the work."
+    }
+]
--- a/swh/lister/cran/tests/test_lister.py
+++ b/swh/lister/cran/tests/test_lister.py
@ -1,13 +1,65 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import pytest
+
+from os import path
 from unittest.mock import patch
-from swh.lister.cran.lister import CRANLister
+
+from swh.lister.cran.lister import compute_package_url


-def test_task_dict():
-    lister = CRANLister()
-    lister.descriptions['test_pack'] = 'Test Description'
-    with patch('swh.lister.cran.lister.create_task_dict') as mock_create_tasks:
-        lister.task_dict(origin_type='cran', origin_url='https://abc',
-                         name='test_pack')
-    mock_create_tasks.assert_called_once_with(
-        'load-cran', 'recurring', 'test_pack', 'https://abc', None,
-        project_metadata='Test Description')
+def test_cran_compute_package_url():
+    url = compute_package_url({'Package': 'something', 'Version': '0.0.1'})
+
+    assert url == 'https://cran.r-project.org/src/contrib/%s_%s.tar.gz' % (
+        'something',
+        '0.0.1',
+    )
+
+
+def test_cran_compute_package_url_failure():
+    for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]:
+        with pytest.raises(KeyError):
+            compute_package_url(incomplete_repo)
+
+
+@patch('swh.lister.cran.lister.read_cran_data')
+def test_cran_lister_cran(mock_cran, datadir, swh_listers):
+    lister = swh_listers['cran']
+
+    with open(path.join(datadir, 'list-r-packages.json')) as f:
+        data = json.loads(f.read())
+
+    mock_cran.return_value = data
+    assert len(data) == 6
+
+    lister.run()
+
+    r = lister.scheduler.search_tasks(task_type='load-tar')
+    assert len(r) == 6
+
+    for row in r:
+        assert row['type'] == 'load-tar'
+        # arguments check
+        args = row['arguments']['args']
+        assert len(args) == 3
+        # ['SeleMix',
+        #  'https://cran.r-project.org/src/contrib/SeleMix_1.0.1.tar.gz',
+        #  '1.0.1']
+
+        package = args[0]
+        url = args[1]
+        version = args[2]
+
+        assert url == compute_package_url(
+            {'Package': package, 'Version': version})
+
+        # kwargs
+        kwargs = row['arguments']['kwargs']
+        assert kwargs == {}
+
+        assert row['policy'] == 'oneshot'