cran.lister: Refactor and fix cran lister
Prior to this commit, the lister code was duplicated alongside an old version that would not work. Related: D1492#41287
This commit is contained in:
parent
85d001067a
commit
d30d574dbe
1 changed file with 30 additions and 51 deletions
|
@ -1,17 +1,22 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
import subprocess
|
||||
|
||||
import json
|
||||
import logging
|
||||
import pkg_resources
|
||||
import subprocess
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict
|
||||
|
||||
from swh.lister.cran.models import CRANModel
|
||||
|
||||
from swh.scheduler.utils import create_task_dict
|
||||
from swh.core import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.scheduler.utils import create_task_dict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CRANLister(SimpleLister):
|
||||
|
@ -32,15 +37,17 @@ class CRANLister(SimpleLister):
|
|||
kwargs.get('name'), origin_url, kwargs.get('version'),
|
||||
project_metadata=self.descriptions[kwargs.get('name')])
|
||||
|
||||
def r_script_request(self):
|
||||
"""Runs R script which uses inbuilt API to return a json
|
||||
response containing data about all the R packages
|
||||
def safely_issue_request(self, identifier: str) -> List[Dict]:
|
||||
"""Runs R script which uses inbuilt API to return a json response
|
||||
containing data about all the R packages.
|
||||
|
||||
Returns:
|
||||
List of dictionaries
|
||||
example
|
||||
List of Dict about r packages.
|
||||
|
||||
Sample:
|
||||
[
|
||||
{'Package': 'A3',
|
||||
{
|
||||
'Package': 'A3',
|
||||
'Version': '1.0.0',
|
||||
'Title':
|
||||
'Accurate, Adaptable, and Accessible Error Metrics for
|
||||
|
@ -48,22 +55,27 @@ class CRANLister(SimpleLister):
|
|||
'Description':
|
||||
'Supplies tools for tabulating and analyzing the results
|
||||
of predictive models. The methods employed are ... '
|
||||
}
|
||||
{'Package': 'abbyyR',
|
||||
},
|
||||
{
|
||||
'Package': 'abbyyR',
|
||||
'Version': '0.5.4',
|
||||
'Title':
|
||||
'Access to Abbyy Optical Character Recognition (OCR) API',
|
||||
'Description': 'Get text from images of text using Abbyy
|
||||
Cloud Optical Character\n ...'
|
||||
}
|
||||
Cloud Optical Character\n ...'
|
||||
},
|
||||
...
|
||||
]
|
||||
|
||||
"""
|
||||
file_path = pkg_resources.resource_filename('swh.lister.cran',
|
||||
'list_all_packages.R')
|
||||
response = subprocess.run(file_path, stdout=subprocess.PIPE,
|
||||
shell=False)
|
||||
return json.loads(response.stdout)
|
||||
filepath = pkg_resources.resource_filename('swh.lister.cran',
|
||||
'list_all_packages.R')
|
||||
logger.debug('script list-all-packages.R path: %s', filepath)
|
||||
response = subprocess.run(
|
||||
filepath, stdout=subprocess.PIPE, shell=False)
|
||||
data = json.loads(response.stdout)
|
||||
logger.debug('r-script-request: %s', data)
|
||||
return data
|
||||
|
||||
def get_model_from_repo(self, repo):
|
||||
"""Transform from repository representation to model
|
||||
|
@ -87,36 +99,3 @@ class CRANLister(SimpleLister):
|
|||
|
||||
"""
|
||||
return [self.get_model_from_repo(repo) for repo in response]
|
||||
|
||||
def ingest_data(self, identifier, checks=False):
    """Rework the base ingest_data.

    Request server endpoint which gives all in one go.

    Simplify and filter response list of repositories. Inject
    repo information into local db. Queue loader tasks for
    linked repositories. Models are processed in batches of 10000.

    Args:
        identifier: Resource identifier (unused)
        checks (bool): Additional checks required (unused)

    Returns:
        A ``(response, all_injected)`` tuple. ``response`` is whatever
        ``r_script_request`` returned. ``all_injected`` holds one entry
        per processed batch, each being the value returned by
        ``inject_repo_data_into_db`` for that batch. When the response
        is empty, ``(response, [])`` is returned and nothing is injected.

    """
    response = self.r_script_request()
    if not response:
        # Nothing listed: skip the db and scheduler work entirely.
        return response, []

    models_list = self.transport_response_simplified(response)
    models_list = self.filter_before_inject(models_list)

    all_injected = []
    for models in utils.grouper(models_list, n=10000):
        models = list(models)
        # Pass the value as a logging argument so the message is only
        # formatted when DEBUG output is actually enabled.
        logging.debug('models: %s', len(models))
        # inject into local db
        injected = self.inject_repo_data_into_db(models)
        # queue workers
        self.create_missing_origins_and_tasks(models, injected)
        all_injected.append(injected)
        # flush: commit this batch, then continue with a new session
        self.db_session.commit()
        self.db_session = self.mk_session()

    return response, all_injected
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue