swh.lister.gnu

Implement first pass of gnu lister to list all the
packages present in https://ftp.gnu.org/
Add GNU lister in README and cli.py

Closes T1722
This commit is contained in:
Archit Agrawal 2019-05-17 15:54:20 +05:30
parent f8a2ae866b
commit 151f6cd223
10 changed files with 298 additions and 1 deletions

View file

@ -177,6 +177,18 @@ logging.basicConfig(level=logging.DEBUG)
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
## lister-gnu
Once configured, you can execute a PyPI lister using the following instructions in a `python3` script:
```lang=python
import logging
from swh.lister.gnu.tasks import gnu_lister
logging.basicConfig(level=logging.DEBUG)
gnu_lister()
```
Licensing
---------

View file

@ -12,7 +12,7 @@ from swh.core.cli import CONTEXT_SETTINGS
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
'npm', 'phabricator']
'npm', 'phabricator', 'gnu']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@ -115,6 +115,11 @@ def cli(ctx, db_url, listers, drop_tables):
api_token='',
override_config=override_conf)
elif lister == 'gnu':
from .gnu.models import ModelBase
from .gnu.lister import GNULister
_lister = GNULister(override_config=override_conf)
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %

View file

@ -12,4 +12,5 @@ def celery_includes():
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
'swh.lister.gnu.tasks'
]

View file

217
swh/lister/gnu/lister.py Normal file
View file

@ -0,0 +1,217 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import gzip
import json
import os
import requests
from urllib.parse import urlparse
from .models import GNUModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
class LocalResponse:
"""Local Response class with iter_content api
"""
def __init__(self, path):
self.path = path
def iter_content(self, chunk_size=None):
with open(self.path, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
yield chunk
class ArchiveFetcher:
"""Http/Local client in charge of downloading archives from a
remote/local server.
Args:
temp_directory (str): Path to the temporary disk location used
for downloading the release artifacts
"""
def __init__(self, temp_directory=None):
self.temp_directory = os.getcwd()
self.session = requests.session()
self.params = {
'headers': {
'User-Agent': 'Software Heritage Lister ( __devl__)'
}
}
def download(self, url):
"""Download the remote tarball url locally.
Args:
url (str): Url (file or http*)
Raises:
ValueError in case of failing to query
Returns:
Tuple of local (filepath, hashes of filepath)
"""
url_parsed = urlparse(url)
if url_parsed.scheme == 'file':
path = url_parsed.path
response = LocalResponse(path)
length = os.path.getsize(path)
else:
response = self.session.get(url, **self.params, stream=True)
if response.status_code != 200:
raise ValueError("Fail to query '%s'. Reason: %s" % (
url, response.status_code))
length = int(response.headers['content-length'])
filepath = os.path.join(self.temp_directory, os.path.basename(url))
h = MultiHash(length=length)
with open(filepath, 'wb') as f:
for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
h.update(chunk)
f.write(chunk)
actual_length = os.path.getsize(filepath)
if length != actual_length:
raise ValueError('Error when checking size: %s != %s' % (
length, actual_length))
return filepath
class GNULister(SimpleLister, ArchiveFetcher):
MODEL = GNUModel
LISTER_NAME = 'gnu'
TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
def __init__(self, override_config=None):
SimpleLister.__init__(self, override_config=override_config)
ArchiveFetcher.__init__(self, override_config=override_config)
def task_dict(self, origin_type, origin_url, **kwargs):
"""(Override)
Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
"""
_type = 'load-%s' % origin_type
_policy = 'recurring'
project_name = kwargs.get('name')
project_metadata_url = kwargs.get('html_url')
return utils.create_task_dict(
_type, _policy, project_name, origin_url,
project_metadata_url=project_metadata_url)
def download_file(self):
'''
Downloads tree.json file and returns its location
Returns
File path of the downloaded file
'''
file_path, hash_dict = self.download(self.TREE_URL)
return file_path
def read_downloaded_file(self, file_path):
'''
Reads the downloaded file content and convert it into json format
Returns
File content in json format
'''
with gzip.GzipFile(file_path, 'r') as fin:
response = json.loads(fin.read().decode('utf-8'))
return response
def safely_issue_request(self, identifier):
'''(Override)Make network request with to download the file which
has file structure of the GNU website.
Args:
identifier: resource identifier
Returns:
server response
'''
file_path = self.download_file()
response = self.read_downloaded_file(file_path)
return response
def list_packages(self, response):
"""(Override) List the actual gnu origins with their names and
time last updated from the response.
"""
response = clean_up_response(response)
_packages = []
for directory in response:
content = directory['contents']
for repo in content:
if repo['type'] == 'directory':
repo_details = {
'name': repo['name'],
'url': self._get_project_url(directory['name'],
repo['name']),
'time_modified': repo['time']
}
_packages.append(repo_details)
random.shuffle(_packages)
return _packages
def _get_project_url(self, dir_name, package_name):
"""Returns project_url
"""
return 'https://ftp.gnu.org/%s/%s/' % (dir_name, package_name)
def get_model_from_repo(self, repo):
"""(Override) Transform from repository representation to model
"""
return {
'uid': repo['name'],
'name': repo['name'],
'full_name': repo['name'],
'html_url': repo['url'],
'origin_url': repo['url'],
'time_last_upated': repo['time_modified'],
'origin_type': 'gnu',
'description': None,
}
def transport_response_simplified(self, response):
"""(Override) Transform response to list for model manipulation
"""
return [self.get_model_from_repo(repo) for repo in response]
def transport_request(self):
pass
def transport_response_to_string(self):
pass
def transport_quota_check(self):
pass
def clean_up_response(response):
final_response = []
file_system = response[0]['content']
for directory in file_system:
if directory['name'] in ('gnu', 'mirrors', 'old-gnu'):
final_response.append(directory)
return final_response

17
swh/lister/gnu/models.py Normal file
View file

@ -0,0 +1,17 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String, Integer
from ..core.models import ModelBase
class GNUModel(ModelBase):
"""a GNU repository representation
"""
__tablename__ = 'gnu_repo'
uid = Column(String, primary_key=True)
time_last_upated = Column(Integer)

17
swh/lister/gnu/tasks.py Normal file
View file

@ -0,0 +1,17 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.celery_backend.config import app
from .lister import GNULister
@app.task(name=__name__ + '.GNUListerTask')
def gnu_lister(**lister_args):
GNULister(**lister_args).run()
@app.task(name=__name__ + '.ping')
def ping():
return 'OK'

View file

View file

@ -0,0 +1 @@
from swh.lister.core.tests.conftest import * # noqa

View file

@ -0,0 +1,27 @@
from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.gnu.tasks.ping')
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
@patch('swh.lister.gnu.tasks.GNULister')
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked GNULister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.gnu.tasks.GNUListerTask')
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with()