swh.lister.gnu
Implement a first pass of the GNU lister to list all the packages present at https://ftp.gnu.org/. Add the GNU lister to README and cli.py. Closes T1722.
This commit is contained in:
parent
f8a2ae866b
commit
151f6cd223
10 changed files with 298 additions and 1 deletions
0
swh/lister/gnu/__init__.py
Normal file
0
swh/lister/gnu/__init__.py
Normal file
217
swh/lister/gnu/lister.py
Normal file
217
swh/lister/gnu/lister.py
Normal file
|
@ -0,0 +1,217 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import random
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from .models import GNUModel
|
||||
|
||||
from swh.scheduler import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
|
||||
|
||||
|
||||
class LocalResponse:
    """File-backed object offering an ``iter_content`` streaming method.

    It lets a file on the local disk be consumed through the same
    ``iter_content(chunk_size=...)`` call that is made on HTTP responses.

    """
    def __init__(self, path):
        # Location of the file to stream; it is only opened when iterating.
        self.path = path

    def iter_content(self, chunk_size=None):
        """Yield the file's bytes in pieces of at most ``chunk_size`` bytes.

        With the default ``chunk_size=None`` the whole file is read at
        once and yielded as a single chunk. Nothing is yielded for an
        empty file.

        """
        with open(self.path, 'rb') as stream:
            # read() returns b'' at end of file, which ends the iteration.
            yield from iter(lambda: stream.read(chunk_size), b'')
|
||||
|
||||
|
||||
class ArchiveFetcher:
    """Http/Local client in charge of downloading archives from a
    remote/local server.

    Args:
        temp_directory (str): Path to the disk location used for
                              downloading the artifacts. Defaults to the
                              current working directory.

    """
    def __init__(self, temp_directory=None):
        # The argument used to be ignored (the current working directory
        # was always used); honour it and keep cwd as the default.
        self.temp_directory = temp_directory or os.getcwd()
        self.session = requests.session()
        # Extra keyword arguments passed to every HTTP GET request.
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Lister ( __devl__)'
            }
        }

    def download(self, url):
        """Download the remote tarball url locally.

        Args:
            url (str): Url (file or http*)

        Raises:
            ValueError in case of failing to query, when no file name can
            be derived from the url, or when the size of the written file
            does not match the expected size.

        Returns:
            Local filepath of the downloaded file

        """
        url_parsed = urlparse(url)

        # Name the local file after the last component of the url *path*,
        # so that a query string or fragment does not leak into the name.
        filename = os.path.basename(url_parsed.path)
        if not filename:
            raise ValueError(
                "Fail to derive a file name from '%s'" % url)
        filepath = os.path.join(self.temp_directory, filename)

        if url_parsed.scheme == 'file':
            path = url_parsed.path
            response = LocalResponse(path)
            length = os.path.getsize(path)
        else:
            response = self.session.get(url, **self.params, stream=True)
            if response.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, response.status_code))
            # The header may be absent; in that case the size check
            # below is skipped instead of failing with a KeyError.
            content_length = response.headers.get('content-length')
            length = None if content_length is None else int(content_length)

        # Stream to disk chunk by chunk. The hashes previously computed
        # here were never returned nor used, so they are no longer computed.
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                f.write(chunk)

        if length is not None:
            actual_length = os.path.getsize(filepath)
            if length != actual_length:
                raise ValueError('Error when checking size: %s != %s' % (
                    length, actual_length))

        return filepath
|
||||
|
||||
|
||||
class GNULister(SimpleLister, ArchiveFetcher):
    """Lister for the packages published on https://ftp.gnu.org/.

    The listing is built from the gzipped JSON tree at ``TREE_URL``: every
    sub-directory of the top-level directories kept by
    ``clean_up_response`` is reported as one package.

    """
    MODEL = GNUModel
    LISTER_NAME = 'gnu'
    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'

    def __init__(self, override_config=None):
        SimpleLister.__init__(self, override_config=override_config)
        # ArchiveFetcher.__init__ only accepts ``temp_directory``; handing
        # it ``override_config`` raised TypeError on every instantiation.
        ArchiveFetcher.__init__(self)

    def task_dict(self, origin_type, origin_url, **kwargs):
        """(Override)
        Return task format dict

        This is overridden from the lister_base as more information is
        needed for the ingestion task creation.

        """
        _type = 'load-%s' % origin_type
        _policy = 'recurring'
        project_name = kwargs.get('name')
        project_metadata_url = kwargs.get('html_url')
        return utils.create_task_dict(
            _type, _policy, project_name, origin_url,
            project_metadata_url=project_metadata_url)

    def download_file(self):
        '''
            Downloads the tree.json.gz file and returns its location

            Returns
            File path of the downloaded file
        '''
        # download() returns the local path only; unpacking it into
        # (file_path, hash_dict) raised ValueError.
        return self.download(self.TREE_URL)

    def read_downloaded_file(self, file_path):
        '''
            Reads the gzip-compressed file and parses its content as JSON

            Args:
                file_path: location of the downloaded gzip file
            Returns
            File content decoded from JSON
        '''
        with gzip.GzipFile(file_path, 'r') as fin:
            response = json.loads(fin.read().decode('utf-8'))
        return response

    def safely_issue_request(self, identifier):
        '''(Override) Download the file describing the file structure of
        the GNU website and return its parsed content.

        Args:
            identifier: resource identifier (not used by this lister)
        Returns:
            parsed JSON content of the tree file
        '''
        file_path = self.download_file()
        response = self.read_downloaded_file(file_path)
        return response

    def list_packages(self, response):
        """(Override) List the actual gnu origins with their names and
        time last updated from the response.

        Only entries of type 'directory' found directly under the
        directories kept by ``clean_up_response`` are listed. The result
        is returned in random order.

        """
        response = clean_up_response(response)
        _packages = []
        for directory in response:
            content = directory['contents']
            for repo in content:
                if repo['type'] == 'directory':
                    repo_details = {
                        'name': repo['name'],
                        'url': self._get_project_url(directory['name'],
                                                     repo['name']),
                        'time_modified': repo['time']
                    }
                    _packages.append(repo_details)
        random.shuffle(_packages)
        return _packages

    def _get_project_url(self, dir_name, package_name):
        """Returns the project url built from the top-level directory
        name and the package name.

        """
        return 'https://ftp.gnu.org/%s/%s/' % (dir_name, package_name)

    def get_model_from_repo(self, repo):
        """(Override) Transform from repository representation to model

        ``repo`` is one of the dicts built by ``list_packages``.

        """
        return {
            'uid': repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['url'],
            'origin_url': repo['url'],
            # Spelling matches the GNUModel column name; do not "fix" one
            # without the other.
            'time_last_upated': repo['time_modified'],
            'origin_type': 'gnu',
            'description': None,
        }

    def transport_response_simplified(self, response):
        """(Override) Transform response to list for model manipulation

        """
        return [self.get_model_from_repo(repo) for repo in response]

    # The three transport hooks below intentionally do nothing: the tree
    # file is fetched by safely_issue_request instead.
    # NOTE(review): presumably required by the base lister interface --
    # confirm against SimpleLister.
    def transport_request(self):
        pass

    def transport_response_to_string(self):
        pass

    def transport_quota_check(self):
        pass
|
||||
|
||||
|
||||
def clean_up_response(response):
    """Keep only the top-level directories that are listed for packages.

    Args:
        response (list): parsed tree file; its first element describes the
            root directory and holds the top-level directories.

    Returns:
        list of the top-level directory entries named 'gnu', 'mirrors' or
        'old-gnu', in their original order. An empty response gives an
        empty list.

    """
    if not response:
        return []

    root = response[0]
    # Nested directories are read through the 'contents' key when packages
    # are listed, so the root is read through the same key. The 'content'
    # spelling used previously is still accepted as a fallback.
    file_system = root['contents'] if 'contents' in root else root['content']

    wanted = ('gnu', 'mirrors', 'old-gnu')
    return [directory for directory in file_system
            if directory['name'] in wanted]
|
17
swh/lister/gnu/models.py
Normal file
17
swh/lister/gnu/models.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String, Integer
|
||||
|
||||
from ..core.models import ModelBase
|
||||
|
||||
|
||||
class GNUModel(ModelBase):
    """Database model of one GNU package, stored in table 'gnu_repo'.

    """
    __tablename__ = 'gnu_repo'

    # Primary key of the row.
    uid = Column(String, primary_key=True)
    # NOTE(review): the name is misspelt ('upated'); it is kept as is
    # because renaming the column changes the schema.
    time_last_upated = Column(Integer)
|
17
swh/lister/gnu/tasks.py
Normal file
17
swh/lister/gnu/tasks.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.scheduler.celery_backend.config import app
|
||||
|
||||
from .lister import GNULister
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.GNUListerTask')
def gnu_lister(**lister_args):
    """Task building a GNULister from the keyword arguments and running it."""
    lister = GNULister(**lister_args)
    lister.run()
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.ping')
def ping():
    """Task that takes no argument and returns the constant string 'OK'."""
    return 'OK'
|
0
swh/lister/gnu/tests/__init__.py
Normal file
0
swh/lister/gnu/tests/__init__.py
Normal file
1
swh/lister/gnu/tests/conftest.py
Normal file
1
swh/lister/gnu/tests/conftest.py
Normal file
|
@ -0,0 +1 @@
|
|||
from swh.lister.core.tests.conftest import * # noqa
|
27
swh/lister/gnu/tests/test_tasks.py
Normal file
27
swh/lister/gnu/tests/test_tasks.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
from unittest.mock import patch
|
||||
|
||||
|
||||
def test_ping(swh_app, celery_session_worker):
    """The ping task completes successfully and answers 'OK'."""
    async_result = swh_app.send_task('swh.lister.gnu.tasks.ping')
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == 'OK'
|
||||
|
||||
|
||||
@patch('swh.lister.gnu.tasks.GNULister')
def test_lister(lister, swh_app, celery_session_worker):
    """The lister task instantiates GNULister without arguments and runs it."""
    # The patched class returns itself when called, so calls made on the
    # "instance" are recorded on the same mock.
    lister.return_value = lister
    lister.run.return_value = None

    async_result = swh_app.send_task('swh.lister.gnu.tasks.GNUListerTask')
    assert async_result

    async_result.wait()
    assert async_result.successful()

    lister.assert_called_once_with()
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue