gnu.tests: Checks lister output from scheduler

This also adds a swh-listers fixture which allows retrieving a test-ready
lister from its name (e.g. gnu). Those listers have access to a scheduler
fixture so we can check the listing output from the scheduler instance.
This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-10-04 18:18:58 +02:00
parent 394658e53b
commit 0f0b840178
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
5 changed files with 404 additions and 215 deletions

View file

@ -2,118 +2,89 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import gzip
import json
import requests
from pathlib import Path
from collections import defaultdict
from .models import GNUModel
import logging
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.gnu.models import GNUModel
from swh.lister.gnu.tree import GNUTree
logger = logging.getLogger(__name__)
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = 'gnu'
TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
BASE_URL = 'https://ftp.gnu.org'
instance = 'gnu'
tarballs = defaultdict(dict) # Dict of key with project name value the
# associated is list of tarballs of package to ingest from the gnu mirror
def __init__(self, *args, **kwargs):
    """Initialize the lister and its GNUTree helper.

    GNUTree lazily fetches and parses the gnu mirror's tree.json.gz
    listing (projects and their artifacts).
    """
    super().__init__(*args, **kwargs)
    # NOTE(review): this hard-coded url duplicates the TREE_URL class
    # attribute — consider referencing self.TREE_URL instead.
    self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
def task_dict(self, origin_type, origin_url, **kwargs):
"""
Return task format dict
"""Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
This creates tasks with args and kwargs set, for example:
.. code-block:: python
args: ['https://ftp.gnu.org/gnu/3dldf/']
kwargs: {
'tarballs': [{
'archive': 'https://...',
'time': 1071002600,
'length': 128},
...
]}
"""
tarballs = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
kwargs.get('name'),
origin_url,
tarballs=self.tarballs[kwargs.get('name')])
tarballs=tarballs)
def safely_issue_request(self, identifier):
'''
Download and unzip tree.json.gz file and returns its content
in JSON format
"""Bypass the implementation. It's now the GNUTree which deals with
querying the gnu mirror.
File content in dictionary format
As an implementation detail, we cannot change simply the base
SimpleLister as other implementation still uses it. This shall be part
of another refactoring pass.
Args:
identifier: resource identifier (unused)
Returns:
Server response
'''
response = requests.get(self.TREE_URL,
allow_redirects=True)
uncompressed_content = gzip.decompress(response.content)
return json.loads(uncompressed_content.decode('utf-8'))
"""
return None
def list_packages(self, response):
"""
List the actual gnu origins with their names,url and the list
of all the tarball for a package from the response.
"""List the actual gnu origins (package name) with their name, url and
associated tarballs.
Args:
response : File structure of the website
in dictionary format
response: Unused
Returns:
A list of all the packages with their names, url of their root
directory and the tarballs present for the particular package.
[
{'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
'tarballs':
[
{'archive':
'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
'date': '1071002600'},
{'archive':
'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
'date': '1071078759'}}
]
},
{'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
'tarballs':
[
{'archive':
'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'date': '1461357336'},
{'archive':
'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
'date': '1480991830'}
]
]
List of packages name, url, last modification time
.. code-block:: python
[
{'name': '3dldf',
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'time_modified': 1071002600},
{'name': '8sync',
'url': 'https://ftp.gnu.org/gnu/8sync/',
'time_modified': 1480991830},
...
]
"""
response = filter_directories(response)
packages = []
for directory in response:
content = directory['contents']
for repo in content:
if repo['type'] == 'directory':
package_url = '%s/%s/%s/' % (self.BASE_URL,
directory['name'],
repo['name'])
package_tarballs = find_tarballs(
repo['contents'], package_url)
if package_tarballs != []:
repo_details = {
'name': repo['name'],
'url': package_url,
'time_modified': repo['time'],
}
self.tarballs[repo['name']] = package_tarballs
packages.append(repo_details)
random.shuffle(packages)
return packages
return list(self.gnu_tree.projects.values())
def get_model_from_repo(self, repo):
"""Transform from repository representation to model
@ -128,89 +99,3 @@ class GNULister(SimpleLister):
'time_last_updated': int(repo['time_modified']),
'origin_type': 'tar',
}
def find_tarballs(package_file_structure, url):
    """Recursively collect the tarballs present under a package directory.

    Args:
        package_file_structure: file structure of the package root
            directory (list of dicts with keys: name, size, time, type)
        url: URL of the corresponding package

    Returns:
        List of tarball urls with their associated metadata, e.g.:

        [
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
             'time': 1071002600,
             'length': 543},
            ...
        ]

    """
    found = []
    for entry in package_file_structure:
        entry_name = entry['name']
        entry_type = entry['type']
        if entry_type == 'file' and file_extension_check(entry_name):
            found.append({
                'archive': url + entry_name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
        elif entry_type == 'directory':
            # descend into sub-folders and accumulate their tarballs
            found.extend(
                find_tarballs(entry['contents'], url + entry_name + '/'))
    return found
def filter_directories(response):
    """Keep only the 'gnu' and 'old-gnu' folders from the tree.json data.

    Args:
        response: raw tree.json structure (list whose first element holds
            the top-level 'contents')

    Returns:
        The top-level directory entries named 'gnu' or 'old-gnu'.

    """
    top_level = response[0]['contents']
    return [
        directory for directory in top_level
        if directory['name'] in ('gnu', 'old-gnu')
    ]
def file_extension_check(file_name):
    """Check whether file_name looks like a source archive.

    A file is an archive when it is a zip file or a tarball (a bare
    '.tar', or '.tar' followed by exactly one compression suffix such as
    '.gz' or '.bz2'). Extra trailing suffixes (e.g. detached signatures
    ending in '.sig') are rejected.

    Args:
        file_name: name of the file for which the extension needs to
            be checked.

    Returns:
        True if file_name is an archive, False otherwise

    Example:
        file_extension_check('abc.zip')         # True
        file_extension_check('abc.tar.gz')      # True
        file_extension_check('abc.tar')         # True
        file_extension_check('abc.tar.gz.sig')  # False
    """
    file_suffixes = Path(file_name).suffixes
    if len(file_suffixes) == 1:
        # Bug fix: plain '.tar' archives (no compression suffix) were
        # previously rejected, although they are valid archives.
        return file_suffixes[-1] in ('.zip', '.tar')
    elif len(file_suffixes) > 1:
        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
            return True
    return False

Binary file not shown.

View file

@ -1,59 +1,41 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from swh.lister.gnu.lister import find_tarballs, filter_directories
from swh.lister.gnu.lister import file_extension_check
import logging
def test_filter_directories():
    """filter_directories keeps only the gnu and old-gnu top folders."""
    # Use a context manager so the fixture file is closed deterministically
    # (the previous version leaked the open file handle).
    with open('swh/lister/gnu/tests/api_response.json') as f:
        api_response = json.load(f)
    cleared_api_response = filter_directories(api_response)
    for directory in cleared_api_response:
        # Assert directly on the property instead of `assert False` in a
        # conditional, so failures report the offending name.
        assert directory['name'] in ('gnu', 'old-gnu')
logger = logging.getLogger(__name__)
def test_find_tarballs_small_sample():
expected_tarballs = [
{
'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
'time': 1495205979,
'length': 424081,
},
{
'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'time': 898422900,
'length': 1514448
},
{
'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'time': 869814000,
'length': 450164,
},
{
'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'time': 898422900,
'length': 514951,
},
]
def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
lister = swh_listers['gnu']
file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
actual_tarballs = find_tarballs(file_structure, '/root/')
assert actual_tarballs == expected_tarballs
lister.run()
r = lister.scheduler.search_tasks(task_type='load-tar')
assert len(r) == 383
def test_find_tarballs():
file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
actual_tarballs = find_tarballs(file_structure, '/root/')
assert len(actual_tarballs) == 42 + 3 # tar + zip
for row in r:
assert row['type'] == 'load-tar'
# arguments check
args = row['arguments']['args']
assert len(args) == 1
url = args[0]
assert url.startswith('https://ftp.gnu.org')
def test_file_extension_check():
assert file_extension_check('abc.xy.zip')
assert file_extension_check('cvb.zip')
assert file_extension_check('abc.tar.bz2')
assert file_extension_check('abc') is False
url_suffix = url.split('https://ftp.gnu.org')[1]
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
# kwargs
kwargs = row['arguments']['kwargs']
assert list(kwargs.keys()) == ['tarballs']
tarballs = kwargs['tarballs']
# check the tarball's structure
tarball = tarballs[0]
assert set(tarball.keys()) == set(['archive', 'length', 'time'])
assert row['policy'] == 'oneshot'

View file

@ -0,0 +1,135 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import pytest
from os import path
from swh.lister.gnu.tree import (
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
)
def test_load_raw_data_from_query(requests_mock_datadir):
    """Loading tree.json.gz over http yields the raw json list."""
    tree = load_raw_data('https://ftp.gnu.org/tree.json.gz')
    assert tree is not None
    assert isinstance(tree, list)
    assert len(tree) == 2
def test_load_raw_data_from_query_failure(requests_mock_datadir):
    """Querying an unknown url must raise a ValueError."""
    url = 'https://ftp2.gnu.org/tree.unknown.gz'
    with pytest.raises(ValueError, match='Error during query'):
        load_raw_data(url)
def test_load_raw_data_from_file(datadir):
    """Loading tree.json.gz from a local path yields the raw json list."""
    tree = load_raw_data(path.join(datadir, 'ftp.gnu.org', 'tree.json.gz'))
    assert tree is not None
    assert isinstance(tree, list)
    assert len(tree) == 2
def test_load_raw_data_from_file_failure(datadir):
    """Loading from a missing local path must raise FileNotFoundError."""
    missing_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')
    with pytest.raises(FileNotFoundError):
        load_raw_data(missing_path)
def test_tree_json(requests_mock_datadir):
    """GNUTree exposes projects and artifacts parsed from tree.json.gz."""
    tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz')

    # Each project entry maps its url to name/time_modified/url.
    expected_projects = {
        'https://ftp.gnu.org/gnu/8sync/': ('8sync', '1489817408'),
        'https://ftp.gnu.org/gnu/3dldf/': ('3dldf', '1386961236'),
        'https://ftp.gnu.org/gnu/a2ps/': ('a2ps', '1198900505'),
        'https://ftp.gnu.org/old-gnu/xshogi/': ('xshogi', '1059822922'),
    }
    for url, (name, time_modified) in expected_projects.items():
        assert tree_json.projects[url] == {
            'name': name,
            'time_modified': time_modified,
            'url': url,
        }

    assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
        {
            'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz',  # noqa
            'length': 90106,
            'time': 857980800
        },
        {
            'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz',  # noqa
            'length': 89625,
            'time': 860396400
        }
    ]
def test_tree_json_failures(requests_mock_datadir):
    """Property access propagates the query failure as a ValueError."""
    url = 'https://unknown/tree.json.gz'
    tree_json = GNUTree(url)
    expected_message = 'Error during query to %s' % url

    with pytest.raises(ValueError, match=expected_message):
        tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/']

    with pytest.raises(ValueError, match=expected_message):
        tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/']
def test_find_artifacts_small_sample():
    """find_artifacts lists all tarballs of a small tree sample."""
    expected_tarballs = [
        {
            'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
            'time': 1495205979,
            'length': 424081,
        },
        {
            'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip',  # noqa
            'time': 898422900,
            'length': 1514448
        },
        {
            'archive': '/root/xboard/xboard-3.6.2.tar.gz',  # noqa
            'time': 869814000,
            'length': 450164,
        },
        {
            'archive': '/root/xboard/xboard-4.0.0.tar.gz',  # noqa
            'time': 898422900,
            'length': 514951,
        },
    ]
    # Use a context manager so the fixture file is closed deterministically
    # (json.load(open(...)) leaked the file handle).
    with open('swh/lister/gnu/tests/tree.min.json') as f:
        file_structure = json.load(f)
    actual_tarballs = find_artifacts(file_structure, '/root/')
    assert actual_tarballs == expected_tarballs
def test_find_artifacts():
    """find_artifacts lists the expected number of archives in the tree."""
    # Use a context manager so the fixture file is closed deterministically
    # (json.load(open(...)) leaked the file handle).
    with open('swh/lister/gnu/tests/tree.json') as f:
        file_structure = json.load(f)
    actual_tarballs = find_artifacts(file_structure, '/root/')
    assert len(actual_tarballs) == 42 + 3  # tar + zip
def test_check_filename_is_archive():
    """Archive-looking names are accepted, anything else is rejected."""
    archives = ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']
    non_archives = ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']
    for filename in archives:
        assert check_filename_is_archive(filename) is True
    for filename in non_archives:
        assert check_filename_is_archive(filename) is False

187
swh/lister/gnu/tree.py Normal file
View file

@ -0,0 +1,187 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import gzip
import json
import logging
import requests
from pathlib import Path
from typing import Dict, Tuple, List
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
def load_raw_data(url: str) -> List[Dict]:
    """Fetch and decode the content of a tree.json.gz.

    Args:
        url: http(s) url or local path of the tree.json.gz file

    Returns:
        The decoded json as a list of dicts

    Raises:
        ValueError: when the http(s) query fails

    """
    if url.startswith(('http://', 'https://')):
        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        raw = gzip.decompress(response.content)
    else:
        with gzip.open(url, 'r') as f:
            raw = f.read()
    return json.loads(raw.decode('utf-8'))
class GNUTree:
    """In-memory representation of the gnu mirror's tree.json listing.

    Projects and their artifacts are computed lazily, on first access to
    either the `projects` or `artifacts` property.
    """
    def __init__(self, url: str):
        self.url = url  # filepath or uri
        u = urlparse(url)
        self.base_url = '%s://%s' % (u.scheme, u.netloc)
        # Interesting top level directories
        self.top_level_directories = ['gnu', 'old-gnu']
        # internal state, filled on first property access
        self._artifacts = {}  # type: Dict
        self._projects = {}  # type: Dict

    @property
    def projects(self) -> Dict:
        # Lazy load: both caches are populated together.
        if not self._projects:
            self._projects, self._artifacts = self._load()
        return self._projects

    @property
    def artifacts(self) -> Dict:
        # Lazy load: both caches are populated together.
        if not self._artifacts:
            self._projects, self._artifacts = self._load()
        return self._artifacts

    def _load(self) -> Tuple[Dict, Dict]:
        """Compute projects and artifacts per project.

        Returns:
            Tuple of a projects dict (key: project url, value: associated
            information) and an artifacts dict (key: project url, value:
            list of artifact info dicts)

        """
        projects = {}
        artifacts = {}

        root = load_raw_data(self.url)[0]
        for directory in root['contents']:
            if directory['name'] not in self.top_level_directories:
                continue
            for entry in directory['contents']:
                # Only sub-directories represent packages.
                if entry['type'] != 'directory':
                    continue
                package_url = '%s/%s/%s/' % (
                    self.base_url, directory['name'], entry['name'])
                package_artifacts = find_artifacts(
                    entry['contents'], package_url)
                # Keep only packages that actually ship artifacts.
                if package_artifacts:
                    artifacts[package_url] = package_artifacts
                    projects[package_url] = {
                        'name': entry['name'],
                        'url': package_url,
                        'time_modified': entry['time'],
                    }

        return projects, artifacts
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
    """Recursively collect the artifacts under a package directory.

    Args:
        filesystem: file structure of the package root directory, as a
            list of dicts describing a file or directory (keys: name,
            size, time, type).
        url: URL of the corresponding package

    Returns
        List of artifact urls with their associated metadata
        (time, length). For example:

        .. code-block:: python

            [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
                ...
            ]

    """
    found = []
    for entry in filesystem:
        entry_name = entry['name']
        entry_type = entry['type']
        if entry_type == 'file' and check_filename_is_archive(entry_name):
            found.append({
                'archive': url + entry_name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
        elif entry_type == 'directory':
            # descend into sub-folders and accumulate their artifacts
            found.extend(
                find_artifacts(entry['contents'], url + entry_name + '/'))
    return found
def check_filename_is_archive(filename: str) -> bool:
    """
    Check for the extension of the file, if the file is of zip format of
    .tar.x format, where x could be anything, then returns true.

    Args:
        filename: name of the file for which the extensions is needs to
            be checked.

    Returns:
        Whether filename is an archive or not

    Example:
        >>> check_filename_is_archive('abc.zip')
        True
        >>> check_filename_is_archive('abc.tar.gz')
        True
        >>> check_filename_is_archive('bac.tar')
        True
        >>> check_filename_is_archive('abc.tar.gz.sig')
        False
        >>> check_filename_is_archive('foobar.tar.')
        False
    """
    file_suffixes = Path(filename).suffixes
    # Bug fix: pass lazy %-style arguments instead of pre-formatting with
    # '%' so the message is only built when debug logging is enabled.
    logger.debug('Path(%s).suffixed: %s', filename, file_suffixes)
    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
        return True
    elif len(file_suffixes) > 1:
        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
            return True
    return False