gnu.tests: Checks lister output from scheduler
This also adds a swh-listers fixture which allows retrieving a test-ready lister by name (e.g. gnu). Those listers have access to a scheduler fixture so we can check the listing output from the scheduler instance.
This commit is contained in:
parent
394658e53b
commit
0f0b840178
5 changed files with 404 additions and 215 deletions
|
@ -2,118 +2,89 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import random
|
||||
import gzip
|
||||
import json
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
from .models import GNUModel
|
||||
import logging
|
||||
|
||||
from swh.scheduler import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
|
||||
from swh.lister.gnu.models import GNUModel
|
||||
from swh.lister.gnu.tree import GNUTree
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GNULister(SimpleLister):
|
||||
MODEL = GNUModel
|
||||
LISTER_NAME = 'gnu'
|
||||
TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
|
||||
BASE_URL = 'https://ftp.gnu.org'
|
||||
instance = 'gnu'
|
||||
tarballs = defaultdict(dict) # Dict of key with project name value the
|
||||
# associated is list of tarballs of package to ingest from the gnu mirror
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
|
||||
|
||||
def task_dict(self, origin_type, origin_url, **kwargs):
|
||||
"""
|
||||
Return task format dict
|
||||
"""Return task format dict
|
||||
|
||||
This is overridden from the lister_base as more information is
|
||||
needed for the ingestion task creation.
|
||||
|
||||
This creates tasks with args and kwargs set, for example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
args: ['https://ftp.gnu.org/gnu/3dldf/']
|
||||
kwargs: {
|
||||
'tarballs': [{
|
||||
'archive': 'https://...',
|
||||
'time': 1071002600,
|
||||
'length': 128},
|
||||
...
|
||||
]}
|
||||
|
||||
"""
|
||||
tarballs = self.gnu_tree.artifacts[origin_url]
|
||||
return utils.create_task_dict(
|
||||
'load-%s' % origin_type,
|
||||
kwargs.get('policy', 'oneshot'),
|
||||
kwargs.get('name'),
|
||||
origin_url,
|
||||
tarballs=self.tarballs[kwargs.get('name')])
|
||||
tarballs=tarballs)
|
||||
|
||||
def safely_issue_request(self, identifier):
|
||||
'''
|
||||
Download and unzip tree.json.gz file and returns its content
|
||||
in JSON format
|
||||
"""Bypass the implementation. It's now the GNUTree which deals with
|
||||
querying the gnu mirror.
|
||||
|
||||
File content in dictionary format
|
||||
As an implementation detail, we cannot change simply the base
|
||||
SimpleLister as other implementation still uses it. This shall be part
|
||||
of another refactoring pass.
|
||||
|
||||
Args:
|
||||
identifier: resource identifier (unused)
|
||||
|
||||
Returns:
|
||||
Server response
|
||||
|
||||
'''
|
||||
response = requests.get(self.TREE_URL,
|
||||
allow_redirects=True)
|
||||
uncompressed_content = gzip.decompress(response.content)
|
||||
return json.loads(uncompressed_content.decode('utf-8'))
|
||||
"""
|
||||
return None
|
||||
|
||||
def list_packages(self, response):
|
||||
"""
|
||||
List the actual gnu origins with their names,url and the list
|
||||
of all the tarball for a package from the response.
|
||||
"""List the actual gnu origins (package name) with their name, url and
|
||||
associated tarballs.
|
||||
|
||||
Args:
|
||||
response : File structure of the website
|
||||
in dictionary format
|
||||
response: Unused
|
||||
|
||||
Returns:
|
||||
A list of all the packages with their names, url of their root
|
||||
directory and the tarballs present for the particular package.
|
||||
[
|
||||
{'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
|
||||
'tarballs':
|
||||
[
|
||||
{'archive':
|
||||
'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
|
||||
'date': '1071002600'},
|
||||
{'archive':
|
||||
'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
|
||||
'date': '1071078759'}}
|
||||
]
|
||||
},
|
||||
{'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
|
||||
'tarballs':
|
||||
[
|
||||
{'archive':
|
||||
'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
|
||||
'date': '1461357336'},
|
||||
{'archive':
|
||||
'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
|
||||
'date': '1480991830'}
|
||||
]
|
||||
]
|
||||
List of packages name, url, last modification time
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
[
|
||||
{'name': '3dldf',
|
||||
'url': 'https://ftp.gnu.org/gnu/3dldf/',
|
||||
'time_modified': 1071002600},
|
||||
{'name': '8sync',
|
||||
'url': 'https://ftp.gnu.org/gnu/8sync/',
|
||||
'time_modified': 1480991830},
|
||||
...
|
||||
]
|
||||
|
||||
"""
|
||||
response = filter_directories(response)
|
||||
packages = []
|
||||
for directory in response:
|
||||
content = directory['contents']
|
||||
for repo in content:
|
||||
if repo['type'] == 'directory':
|
||||
package_url = '%s/%s/%s/' % (self.BASE_URL,
|
||||
directory['name'],
|
||||
repo['name'])
|
||||
package_tarballs = find_tarballs(
|
||||
repo['contents'], package_url)
|
||||
if package_tarballs != []:
|
||||
repo_details = {
|
||||
'name': repo['name'],
|
||||
'url': package_url,
|
||||
'time_modified': repo['time'],
|
||||
}
|
||||
self.tarballs[repo['name']] = package_tarballs
|
||||
packages.append(repo_details)
|
||||
random.shuffle(packages)
|
||||
return packages
|
||||
return list(self.gnu_tree.projects.values())
|
||||
|
||||
def get_model_from_repo(self, repo):
|
||||
"""Transform from repository representation to model
|
||||
|
@ -128,89 +99,3 @@ class GNULister(SimpleLister):
|
|||
'time_last_updated': int(repo['time_modified']),
|
||||
'origin_type': 'tar',
|
||||
}
|
||||
|
||||
|
||||
def find_tarballs(package_file_structure, url):
    '''Recursively list the tarballs present in a package folder and all
    of its sub-folders.

    Args
        package_file_structure: File structure of the package root directory
        url: URL of the corresponding package

    Returns
        List of tarball urls and their associated metadata (time, length).
        For example:

        [
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
             'time': 1071002600,
             'length': 543},
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
             'time': 1071078759,
             'length': 456},
            ...
        ]

    '''
    found = []
    for entry in package_file_structure:
        name = entry['name']
        if entry['type'] == 'directory':
            # Recurse into the sub-folder, extending the base url with
            # the folder name
            found.extend(
                find_tarballs(entry['contents'], url + name + '/'))
        elif entry['type'] == 'file' and file_extension_check(name):
            found.append({
                'archive': url + name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
    return found
|
||||
|
||||
|
||||
def filter_directories(response):
    '''Keep only the gnu and old-gnu top-level folders from the JSON
    tree listing.
    '''
    top_level = response[0]['contents']
    return [entry for entry in top_level
            if entry['name'] in ('gnu', 'old-gnu')]
|
||||
|
||||
|
||||
def file_extension_check(file_name):
    '''Check whether file_name looks like an archive: a .zip file, or a
    .tar.x file where x could be anything.

    Args:
        file_name : name of the file for which the extensions is needs to
            be checked.

    Returns:
        True or False

    example
        file_extension_check('abc.zip') will return True
        file_extension_check('abc.tar.gz') will return True
        file_extension_check('abc.tar.gz.sig') will return False

    '''
    suffixes = Path(file_name).suffixes
    if not suffixes:
        return False
    if len(suffixes) == 1:
        # A single extension: only plain zip archives qualify
        return suffixes[-1] == '.zip'
    # Several extensions: either a zip, or a compressed tarball
    # (the '.tar' sits right before the compression suffix)
    return suffixes[-1] == '.zip' or suffixes[-2] == '.tar'
|
||||
|
|
BIN
swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
Normal file
BIN
swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
Normal file
Binary file not shown.
|
@ -1,59 +1,41 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# Copyright (C) 2019 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import json
|
||||
|
||||
from swh.lister.gnu.lister import find_tarballs, filter_directories
|
||||
from swh.lister.gnu.lister import file_extension_check
|
||||
import logging
|
||||
|
||||
|
||||
def test_filter_directories():
    """filter_directories must keep only the gnu and old-gnu folders.

    Fixes: the file handle was opened without ever being closed
    (ResourceWarning under pytest); the loop used an opaque
    `if ...: assert False` instead of asserting the condition directly.
    """
    with open('swh/lister/gnu/tests/api_response.json') as f:
        api_response = json.load(f)
    cleared_api_response = filter_directories(api_response)
    for directory in cleared_api_response:
        assert directory['name'] in ('gnu', 'old-gnu')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def test_find_tarballs_small_sample():
|
||||
expected_tarballs = [
|
||||
{
|
||||
'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
|
||||
'time': 1495205979,
|
||||
'length': 424081,
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
|
||||
'time': 898422900,
|
||||
'length': 1514448
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
|
||||
'time': 869814000,
|
||||
'length': 450164,
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
|
||||
'time': 898422900,
|
||||
'length': 514951,
|
||||
},
|
||||
]
|
||||
def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
|
||||
lister = swh_listers['gnu']
|
||||
|
||||
file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
|
||||
actual_tarballs = find_tarballs(file_structure, '/root/')
|
||||
assert actual_tarballs == expected_tarballs
|
||||
lister.run()
|
||||
|
||||
r = lister.scheduler.search_tasks(task_type='load-tar')
|
||||
assert len(r) == 383
|
||||
|
||||
def test_find_tarballs():
|
||||
file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
|
||||
actual_tarballs = find_tarballs(file_structure, '/root/')
|
||||
assert len(actual_tarballs) == 42 + 3 # tar + zip
|
||||
for row in r:
|
||||
assert row['type'] == 'load-tar'
|
||||
# arguments check
|
||||
args = row['arguments']['args']
|
||||
assert len(args) == 1
|
||||
|
||||
url = args[0]
|
||||
assert url.startswith('https://ftp.gnu.org')
|
||||
|
||||
def test_file_extension_check():
|
||||
assert file_extension_check('abc.xy.zip')
|
||||
assert file_extension_check('cvb.zip')
|
||||
assert file_extension_check('abc.tar.bz2')
|
||||
assert file_extension_check('abc') is False
|
||||
url_suffix = url.split('https://ftp.gnu.org')[1]
|
||||
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
|
||||
|
||||
# kwargs
|
||||
kwargs = row['arguments']['kwargs']
|
||||
assert list(kwargs.keys()) == ['tarballs']
|
||||
|
||||
tarballs = kwargs['tarballs']
|
||||
# check the tarball's structure
|
||||
tarball = tarballs[0]
|
||||
assert set(tarball.keys()) == set(['archive', 'length', 'time'])
|
||||
|
||||
assert row['policy'] == 'oneshot'
|
||||
|
|
135
swh/lister/gnu/tests/test_tree.py
Normal file
135
swh/lister/gnu/tests/test_tree.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
# Copyright (C) 2019 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from os import path
|
||||
from swh.lister.gnu.tree import (
|
||||
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
|
||||
)
|
||||
|
||||
|
||||
def test_load_raw_data_from_query(requests_mock_datadir):
    """Loading from a mocked http url yields the decoded json list."""
    data = load_raw_data('https://ftp.gnu.org/tree.json.gz')
    assert data is not None
    assert isinstance(data, list)
    assert len(data) == 2
|
||||
|
||||
|
||||
def test_load_raw_data_from_query_failure(requests_mock_datadir):
    """A failing http query must surface as a ValueError."""
    url = 'https://ftp2.gnu.org/tree.unknown.gz'
    with pytest.raises(ValueError, match='Error during query'):
        load_raw_data(url)
|
||||
|
||||
|
||||
def test_load_raw_data_from_file(datadir):
    """Loading from a local gzipped file yields the decoded json list."""
    filepath = path.join(datadir, 'ftp.gnu.org', 'tree.json.gz')
    data = load_raw_data(filepath)
    assert data is not None
    assert isinstance(data, list)
    assert len(data) == 2
|
||||
|
||||
|
||||
def test_load_raw_data_from_file_failure(datadir):
    """A missing local file must surface as FileNotFoundError."""
    missing_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')
    with pytest.raises(FileNotFoundError):
        load_raw_data(missing_path)
|
||||
|
||||
|
||||
def test_tree_json(requests_mock_datadir):
    """The parsed tree exposes the expected projects and artifacts."""
    tree = GNUTree('https://ftp.gnu.org/tree.json.gz')

    # Each known project url maps to a (name, time_modified) pair
    expected_projects = {
        'https://ftp.gnu.org/gnu/8sync/': ('8sync', '1489817408'),
        'https://ftp.gnu.org/gnu/3dldf/': ('3dldf', '1386961236'),
        'https://ftp.gnu.org/gnu/a2ps/': ('a2ps', '1198900505'),
        'https://ftp.gnu.org/old-gnu/xshogi/': ('xshogi', '1059822922'),
    }
    for project_url, (name, mtime) in expected_projects.items():
        assert tree.projects[project_url] == {
            'name': name,
            'time_modified': mtime,
            'url': project_url,
        }

    assert tree.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
        {
            'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz',  # noqa
            'length': 90106,
            'time': 857980800
        },
        {
            'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz',  # noqa
            'length': 89625,
            'time': 860396400
        }
    ]
|
||||
|
||||
|
||||
def test_tree_json_failures(requests_mock_datadir):
    """Both lazy properties must propagate the query failure."""
    url = 'https://unknown/tree.json.gz'
    tree = GNUTree(url)

    with pytest.raises(ValueError, match='Error during query to %s' % url):
        tree.artifacts['https://ftp.gnu.org/gnu/3dldf/']

    with pytest.raises(ValueError, match='Error during query to %s' % url):
        tree.projects['https://ftp.gnu.org/old-gnu/xshogi/']
|
||||
|
||||
|
||||
def test_find_artifacts_small_sample():
    """find_artifacts on the minimal tree returns the known archives.

    Fixes: `json.load(open(...))` leaked the file handle
    (ResourceWarning under pytest); use a `with` block instead.
    """
    expected_tarballs = [
        {
            'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
            'time': 1495205979,
            'length': 424081,
        },
        {
            'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip',  # noqa
            'time': 898422900,
            'length': 1514448
        },
        {
            'archive': '/root/xboard/xboard-3.6.2.tar.gz',  # noqa
            'time': 869814000,
            'length': 450164,
        },
        {
            'archive': '/root/xboard/xboard-4.0.0.tar.gz',  # noqa
            'time': 898422900,
            'length': 514951,
        },
    ]

    with open('swh/lister/gnu/tests/tree.min.json') as f:
        file_structure = json.load(f)
    actual_tarballs = find_artifacts(file_structure, '/root/')
    assert actual_tarballs == expected_tarballs
|
||||
|
||||
|
||||
def test_find_artifacts():
    """find_artifacts on the full tree returns all tar and zip archives.

    Fixes: `json.load(open(...))` leaked the file handle
    (ResourceWarning under pytest); use a `with` block instead.
    """
    with open('swh/lister/gnu/tests/tree.json') as f:
        file_structure = json.load(f)
    actual_tarballs = find_artifacts(file_structure, '/root/')
    assert len(actual_tarballs) == 42 + 3  # tar + zip
|
||||
|
||||
|
||||
def test_check_filename_is_archive():
    """Archive-looking names are accepted, derived files rejected."""
    archives = ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']
    non_archives = ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']

    for filename in archives:
        assert check_filename_is_archive(filename) is True

    for filename in non_archives:
        assert check_filename_is_archive(filename) is False
|
187
swh/lister/gnu/tree.py
Normal file
187
swh/lister/gnu/tree.py
Normal file
|
@ -0,0 +1,187 @@
|
|||
# Copyright (C) 2019 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Tuple, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_raw_data(url: str) -> List[Dict]:
    """Fetch and decode the gzipped tree.json content.

    Args:
        url: location of the tree.json.gz file, either a local
            filesystem path or an http(s) url

    Returns:
        The decoded json content as a list of dicts

    """
    if url.startswith('http://') or url.startswith('https://'):
        # Remote location: download, then decompress the payload
        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        raw = gzip.decompress(response.content)
    else:
        # Local file: gzip handles the decompression directly
        with gzip.open(url, 'r') as f:
            raw = f.read()
    return json.loads(raw.decode('utf-8'))
|
||||
|
||||
|
||||
class GNUTree:
    """In-memory view of the gnu mirror's tree.json listing.

    Projects and artifacts are computed lazily, on first access to
    either the `projects` or the `artifacts` property.

    """
    def __init__(self, url: str):
        self.url = url  # filepath or uri
        u = urlparse(url)
        self.base_url = '%s://%s' % (u.scheme, u.netloc)
        # Interesting top level directories
        self.top_level_directories = ['gnu', 'old-gnu']
        # internal state, populated by _load() on demand
        self._artifacts = {}  # type: Dict
        self._projects = {}  # type: Dict

    @property
    def projects(self) -> Dict:
        """Mapping: project url -> project information dict."""
        if not self._projects:
            self._projects, self._artifacts = self._load()
        return self._projects

    @property
    def artifacts(self) -> Dict:
        """Mapping: project url -> list of artifact dicts."""
        if not self._artifacts:
            self._projects, self._artifacts = self._load()
        return self._artifacts

    def _load(self) -> Tuple[Dict, Dict]:
        """Walk the raw tree and index projects and their artifacts.

        Returns:
            Tuple of dict projects (key project url, value the associated
            information) and a dict artifacts (key project url, value the
            info_file list)

        """
        projects = {}
        artifacts = {}

        root = load_raw_data(self.url)[0]
        # Only the configured top-level folders are of interest
        relevant = [d for d in root['contents']
                    if d['name'] in self.top_level_directories]
        for directory in relevant:
            for entry in directory['contents']:
                if entry['type'] != 'directory':
                    continue
                package_url = '%s/%s/%s/' % (
                    self.base_url, directory['name'], entry['name'])
                found = find_artifacts(entry['contents'], package_url)
                # Projects without any downloadable artifact are skipped
                if found != []:
                    artifacts[package_url] = found
                    projects[package_url] = {
                        'name': entry['name'],
                        'url': package_url,
                        'time_modified': entry['time'],
                    }

        return projects, artifacts
|
||||
|
||||
|
||||
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
    """Recursively collect the downloadable archives below a package
    directory.

    Args:

        filesystem: File structure of the package root directory. This is a
            list of Dict representing either file or directory information as
            dict (keys: name, size, time, type).
        url: URL of the corresponding package

    Returns
        List of tarball urls and their associated metadata (time, length).
        For example:

        .. code-block:: python

            [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
                 'time': 1071078759,
                 'length': 456},
                ...
            ]

    """
    found = []
    for entry in filesystem:
        name = entry['name']
        if entry['type'] == 'directory':
            # Recurse into the sub-folder, extending the base url with
            # the folder name
            found.extend(
                find_artifacts(entry['contents'], url + name + '/'))
        elif entry['type'] == 'file' and check_filename_is_archive(name):
            found.append({
                'archive': url + name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
    return found
|
||||
|
||||
|
||||
def check_filename_is_archive(filename: str) -> bool:
    """Check whether filename is an archive: a .zip file, or a .tar.x
    file where x could be any compression extension.

    Fixes: the debug log eagerly built its message with ``%`` even when
    DEBUG logging is disabled; use the logger's lazy %-args instead.
    The boolean logic is also flattened into guard clauses.

    Args:
        filename: name of the file for which the extensions is needs to
            be checked.

    Returns:
        Whether filename is an archive or not

    Example:

        >>> check_filename_is_archive('abc.zip')
        True
        >>> check_filename_is_archive('abc.tar.gz')
        True
        >>> check_filename_is_archive('bac.tar')
        True
        >>> check_filename_is_archive('abc.tar.gz.sig')
        False
        >>> check_filename_is_archive('foobar.tar.')
        False

    """
    file_suffixes = Path(filename).suffixes
    # Lazy %-args: the message is only rendered when DEBUG is enabled
    logging.getLogger(__name__).debug(
        'Path(%s).suffixed: %s', filename, file_suffixes)
    if not file_suffixes:
        return False
    if file_suffixes[-1] == '.zip':
        return True
    # '.tar' must be the last suffix (bare tarball) or the one right
    # before the compression extension (e.g. '.tar.gz')
    tar_position = -1 if len(file_suffixes) == 1 else -2
    return file_suffixes[tar_position] == '.tar'
|
Loading…
Add table
Add a link
Reference in a new issue