lister.gnu: Move version parsing logic to the lister
Related D2145 proposition
This commit is contained in:
parent
7c247c8a4a
commit
821f3f1cc2
4 changed files with 233 additions and 46 deletions
|
@ -33,22 +33,23 @@ class GNULister(SimpleLister):
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
args: ['https://ftp.gnu.org/gnu/3dldf/']
|
||||
args:
|
||||
kwargs: {
|
||||
'tarballs': [{
|
||||
'archive': 'https://...',
|
||||
'url': 'https://ftp.gnu.org/gnu/3dldf/',
|
||||
'artifacts': [{
|
||||
'url': 'https://...',
|
||||
'time': 1071002600,
|
||||
'length': 128},
|
||||
...
|
||||
]}
|
||||
|
||||
"""
|
||||
tarballs = self.gnu_tree.artifacts[origin_url]
|
||||
artifacts = self.gnu_tree.artifacts[origin_url]
|
||||
return utils.create_task_dict(
|
||||
'load-%s' % origin_type,
|
||||
kwargs.get('policy', 'oneshot'),
|
||||
origin_url,
|
||||
tarballs=tarballs)
|
||||
url=origin_url,
|
||||
artifacts=artifacts)
|
||||
|
||||
def safely_issue_request(self, identifier):
|
||||
"""Bypass the implementation. It's now the GNUTree which deals with
|
||||
|
|
|
@ -9,7 +9,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
|
||||
def test_gnu_lister(swh_listers, requests_mock_datadir):
|
||||
lister = swh_listers['gnu']
|
||||
|
||||
lister.run()
|
||||
|
@ -21,21 +21,23 @@ def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
|
|||
assert row['type'] == 'load-tar'
|
||||
# arguments check
|
||||
args = row['arguments']['args']
|
||||
assert len(args) == 1
|
||||
assert len(args) == 0
|
||||
|
||||
url = args[0]
|
||||
# kwargs
|
||||
kwargs = row['arguments']['kwargs']
|
||||
assert set(kwargs.keys()) == {'url', 'artifacts'}
|
||||
|
||||
url = kwargs['url']
|
||||
assert url.startswith('https://ftp.gnu.org')
|
||||
|
||||
url_suffix = url.split('https://ftp.gnu.org')[1]
|
||||
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
|
||||
|
||||
# kwargs
|
||||
kwargs = row['arguments']['kwargs']
|
||||
assert list(kwargs.keys()) == ['tarballs']
|
||||
|
||||
tarballs = kwargs['tarballs']
|
||||
# check the tarball's structure
|
||||
tarball = tarballs[0]
|
||||
assert set(tarball.keys()) == set(['archive', 'length', 'time'])
|
||||
artifacts = kwargs['artifacts']
|
||||
# check the artifact's structure
|
||||
artifact = artifacts[0]
|
||||
assert set(artifact.keys()) == {
|
||||
'url', 'length', 'time', 'filename', 'version'
|
||||
}
|
||||
|
||||
assert row['policy'] == 'oneshot'
|
||||
|
|
|
@ -9,7 +9,8 @@ import pytest
|
|||
|
||||
from os import path
|
||||
from swh.lister.gnu.tree import (
|
||||
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
|
||||
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
|
||||
get_version
|
||||
)
|
||||
|
||||
|
||||
|
@ -69,14 +70,18 @@ def test_tree_json(requests_mock_datadir):
|
|||
|
||||
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
|
||||
{
|
||||
'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
|
||||
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
|
||||
'length': 90106,
|
||||
'time': 857980800
|
||||
'time': 857980800,
|
||||
'filename': 'zlibc-0.9b.tar.gz',
|
||||
'version': '0.9b',
|
||||
},
|
||||
{
|
||||
'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
|
||||
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
|
||||
'length': 89625,
|
||||
'time': 860396400
|
||||
'time': 860396400,
|
||||
'filename': 'zlibc-0.9e.tar.gz',
|
||||
'version': '0.9e',
|
||||
}
|
||||
]
|
||||
|
||||
|
@ -93,38 +98,46 @@ def test_tree_json_failures(requests_mock_datadir):
|
|||
|
||||
|
||||
def test_find_artifacts_small_sample(datadir):
|
||||
expected_tarballs = [
|
||||
expected_artifacts = [
|
||||
{
|
||||
'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
|
||||
'url': '/root/artanis/artanis-0.2.1.tar.bz2',
|
||||
'time': 1495205979,
|
||||
'length': 424081,
|
||||
'version': '0.2.1',
|
||||
'filename': 'artanis-0.2.1.tar.bz2',
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
|
||||
'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
|
||||
'time': 898422900,
|
||||
'length': 1514448
|
||||
'length': 1514448,
|
||||
'version': '4_0_0-src',
|
||||
'filename': 'winboard-4_0_0-src.zip',
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
|
||||
'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
|
||||
'time': 869814000,
|
||||
'length': 450164,
|
||||
'version': '3.6.2',
|
||||
'filename': 'xboard-3.6.2.tar.gz',
|
||||
},
|
||||
{
|
||||
'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
|
||||
'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
|
||||
'time': 898422900,
|
||||
'length': 514951,
|
||||
'version': '4.0.0',
|
||||
'filename': 'xboard-4.0.0.tar.gz',
|
||||
},
|
||||
]
|
||||
|
||||
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
|
||||
actual_tarballs = find_artifacts(file_structure, '/root/')
|
||||
assert actual_tarballs == expected_tarballs
|
||||
actual_artifacts = find_artifacts(file_structure, '/root/')
|
||||
assert actual_artifacts == expected_artifacts
|
||||
|
||||
|
||||
def test_find_artifacts(datadir):
|
||||
file_structure = json.load(open(path.join(datadir, 'tree.json')))
|
||||
actual_tarballs = find_artifacts(file_structure, '/root/')
|
||||
assert len(actual_tarballs) == 42 + 3 # tar + zip
|
||||
actual_artifacts = find_artifacts(file_structure, '/root/')
|
||||
assert len(actual_artifacts) == 42 + 3 # tar + zip
|
||||
|
||||
|
||||
def test_check_filename_is_archive():
|
||||
|
@ -133,3 +146,61 @@ def test_check_filename_is_archive():
|
|||
|
||||
for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
|
||||
assert check_filename_is_archive(ext) is False
|
||||
|
||||
|
||||
def test_get_version():
    """Check the version extracted from a tarball uri or bare filename.

    Each case pairs an input (full url or filename only) with the version
    string ``get_version`` is expected to return for it.

    """
    cases = [
        # plain numeric versions, full urls
        ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
        ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
        ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
        ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
        ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
        ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
        ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
        ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
        (
            'https://ftp.org/gnu/crypto-build-demo.tar.gz',
            'crypto-build-demo',
        ),
        (
            'https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
            'clue+clio+xit.clisp',
        ),
        (
            'https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
            'clue+clio.for-pcl',
        ),
        (
            'https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
            'hppa2.0-hp-hpux10.20',
        ),
        # bare filenames carrying platform keywords
        ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
        ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
        ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
        (
            'clisp-powerpc-unknown-linuxlibc6.tar.gz',
            'powerpc-unknown-linuxlibc6',
        ),
        ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
        ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
        ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
        (
            'clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
            'sparc-sun-sunos4.1.3_U1',
        ),
        (
            'clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
            '2.25.1-powerpc-apple-MacOSX',
        ),
        (
            'clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
            '2.27-PowerMacintosh-powerpc-Darwin-1.3.7',
        ),
        (
            'clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
            '2.27-i686-unknown-Linux-2.2.19',
        ),
        (
            'clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
            '2.28-i386-i386-freebsd-4.3-RELEASE',
        ),
        (
            'clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
            '2.28-i686-unknown-cygwin_me-4.90-1.3.10',
        ),
        (
            'clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
            '2.29-i386-i386-freebsd-4.6-STABLE',
        ),
        (
            'clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
            '2.29-i686-unknown-cygwin_nt-5.0-1.3.12',
        ),
        (
            'gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
            '2.5.3-ansi-japi-xdr.20030701_mingw32',
        ),
        ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
        ('sather-logo_images.tar.gz', 'sather-logo_images'),
        ('sather-specification-000328.html.tar.gz', '000328.html'),
    ]

    for uri, expected_version in cases:
        assert get_version(uri) == expected_version
|
||||
|
|
|
@ -7,15 +7,112 @@ import gzip
|
|||
import json
|
||||
import logging
|
||||
import requests
|
||||
import re
|
||||
|
||||
from os import path
|
||||
from pathlib import Path
|
||||
from typing import Dict, Tuple, List
|
||||
from typing import Any, Dict, List, Mapping, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# to recognize existing naming pattern
extensions = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
]

# Platform/release words that may start (or be part of) a release number
# even though they do not begin with a digit.
version_keywords = [
    'cygwin_me',
    'w32', 'win32', 'nt', 'cygwin', 'mingw',
    'latest', 'alpha', 'beta',
    'release', 'stable',
    'hppa',
    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
    'aix', 'ibm', 'rs6000',
    'i386', 'i686',
    'linux', 'redhat', 'linuxlibc',
    'mips',
    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
    'unknown',
    'netbsd', 'freebsd',
    'sgi', 'irix',
]

# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).

pattern = r'''
^
(?:
    # We have a software name and a release number, separated with a
    # -, _ or dot.
    (?P<software_name1>.+?[-_.])
    (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
    # We couldn't match a release number, put everything in the
    # software name.
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
    'extensions': '|'.join(extensions),
    'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}

# Compiled once at import time: get_version is called for every artifact
# found in the tree, so the pattern should not be re-submitted to ``re`` on
# each call.
_VERSION_REGEX = re.compile(pattern, flags=re.VERBOSE | re.IGNORECASE)


def get_version(uri: str) -> str:
    """Extract the version from a tarball uri (or bare filename).

    Only the last path component of ``uri`` is considered. It is matched
    (case-insensitively) against ``pattern``.

    Args:
        uri (str): Tarball URI or filename

    Returns:
        The release number when the filename splits into a software name
        and a release number; otherwise the whole filename stem (everything
        before the archive extensions); or the empty string when the
        filename does not end with a known archive extension.

    Example:
        >>> get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz')
        '0.2.0'

        >>> get_version('8sync-0.3.0.tar.gz')
        '0.3.0'

    """
    filename = path.split(uri)[-1]
    m = _VERSION_REGEX.match(filename)
    if m is None:
        # Not a recognised archive filename
        return ''

    d = m.groupdict()
    if d['software_name1'] and d['release_number']:
        return d['release_number']
    if d['software_name2']:
        # No release number could be isolated: fall back on the stem
        return d['software_name2']

    return ''
|
||||
|
||||
|
||||
def load_raw_data(url: str) -> List[Dict]:
|
||||
"""Load the raw json from the tree.json.gz
|
||||
|
||||
|
@ -99,7 +196,8 @@ class GNUTree:
|
|||
return projects, artifacts
|
||||
|
||||
|
||||
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
|
||||
def find_artifacts(
|
||||
filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]:
|
||||
"""Recursively list artifacts present in the folder and subfolders for a
|
||||
particular package url.
|
||||
|
||||
|
@ -111,21 +209,33 @@ def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
|
|||
url: URL of the corresponding package
|
||||
|
||||
Returns
|
||||
List of tarball urls and their associated metadata (time, length).
|
||||
For example:
|
||||
List of tarball urls and their associated metadata (time, length,
|
||||
etc...). For example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
[
|
||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
|
||||
'time': 1071002600,
|
||||
'length': 543},
|
||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
|
||||
'time': 1071078759,
|
||||
'length': 456},
|
||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
|
||||
'time': 1074278633,
|
||||
'length': 251},
|
||||
{
|
||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
|
||||
'time': 1071002600,
|
||||
'filename': '3DLDF-1.1.3.tar.gz',
|
||||
'version': '1.1.3',
|
||||
'length': 543
|
||||
},
|
||||
{
|
||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
|
||||
'time': 1071078759,
|
||||
'filename': '3DLDF-1.1.4.tar.gz',
|
||||
'version': '1.1.4',
|
||||
'length': 456
|
||||
},
|
||||
{
|
||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
|
||||
'time': 1074278633,
|
||||
'filename': '3DLDF-1.1.5.tar.gz',
|
||||
'version': '1.1.5',
|
||||
'length': 251
|
||||
},
|
||||
...
|
||||
]
|
||||
|
||||
|
@ -136,10 +246,13 @@ def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
|
|||
filename = info_file['name']
|
||||
if filetype == 'file':
|
||||
if check_filename_is_archive(filename):
|
||||
uri = url + filename
|
||||
artifacts.append({
|
||||
'archive': url + filename,
|
||||
'url': uri,
|
||||
'filename': filename,
|
||||
'time': int(info_file['time']),
|
||||
'length': int(info_file['size']),
|
||||
'version': get_version(filename),
|
||||
})
|
||||
# It will recursively check for artifacts in all sub-folders
|
||||
elif filetype == 'directory':
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue