lister.gnu: Move version parsing logic to the lister

Related D2145 proposition
This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-10-16 13:23:16 +02:00
parent 7c247c8a4a
commit 821f3f1cc2
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
4 changed files with 233 additions and 46 deletions

View file

@ -33,22 +33,23 @@ class GNULister(SimpleLister):
.. code-block:: python
args: ['https://ftp.gnu.org/gnu/3dldf/']
args:
kwargs: {
'tarballs': [{
'archive': 'https://...',
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'artifacts': [{
'url': 'https://...',
'time': 1071002600,
'length': 128},
...
]}
"""
tarballs = self.gnu_tree.artifacts[origin_url]
artifacts = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
origin_url,
tarballs=tarballs)
url=origin_url,
artifacts=artifacts)
def safely_issue_request(self, identifier):
"""Bypass the implementation. It's now the GNUTree which deals with

View file

@ -9,7 +9,7 @@ import logging
logger = logging.getLogger(__name__)
def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
def test_gnu_lister(swh_listers, requests_mock_datadir):
lister = swh_listers['gnu']
lister.run()
@ -21,21 +21,23 @@ def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
assert row['type'] == 'load-tar'
# arguments check
args = row['arguments']['args']
assert len(args) == 1
assert len(args) == 0
url = args[0]
# kwargs
kwargs = row['arguments']['kwargs']
assert set(kwargs.keys()) == {'url', 'artifacts'}
url = kwargs['url']
assert url.startswith('https://ftp.gnu.org')
url_suffix = url.split('https://ftp.gnu.org')[1]
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
# kwargs
kwargs = row['arguments']['kwargs']
assert list(kwargs.keys()) == ['tarballs']
tarballs = kwargs['tarballs']
# check the tarball's structure
tarball = tarballs[0]
assert set(tarball.keys()) == set(['archive', 'length', 'time'])
artifacts = kwargs['artifacts']
# check the artifact's structure
artifact = artifacts[0]
assert set(artifact.keys()) == {
'url', 'length', 'time', 'filename', 'version'
}
assert row['policy'] == 'oneshot'

View file

@ -9,7 +9,8 @@ import pytest
from os import path
from swh.lister.gnu.tree import (
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
get_version
)
@ -69,14 +70,18 @@ def test_tree_json(requests_mock_datadir):
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
{
'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'length': 90106,
'time': 857980800
'time': 857980800,
'filename': 'zlibc-0.9b.tar.gz',
'version': '0.9b',
},
{
'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'length': 89625,
'time': 860396400
'time': 860396400,
'filename': 'zlibc-0.9e.tar.gz',
'version': '0.9e',
}
]
@ -93,38 +98,46 @@ def test_tree_json_failures(requests_mock_datadir):
def test_find_artifacts_small_sample(datadir):
expected_tarballs = [
expected_artifacts = [
{
'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
'url': '/root/artanis/artanis-0.2.1.tar.bz2',
'time': 1495205979,
'length': 424081,
'version': '0.2.1',
'filename': 'artanis-0.2.1.tar.bz2',
},
{
'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'time': 898422900,
'length': 1514448
'length': 1514448,
'version': '4_0_0-src',
'filename': 'winboard-4_0_0-src.zip',
},
{
'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'time': 869814000,
'length': 450164,
'version': '3.6.2',
'filename': 'xboard-3.6.2.tar.gz',
},
{
'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'time': 898422900,
'length': 514951,
'version': '4.0.0',
'filename': 'xboard-4.0.0.tar.gz',
},
]
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
actual_tarballs = find_artifacts(file_structure, '/root/')
assert actual_tarballs == expected_tarballs
actual_artifacts = find_artifacts(file_structure, '/root/')
assert actual_artifacts == expected_artifacts
def test_find_artifacts(datadir):
file_structure = json.load(open(path.join(datadir, 'tree.json')))
actual_tarballs = find_artifacts(file_structure, '/root/')
assert len(actual_tarballs) == 42 + 3 # tar + zip
actual_artifacts = find_artifacts(file_structure, '/root/')
assert len(actual_artifacts) == 42 + 3 # tar + zip
def test_check_filename_is_archive():
@ -133,3 +146,61 @@ def test_check_filename_is_archive():
for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
assert check_filename_is_archive(ext) is False
def test_get_version():
    """Check the version extracted out of various artifact urls/filenames

    Each key below is either a full url or a bare filename; the associated
    value is the version string get_version is expected to return for it.

    """
    expected_versions = {
        'https://gnu.org/sthg/info-2.1.0.tar.gz': '2.1.0',
        'https://gnu.org/sthg/info-2.1.2.zip': '2.1.2',
        'https://sthg.org/gnu/sthg.tar.gz': 'sthg',
        'https://sthg.org/gnu/DLDF-1.1.4.tar.gz': '1.1.4',
        'https://sthg.org/gnu/anubis-latest.tar.bz2': 'latest',
        'https://ftp.org/gnu/aris-w32.zip': 'w32',
        'https://ftp.org/gnu/aris-w32-2.2.zip': 'w32-2.2',
        'https://ftp.org/gnu/autogen.info.tar.gz': 'autogen.info',
        'https://ftp.org/gnu/crypto-build-demo.tar.gz':
        'crypto-build-demo',
        'https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz':
        'clue+clio+xit.clisp',
        'https://ftp.org/gnu/clue+clio.for-pcl.tar.gz':
        'clue+clio.for-pcl',
        'https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz':
        'hppa2.0-hp-hpux10.20',
        'clisp-i386-solaris2.6.tar.gz': 'i386-solaris2.6',
        'clisp-mips-sgi-irix6.5.tar.gz': 'mips-sgi-irix6.5',
        'clisp-powerpc-apple-macos.tar.gz': 'powerpc-apple-macos',
        'clisp-powerpc-unknown-linuxlibc6.tar.gz':
        'powerpc-unknown-linuxlibc6',
        'clisp-rs6000-ibm-aix3.2.5.tar.gz': 'rs6000-ibm-aix3.2.5',
        'clisp-sparc-redhat51-linux.tar.gz': 'sparc-redhat51-linux',
        'clisp-sparc-sun-solaris2.4.tar.gz': 'sparc-sun-solaris2.4',
        'clisp-sparc-sun-sunos4.1.3_U1.tar.gz':
        'sparc-sun-sunos4.1.3_U1',
        'clisp-2.25.1-powerpc-apple-MacOSX.tar.gz':
        '2.25.1-powerpc-apple-MacOSX',
        'clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz':
        '2.27-PowerMacintosh-powerpc-Darwin-1.3.7',
        'clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz':
        '2.27-i686-unknown-Linux-2.2.19',
        'clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz':
        '2.28-i386-i386-freebsd-4.3-RELEASE',
        'clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz':
        '2.28-i686-unknown-cygwin_me-4.90-1.3.10',
        'clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz':
        '2.29-i386-i386-freebsd-4.6-STABLE',
        'clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz':
        '2.29-i686-unknown-cygwin_nt-5.0-1.3.12',
        'gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip':
        '2.5.3-ansi-japi-xdr.20030701_mingw32',
        'gettext-runtime-0.13.1.bin.woe32.zip': '0.13.1.bin.woe32',
        'sather-logo_images.tar.gz': 'sather-logo_images',
        'sather-specification-000328.html.tar.gz': '000328.html',
    }

    for uri, expected_version in expected_versions.items():
        actual_version = get_version(uri)
        assert actual_version == expected_version

View file

@ -7,15 +7,112 @@ import gzip
import json
import logging
import requests
import re
from os import path
from pathlib import Path
from typing import Dict, Tuple, List
from typing import Any, Dict, List, Mapping, Tuple
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
# to recognize existing naming pattern
extensions = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
]

# Platform/flavour keywords which are accepted as (part of) a release
# number even though they do not start with a digit.
version_keywords = [
    'cygwin_me',
    'w32', 'win32', 'nt', 'cygwin', 'mingw',
    'latest', 'alpha', 'beta',
    'release', 'stable',
    'hppa',
    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
    'aix', 'ibm', 'rs6000',
    'i386', 'i686',
    'linux', 'redhat', 'linuxlibc',
    'mips',
    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
    'unknown',
    'netbsd', 'freebsd',
    'sgi', 'irix',
]

# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
pattern = r'''
^
(?:
    # We have a software name and a release number, separated with a
    # -, _ or dot.
    (?P<software_name1>.+?[-_.])
    (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
    # We couldn't match a release number, put everything in the
    # software name.
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
    'extensions': '|'.join(extensions),
    'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}

# Compiled once at import time so get_version does not hand the large
# verbose pattern back to the re module on every call.
_VERSION_RE = re.compile(pattern, flags=re.VERBOSE | re.IGNORECASE)


def get_version(uri: str) -> str:
    """Extract the version from a tarball uri (or bare filename)

    Only the last '/'-separated component of the uri is considered. It is
    matched against the module-level filename pattern, case-insensitively.

    Args:
        uri (str): Tarball URI or filename

    Returns:
        The release number when the filename splits into a software name
        and a release number; otherwise the filename stripped of its
        archive extension(s); or the empty string when the filename does
        not end with a known archive extension.

    Example:
        >>> get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz')
        '0.2.0'

        >>> get_version('8sync-0.3.0.tar.gz')
        '0.3.0'

    """
    # URIs always use '/' as separator, whatever the platform running this
    # code, so split on it explicitly rather than through os.path.
    filename = uri.rsplit('/', 1)[-1]
    m = _VERSION_RE.match(filename)
    if m is None:
        return ''

    d = m.groupdict()
    if d['software_name1'] and d['release_number']:
        return d['release_number']
    if d['software_name2']:
        # No release number could be detected, fallback to the name
        return d['software_name2']
    return ''
def load_raw_data(url: str) -> List[Dict]:
"""Load the raw json from the tree.json.gz
@ -99,7 +196,8 @@ class GNUTree:
return projects, artifacts
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
def find_artifacts(
filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]:
"""Recursively list artifacts present in the folder and subfolders for a
particular package url.
@ -111,21 +209,33 @@ def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
url: URL of the corresponding package
Returns
List of tarball urls and their associated metadata (time, length).
For example:
List of tarball urls and their associated metadata (time, length,
etc...). For example:
.. code-block:: python
[
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
'time': 1071002600,
'length': 543},
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
'time': 1071078759,
'length': 456},
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
'time': 1074278633,
'length': 251},
{
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
'time': 1071002600,
'filename': '3DLDF-1.1.3.tar.gz',
'version': '1.1.3',
'length': 543
},
{
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
'time': 1071078759,
'filename': '3DLDF-1.1.4.tar.gz',
'version': '1.1.4',
'length': 456
},
{
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
'time': 1074278633,
'filename': '3DLDF-1.1.5.tar.gz',
'version': '1.1.5',
'length': 251
},
...
]
@ -136,10 +246,13 @@ def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
filename = info_file['name']
if filetype == 'file':
if check_filename_is_archive(filename):
uri = url + filename
artifacts.append({
'archive': url + filename,
'url': uri,
'filename': filename,
'time': int(info_file['time']),
'length': int(info_file['size']),
'version': get_version(filename),
})
# It will recursively check for artifacts in all sub-folders
elif filetype == 'directory':