diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 5179462..f64c4a1 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -38,10 +38,14 @@ class GNULister(SimpleLister): 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'artifacts': [{ 'url': 'https://...', - 'time': 1071002600, - 'length': 128}, + 'time': '2003-12-09T20:43:20+00:00', + 'length': 128, + 'version': '1.0.1', + 'filename': 'something-1.0.1.tar.gz', + }, ... - ]} + ] + } """ artifacts = self.gnu_tree.artifacts[origin_url] @@ -73,14 +77,13 @@ class GNULister(SimpleLister): List of packages name, url, last modification time .. code-block:: python - [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'time_modified': 1071002600}, + 'time_modified': '2003-12-09T20:43:20+00:00'}, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'time_modified': 1480991830}, + 'time_modified': '2016-12-06T02:37:10+00:00'}, ... ] diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py index bb78f85..38c47ae 100644 --- a/swh/lister/gnu/models.py +++ b/swh/lister/gnu/models.py @@ -2,7 +2,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from sqlalchemy import Column, DateTime, Integer, String +from sqlalchemy import Column, DateTime, String from ..core.models import ModelBase diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py index 0e6193b..ea25515 100644 --- a/swh/lister/gnu/tests/test_tree.py +++ b/swh/lister/gnu/tests/test_tree.py @@ -149,7 +149,9 @@ def test_check_filename_is_archive(): def test_get_version(): - """From url to branch name should yield something relevant + """Parsing version from url should yield some form of "sensible" version + + Given the dataset, extracting the version correctly is not a simple task.
""" for url, expected_branchname in [ @@ -198,7 +200,8 @@ def test_get_version(): '2.5.3-ansi-japi-xdr.20030701_mingw32'), ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'), ('sather-logo_images.tar.gz', 'sather-logo_images'), - ('sather-specification-000328.html.tar.gz', '000328.html') + ('sather-specification-000328.html.tar.gz', '000328.html'), + ('something-10.1.0.7z', '10.1.0'), ]: actual_branchname = get_version(url) diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py index 5a4991f..8ef6bd6 100644 --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -13,7 +13,7 @@ from datetime import datetime from os import path from pathlib import Path from pytz import utc -from typing import Any, Dict, List, Mapping, Tuple +from typing import Any, List, Mapping, Sequence, Tuple from urllib.parse import urlparse @@ -31,22 +31,22 @@ class GNUTree: # Interesting top level directories self.top_level_directories = ['gnu', 'old-gnu'] # internal state - self._artifacts = {} # type: Dict - self._projects = {} # type: Dict + self._artifacts = {} # type: Mapping[str, Any] + self._projects = {} # type: Mapping[str, Any] @property - def projects(self) -> Dict: + def projects(self) -> Mapping[str, Any]: if not self._projects: self._projects, self._artifacts = self._load() return self._projects @property - def artifacts(self) -> Dict: + def artifacts(self) -> Mapping[str, Any]: if not self._artifacts: self._projects, self._artifacts = self._load() return self._artifacts - def _load(self) -> Tuple[Dict, Dict]: + def _load(self) -> Tuple[Mapping[str, Any], Mapping[str, Any]]: """Compute projects and artifacts per project Returns: @@ -81,8 +81,8 @@ class GNUTree: return projects, artifacts -def find_artifacts( - filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]: +def find_artifacts(filesystem: List[Mapping[str, Any]], + url: str) -> List[Mapping[str, Any]]: """Recursively list artifacts present in the folder and subfolders for a particular package 
url. @@ -125,7 +125,7 @@ def find_artifacts( ] """ - artifacts = [] + artifacts = [] # type: List[Mapping[str, Any]] for info_file in filesystem: filetype = info_file['type'] filename = info_file['name'] @@ -176,7 +176,6 @@ def check_filename_is_archive(filename: str) -> bool: """ file_suffixes = Path(filename).suffixes - logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes)) if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'): return True elif len(file_suffixes) > 1: @@ -186,17 +185,17 @@ def check_filename_is_archive(filename: str) -> bool: # to recognize existing naming pattern -extensions = [ +EXTENSIONS = [ 'zip', 'tar', 'gz', 'tgz', 'bz2', 'bzip2', 'lzma', 'lz', 'xz', - 'Z', + 'Z', '7z', ] -version_keywords = [ +VERSION_KEYWORDS = [ 'cygwin_me', 'w32', 'win32', 'nt', 'cygwin', 'mingw', 'latest', 'alpha', 'beta', @@ -226,24 +225,24 @@ version_keywords = [ # greedily with +, software_name and release_number are matched lazily # with +? and *?). -pattern = r''' +PATTERN = r''' ^ (?: # We have a software name and a release number, separated with a # -, _ or dot. (?P.+?[-_.]) - (?P(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+) + (?P({vkeywords}|[0-9][0-9a-zA-Z_.+:~-]*?)+) | # We couldn't match a release number, put everything in the # software name. (?P.+?) ) -(?P(?:\.(?:%(extensions)s))+) +(?P(?:\.(?:{extensions}))+) $ -''' % { - 'extensions': '|'.join(extensions), - 'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords), -} +'''.format( + extensions='|'.join(EXTENSIONS), + vkeywords='|'.join('%s[-]?' 
% k for k in VERSION_KEYWORDS), +) def get_version(uri: str) -> str: @@ -268,7 +267,7 @@ def get_version(uri: str) -> str: """ filename = path.split(uri)[-1] - m = re.match(pattern, filename, + m = re.match(PATTERN, filename, flags=re.VERBOSE | re.IGNORECASE) if m: d = m.groupdict() @@ -280,7 +279,7 @@ def get_version(uri: str) -> str: return '' -def load_raw_data(url: str) -> List[Dict]: +def load_raw_data(url: str) -> Sequence[Mapping]: """Load the raw json from the tree.json.gz Args: