gnu: Reimplement lister using new Lister API

Port the stateless GNU lister to the new swh.lister.pattern.Lister API,
keeping its functionality unchanged (iso-functional port).

Closes T2990
This commit is contained in:
Antoine Lambert 2021-01-29 11:49:59 +01:00
parent 5aa7c8f2b2
commit 4cf0c7f765
9 changed files with 84 additions and 404 deletions

View file

@ -1,14 +1,12 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
from .lister import GNULister
from .models import GNUModel
return {
"models": [GNUModel],
"lister": GNULister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -1,112 +1,68 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, List
from typing import Any, Iterator, Mapping
from requests import Response
import iso8601
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.gnu.models import GNUModel
from swh.lister.gnu.tree import GNUTree
from swh.scheduler import utils
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from .tree import GNUTree
logger = logging.getLogger(__name__)
GNUPageType = Mapping[str, Any]
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = "gnu"
instance = "gnu"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz")
class GNULister(StatelessLister[GNUPageType]):
"""
List all GNU projects and associated artifacts.
"""
def task_dict(self, origin_type, origin_url, **kwargs):
"""Return task format dict
LISTER_NAME = "GNU"
GNU_FTP_URL = "https://ftp.gnu.org"
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
This creates tasks with args and kwargs set, for example:
.. code-block:: python
args:
kwargs: {
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'artifacts': [{
'url': 'https://...',
'time': '2003-12-09T21:43:20+00:00',
'length': 128,
'version': '1.0.1',
'filename': 'something-1.0.1.tar.gz',
},
...
]
}
"""
artifacts = self.gnu_tree.artifacts[origin_url]
assert origin_type == "tar"
return utils.create_task_dict(
"load-archive-files",
kwargs.get("policy", "oneshot"),
url=origin_url,
artifacts=artifacts,
retries_left=3,
def __init__(
self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
):
super().__init__(
scheduler=scheduler,
url=self.GNU_FTP_URL,
instance="GNU",
credentials=credentials,
)
self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz")
def safely_issue_request(self, identifier: int) -> None:
"""Bypass the implementation. It's now the GNUTree which deals with
querying the gnu mirror.
As an implementation detail, we cannot change simply the base
SimpleLister as other implementation still uses it. This shall be part
of another refactoring pass.
def get_pages(self) -> Iterator[GNUPageType]:
"""
return None
def list_packages(self, response: Response) -> List[Dict[str, Any]]:
"""List the actual gnu origins (package name) with their name, url and
associated tarballs.
Args:
response: Unused
Returns:
List of packages name, url, last modification time::
[
{
'name': '3dldf',
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'time_modified': '2003-12-09T20:43:20+00:00'
},
{
'name': '8sync',
'url': 'https://ftp.gnu.org/gnu/8sync/',
'time_modified': '2016-12-06T02:37:10+00:00'
},
...
]
Yield a single page listing all GNU projects.
"""
return list(self.gnu_tree.projects.values())
def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
"""Transform from repository representation to model
yield self.gnu_tree.projects
def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
"""
return {
"uid": repo["url"],
"name": repo["name"],
"full_name": repo["name"],
"html_url": repo["url"],
"origin_url": repo["url"],
"time_last_updated": repo["time_modified"],
"origin_type": "tar",
}
Iterate on all GNU projects and yield ListedOrigin instances.
"""
assert self.lister_obj.id is not None
artifacts = self.gnu_tree.artifacts
for project_name, project_info in page.items():
origin_url = project_info["url"]
last_update = iso8601.parse_date(project_info["time_modified"])
logger.debug("Found origin %s last updated on %s", origin_url, last_update)
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="tar",
last_update=last_update,
extra_loader_arguments={"artifacts": artifacts[project_name]},
)

View file

@ -1,18 +0,0 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, DateTime, String
from ..core.models import ModelBase
class GNUModel(ModelBase):
"""Database model representing one GNU repository.

Built on the project's ``ModelBase``; rows are stored in the
``gnu_repo`` table and keyed by ``uid``.
"""
# Name of the table backing this model.
__tablename__ = "gnu_repo"
# String primary key identifying the repository.
uid = Column(String, primary_key=True)
# Date/time column; by its name, when the repository was last updated.
time_last_updated = Column(DateTime)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -10,7 +10,7 @@ from .lister import GNULister
@shared_task(name=__name__ + ".GNUListerTask")
def list_gnu_full(**lister_args):
"""List lister for the GNU source code archive"""
return GNULister(**lister_args).run()
return GNULister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".ping")

View file

@ -1,37 +0,0 @@
[{"type":"directory","name": ".","contents":[
{"type":"file","name":".footer.shtml","size":444,"time":"1359994299"},
{"type":"file","name":"find.txt.gz","size":261428,"time":"1557684608"},
{"type":"directory","name":"gnu","size":12288,"time":"1556742017","contents":[]},
{"type":"directory","name":"gnu+linux-distros","size":4096,"time":"1299783002","contents":[
{"type":"directory","name":"ututo-e","size":4096,"time":"1487780066","contents":[
{"type":"file","name":"README","size":48,"time":"1487780066"},
{"type":"file","name":"index.html","size":158,"time":"1487780054"}
]}
]},
{"type":"file","name":"ls-lrRt.txt.gz","size":480054,"time":"1557684607"},
{"type":"directory","name":"mirrors","size":4096,"time":"1114010630","contents":[
{"type":"directory","name":"dynebolic","size":4096,"time":"1317827602","contents":[
{"type":"file","name":"MOVED_TO_mirror.fsf.org_dynebolic","size":0,"time":"1317826935"},
{"type":"file","name":"index.html","size":107,"time":"1317827601"}
]}
]},
{"type":"link","name":"non-gnu","target":"gnu/non-gnu","size":11,"time":"1082055542","contents":[]},
{"type":"directory","name":"old-gnu","size":4096,"time":"1548360019","contents":[]},
{"type":"link","name":"pub","target":".","size":1,"time":"1060090003","contents":[]},
{"type":"directory","name":"savannah","size":4096,"time":"1194544006","contents":[
{"type":"file","name":"README","size":473,"time":"1143758028"}
]},
{"type":"directory","name":"third-party","size":4096,"time":"1059825710","contents":[
{"type":"file","name":"README","size":374,"time":"983824071"}
]},
{"type":"directory","name":"tmp","size":4096,"time":"1239072509","contents":[
]},
{"type":"file","name":"tree.json.gz","size":0,"time":"1557684608"},
{"type":"directory","name":"video","size":4096,"time":"1367963189","contents":[
{"type":"file","name":".bash_history","size":27,"time":"1307027604"},
{"type":"file","name":"stallmanupv.ogg.sig","size":536,"time":"1299776853"}
]},
{"type":"file","name":"welcome.msg","size":2830,"time":"1545163301"}
]},
{"type":"report","directories":2743,"files":63983}
]

View file

@ -1,26 +0,0 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
@pytest.fixture
def lister_under_test():
# Fixture value is the lister name "gnu".
# NOTE(review): presumably consumed by a shared lister fixture to pick the
# lister to instantiate -- confirm against the shared test fixtures.
return "gnu"
@pytest.fixture
def lister_gnu(swh_lister):
# Register the "load-archive-files" task type on the lister's scheduler,
# then hand back the same lister object to the test.
for task_type in [
{
"type": "load-archive-files",
"description": "Load archive repository",
"backend_name": "swh.loader.packages.tasks.LoadArchive",
"default_interval": "1 day",
},
]:
# One scheduler call per task-type definition in the list above.
swh_lister.scheduler.create_task_type(task_type)
return swh_lister

View file

@ -1,182 +0,0 @@
[
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.bz2",
"date": "1495205979"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.gz",
"date": "1495205967"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.12-f39e-dirty.tar.gz",
"date": "1494994222"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.bz2",
"date": "1520284021"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.gz",
"date": "1520284007"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.bz2",
"date": "1521742071"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.gz",
"date": "1521742057"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.bz2",
"date": "1525717261"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.gz",
"date": "1525717246"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.bz2",
"date": "1546205569"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.gz",
"date": "1546205555"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.bz2",
"date": "1546205025"
},
{
"archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.gz",
"date": "1546205012"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_0-src.zip",
"date": "898422900"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_2-src.zip",
"date": "920018269"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_3-src.zip",
"date": "936750503"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_4-src.tar.gz",
"date": "944290190"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_5-src.tar.gz",
"date": "944600462"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_6-src.tar.gz",
"date": "952156231"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_7-src.tar.gz",
"date": "952313061"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_1_0-src.tar.gz",
"date": "969299378"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_0beta-src.tar.gz",
"date": "977027031"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_1-src.tar.gz",
"date": "981323331"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_2-src.tar.gz",
"date": "981570576"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_3-src.tar.gz",
"date": "982656672"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_4-src.tar.gz",
"date": "1007952574"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_5-src.tar.gz",
"date": "1008502483"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_6-src.tar.gz",
"date": "1012641285"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-3.6.2.tar.gz",
"date": "869814000"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.0.tar.gz",
"date": "898422900"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.2.tar.gz",
"date": "920018202"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.3.tar.gz",
"date": "936750512"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.4.tar.gz",
"date": "944290148"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.5.tar.gz",
"date": "944599461"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.6.tar.gz",
"date": "952156235"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.7.tar.gz",
"date": "952313085"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.1.0.tar.gz",
"date": "969299287"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.0beta.tar.gz",
"date": "977027108"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.1.tar.gz",
"date": "981323501"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.2.tar.gz",
"date": "981562809"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.3.tar.gz",
"date": "982657006"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.4.tar.gz",
"date": "1007952745"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.5.tar.gz",
"date": "1008466945"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.6.tar.gz",
"date": "1012641715"
},
{
"archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.7.tar.gz",
"date": "1070057764"
}
]

View file

@ -1,47 +1,36 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
logger = logging.getLogger(__name__)
from ..lister import GNULister
def test_gnu_lister(lister_gnu, requests_mock_datadir):
lister_gnu.run()
def test_gnu_lister(swh_scheduler, requests_mock_datadir):
lister = GNULister(scheduler=swh_scheduler)
r = lister_gnu.scheduler.search_tasks(task_type="load-archive-files")
assert len(r) == 383
stats = lister.run()
for row in r:
assert row["type"] == "load-archive-files"
# arguments check
args = row["arguments"]["args"]
assert len(args) == 0
assert stats.pages == 1
assert stats.origins == 383
# kwargs
kwargs = row["arguments"]["kwargs"]
assert set(kwargs.keys()) == {"url", "artifacts"}
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
url = kwargs["url"]
assert url.startswith("https://ftp.gnu.org")
assert len(scheduler_origins) == stats.origins
url_suffix = url.split("https://ftp.gnu.org")[1]
assert "gnu" in url_suffix or "old-gnu" in url_suffix
for origin in scheduler_origins:
assert origin.url.startswith(GNULister.GNU_FTP_URL)
assert origin.last_update is not None
assert "artifacts" in origin.extra_loader_arguments
assert len(origin.extra_loader_arguments["artifacts"]) > 0
artifacts = kwargs["artifacts"]
# check the artifact's structure
artifact = artifacts[0]
assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"}
for artifact in artifacts:
logger.debug(artifact)
# 'time' is an isoformat string now
for key in ["url", "time", "filename", "version"]:
assert isinstance(artifact[key], str)
assert isinstance(artifact["length"], int)
assert row["policy"] == "oneshot"
assert row["priority"] is None
assert row["retries_left"] == 3
def test_gnu_lister_from_configfile(swh_scheduler_config, mocker):
# Check that GNULister.from_configfile() yields a lister whose scheduler
# and credentials attributes are both set.
# Replace the configuration loader referenced from swh.lister.pattern so
# that it returns the in-memory configuration below.
load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
load_from_envvar.return_value = {
# "local" scheduler class, parameterized by the test scheduler config.
"scheduler": {"cls": "local", **swh_scheduler_config},
# Empty credentials mapping: still expected to be non-None on the lister.
"credentials": {},
}
lister = GNULister.from_configfile()
assert lister.scheduler is not None
assert lister.credentials is not None

View file

@ -1,9 +1,9 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@ -14,17 +14,17 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
assert res.result == "OK"
@patch("swh.lister.gnu.tasks.GNULister")
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
# setup the mocked GNULister
lister.return_value = lister
lister.run.return_value = None
def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
lister = mocker.patch("swh.lister.gnu.tasks.GNULister")
lister.from_configfile.return_value = lister
stats = ListerStats(pages=1, origins=300)
lister.run.return_value = stats
res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.GNUListerTask")
assert res
res.wait()
assert res.successful()
assert res.result == stats.dict()
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()