Port cgit lister to the new lister api

Related to T2984
This commit is contained in:
Vincent SELLIER 2021-01-22 15:39:49 +01:00
parent 59c9abb916
commit e4a590fc7f
No known key found for this signature in database
GPG key ID: 3F13C434EADAD17D
8 changed files with 121 additions and 213 deletions

View file

@ -1,14 +1,12 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
from .lister import CGitLister
from .models import CGitModel
return {
"models": [CGitModel],
"lister": CGitLister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -1,26 +1,25 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import re
from typing import Any, Dict, Generator, Optional
from typing import Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter
import requests
from swh.core.utils import grouper
from swh.lister import USER_AGENT
from swh.lister.core.lister_base import ListerBase
from .models import CGitModel
from swh.lister.pattern import StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
Repositories = List[str]
class CGitLister(ListerBase):
class CGitLister(StatelessLister[Repositories]):
"""Lister class for CGit repositories.
This lister will retrieve the list of published git repositories by
@ -31,31 +30,12 @@ class CGitLister(ListerBase):
URL for that git repo.
If several "Clone" urls are provided, prefer the http/https one, if
any, otherwise fall bak to the first one.
A loader task is created for each git repository::
Task:
Type: load-git
Policy: recurring
Args:
<git_clonable_url>
Example::
Task:
Type: load-git
Policy: recurring
Args:
'https://git.savannah.gnu.org/git/elisp-es.git'
any, otherwise fallback to the first one.
"""
MODEL = CGitModel
DEFAULT_URL = "https://git.savannah.gnu.org/cgit/"
LISTER_NAME = "cgit"
url_prefix_present = True
def __init__(self, url=None, instance=None, override_config=None):
def __init__(self, scheduler: SchedulerInterface, url=None, instance=None):
"""Lister class for CGit repositories.
Args:
@ -65,49 +45,43 @@ class CGitLister(ListerBase):
if unset.
"""
super().__init__(override_config=override_config)
if url is None:
url = self.config.get("url", self.DEFAULT_URL)
self.url = url
if not instance:
instance = urlparse(url).hostname
self.instance = instance
self.session = Session()
self.session.mount(self.url, HTTPAdapter(max_retries=3))
self.session.headers = {
"User-Agent": USER_AGENT,
}
def run(self) -> Dict[str, str]:
status = "uneventful"
total = 0
for repos in grouper(self.get_repos(), 10):
models = list(filter(None, (self.build_model(repo) for repo in repos)))
injected_repos = self.inject_repo_data_into_db(models)
self.schedule_missing_tasks(models, injected_repos)
self.db_session.commit()
total += len(injected_repos)
logger.debug("Scheduled %s tasks for %s", total, self.url)
status = "eventful"
super().__init__(
scheduler=scheduler, credentials=None, url=url, instance=instance,
)
return {"status": status}
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
def get_repos(self) -> Generator[str, None, None]:
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)
response.raise_for_status()
return BeautifulSoup(response.text, features="html.parser")
def get_pages(self) -> Iterator[Repositories]:
"""Generate git 'project' URLs found on the current CGit server
"""
next_page = self.url
next_page: Optional[str] = self.url
while next_page:
bs_idx = self.get_and_parse(next_page)
bs_idx = self._get_and_parse(next_page)
page_results = []
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
yield urljoin(self.url, tr.find("a")["href"])
page_results.append(urljoin(self.url, tr.find("a")["href"]))
yield page_results
try:
pager = bs_idx.find("ul", {"class": "pager"})
current_page = pager.find("a", {"class": "current"})
if current_page:
next_page = current_page.parent.next_sibling.a["href"]
@ -116,11 +90,33 @@ class CGitLister(ListerBase):
# no pager, or no next page
next_page = None
def build_model(self, repo_url: str) -> Optional[Dict[str, Any]]:
"""Given the URL of a git repo project page on a CGit server,
return the repo description (dict) suitable for insertion in the db.
"""
bs = self.get_and_parse(repo_url)
def get_origins_from_page(
self, repositories: Repositories
) -> Iterator[ListedOrigin]:
"""Convert a page of cgit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
for repository_url in repositories:
origin_url = self._get_origin_from_repository_url(repository_url)
if not origin_url:
continue
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
last_update=None,
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
"""Extract the git url from the repository page"""
bs = self._get_and_parse(repository_url)
# origin urls are listed on the repository page
# TODO check if forcing https is better or not ?
# <link rel='vcs-git' href='git://...' title='...'/>
# <link rel='vcs-git' href='http://...' title='...'/>
# <link rel='vcs-git' href='https://...' title='...'/>
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
if not urls:
@ -134,15 +130,4 @@ class CGitLister(ListerBase):
else:
# otherwise, choose the first one
origin_url = urls[0]
return {
"uid": repo_url,
"name": bs.find("a", title=re.compile(".+"))["title"],
"origin_type": "git",
"instance": self.instance,
"origin_url": origin_url,
}
def get_and_parse(self, url: str) -> BeautifulSoup:
"Get the given url and parse the retrieved HTML using BeautifulSoup"
return BeautifulSoup(self.session.get(url).text, features="html.parser")
return origin_url

View file

@ -1,18 +0,0 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from ..core.models import ModelBase
class CGitModel(ModelBase):
"""a CGit repository representation
"""
__tablename__ = "cgit_repo"
uid = Column(String, primary_key=True)
instance = Column(String, index=True)

View file

@ -1,16 +1,19 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict, Optional
from celery import shared_task
from .lister import CGitLister
@shared_task(name=__name__ + ".CGitListerTask")
def list_cgit(**lister_args):
def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]:
"""Lister task for CGit instances"""
return CGitLister(**lister_args).run()
lister = CGitLister.from_configfile(url=url, instance=instance)
return lister.run().dict()
@shared_task(name=__name__ + ".ping")

View file

@ -1,26 +0,0 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
@pytest.fixture
def lister_under_test():
return "cgit"
@pytest.fixture
def lister_cgit(swh_lister):
for task_type in [
{
"type": "load-git",
"description": "Load git repository",
"backend_name": "swh.loader.git.tasks.UpdateGitRepository",
"default_interval": "1 day",
},
]:
swh_lister.scheduler.create_task_type(task_type)
return swh_lister

View file

@ -1,71 +1,65 @@
# Copyright (C) 2019-2020 the Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import List
from swh.lister import __version__
from swh.lister.cgit.lister import CGitLister
from swh.lister.pattern import ListerStats
def test_lister_cgit_no_page(requests_mock_datadir, lister_cgit):
assert lister_cgit.url == "https://git.savannah.gnu.org/cgit/"
def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
url = "https://git.savannah.gnu.org/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
repos = list(lister_cgit.get_repos())
assert len(repos) == 977
repos: List[List[str]] = list(lister_cgit.get_pages())
flattened_repos = sum(repos, [])
assert len(flattened_repos) == 977
assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
# note the url below is NOT a subpath of /cgit/
assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa
assert (
flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/"
) # noqa
# note the url below is NOT on the same server
assert repos[-2] == "http://example.org/cgit/xstarcastle.git/"
assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/"
def test_lister_cgit_model(requests_mock_datadir, lister_cgit):
repo = next(lister_cgit.get_repos())
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
url = "https://git.tizen/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
model = lister_cgit.build_model(repo)
assert model == {
"uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/",
"name": "elisp-es.git",
"origin_type": "git",
"instance": "git.savannah.gnu.org",
"origin_url": "https://git.savannah.gnu.org/git/elisp-es.git",
}
def test_lister_cgit_with_pages(requests_mock_datadir, lister_cgit):
lister_cgit.url = "https://git.tizen/cgit/"
repos = list(lister_cgit.get_repos())
repos: List[List[str]] = list(lister_cgit.get_pages())
flattened_repos = sum(repos, [])
# we should have 16 repos (listed on 3 pages)
assert len(repos) == 16
assert len(repos) == 3
assert len(flattened_repos) == 16
def test_lister_cgit_run(requests_mock_datadir, lister_cgit):
lister_cgit.url = "https://git.tizen/cgit/"
lister_cgit.run()
def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
"""cgit lister supports pagination"""
r = lister_cgit.scheduler.search_tasks(task_type="load-git")
assert len(r) == 16
url = "https://git.tizen/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
for row in r:
assert row["type"] == "load-git"
# arguments check
args = row["arguments"]["args"]
assert len(args) == 0
stats = lister_cgit.run()
# kwargs
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
url = kwargs["url"]
assert url.startswith("https://git.tizen")
expected_nb_origins = 16
assert stats == ListerStats(pages=3, origins=expected_nb_origins)
assert row["policy"] == "recurring"
assert row["priority"] is None
# test page parsing
scheduler_origins = swh_scheduler.get_listed_origins(
lister_cgit.lister_obj.id
).origins
assert len(scheduler_origins) == expected_nb_origins
# test listed repositories
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith("https://git.tizen")
def test_lister_cgit_requests(requests_mock_datadir, lister_cgit):
lister_cgit.url = "https://git.tizen/cgit/"
lister_cgit.run()
# test user agent content
assert len(requests_mock_datadir.request_history) != 0
for request in requests_mock_datadir.request_history:
assert "User-Agent" in request.headers

View file

@ -1,12 +1,14 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
res = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping")
assert res
res.wait()
@ -15,19 +17,21 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@patch("swh.lister.cgit.tasks.CGitLister")
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
def test_cgit_lister_task(
lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
# setup the mocked CGitLister
lister.return_value = lister
lister.run.return_value = None
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=10, origins=500)
kwargs = dict(url="https://git.kernel.org/", instance="kernel")
res = swh_scheduler_celery_app.send_task(
"swh.lister.cgit.tasks.CGitListerTask",
kwargs=dict(url="https://git.kernel.org/", instance="kernel"),
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,
)
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel")
lister.db_last_index.assert_not_called()
lister.from_configfile.assert_called_once_with(**kwargs)
lister.run.assert_called_once_with()

View file

@ -39,35 +39,3 @@ def test_get_lister(swh_scheduler_config):
**lister_args.get(lister_name, {}),
)
assert hasattr(lst, "run")
def test_get_lister_override():
"""Overriding the lister configuration should populate its config
"""
db_url = init_db().url()
listers = {
"cgit": "https://some.where/cgit",
}
# check the override ends up defined in the lister
for lister_name, url in listers.items():
lst = get_lister(
lister_name, db_url, url=url, priority="high", policy="oneshot"
)
assert lst.url == url
assert lst.config["priority"] == "high"
assert lst.config["policy"] == "oneshot"
# check the default urls are used and not the override (since it's not
# passed)
for lister_name, url in listers.items():
lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert "url" not in lst.config
assert "priority" not in lst.config
assert "oneshot" not in lst.config
assert lst.url == lst.DEFAULT_URL