Port cgit lister to the new lister api
Related to T2984
This commit is contained in:
parent
59c9abb916
commit
e4a590fc7f
8 changed files with 121 additions and 213 deletions
|
@ -1,14 +1,12 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
def register():
|
||||
from .lister import CGitLister
|
||||
from .models import CGitModel
|
||||
|
||||
return {
|
||||
"models": [CGitModel],
|
||||
"lister": CGitLister,
|
||||
"task_modules": ["%s.tasks" % __name__],
|
||||
}
|
||||
|
|
|
@ -1,26 +1,25 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, Generator, Optional
|
||||
from typing import Iterator, List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import Session
|
||||
from requests.adapters import HTTPAdapter
|
||||
import requests
|
||||
|
||||
from swh.core.utils import grouper
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.core.lister_base import ListerBase
|
||||
|
||||
from .models import CGitModel
|
||||
from swh.lister.pattern import StatelessLister
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Repositories = List[str]
|
||||
|
||||
class CGitLister(ListerBase):
|
||||
|
||||
class CGitLister(StatelessLister[Repositories]):
|
||||
"""Lister class for CGit repositories.
|
||||
|
||||
This lister will retrieve the list of published git repositories by
|
||||
|
@ -31,31 +30,12 @@ class CGitLister(ListerBase):
|
|||
URL for that git repo.
|
||||
|
||||
If several "Clone" urls are provided, prefer the http/https one, if
|
||||
any, otherwise fall bak to the first one.
|
||||
|
||||
A loader task is created for each git repository::
|
||||
|
||||
Task:
|
||||
Type: load-git
|
||||
Policy: recurring
|
||||
Args:
|
||||
<git_clonable_url>
|
||||
|
||||
Example::
|
||||
|
||||
Task:
|
||||
Type: load-git
|
||||
Policy: recurring
|
||||
Args:
|
||||
'https://git.savannah.gnu.org/git/elisp-es.git'
|
||||
any, otherwise fallback to the first one.
|
||||
"""
|
||||
|
||||
MODEL = CGitModel
|
||||
DEFAULT_URL = "https://git.savannah.gnu.org/cgit/"
|
||||
LISTER_NAME = "cgit"
|
||||
url_prefix_present = True
|
||||
|
||||
def __init__(self, url=None, instance=None, override_config=None):
|
||||
def __init__(self, scheduler: SchedulerInterface, url=None, instance=None):
|
||||
"""Lister class for CGit repositories.
|
||||
|
||||
Args:
|
||||
|
@ -65,49 +45,43 @@ class CGitLister(ListerBase):
|
|||
if unset.
|
||||
|
||||
"""
|
||||
super().__init__(override_config=override_config)
|
||||
|
||||
if url is None:
|
||||
url = self.config.get("url", self.DEFAULT_URL)
|
||||
self.url = url
|
||||
|
||||
if not instance:
|
||||
instance = urlparse(url).hostname
|
||||
self.instance = instance
|
||||
self.session = Session()
|
||||
self.session.mount(self.url, HTTPAdapter(max_retries=3))
|
||||
self.session.headers = {
|
||||
"User-Agent": USER_AGENT,
|
||||
}
|
||||
|
||||
def run(self) -> Dict[str, str]:
|
||||
status = "uneventful"
|
||||
total = 0
|
||||
for repos in grouper(self.get_repos(), 10):
|
||||
models = list(filter(None, (self.build_model(repo) for repo in repos)))
|
||||
injected_repos = self.inject_repo_data_into_db(models)
|
||||
self.schedule_missing_tasks(models, injected_repos)
|
||||
self.db_session.commit()
|
||||
total += len(injected_repos)
|
||||
logger.debug("Scheduled %s tasks for %s", total, self.url)
|
||||
status = "eventful"
|
||||
super().__init__(
|
||||
scheduler=scheduler, credentials=None, url=url, instance=instance,
|
||||
)
|
||||
|
||||
return {"status": status}
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{"Accept": "application/html", "User-Agent": USER_AGENT}
|
||||
)
|
||||
|
||||
def get_repos(self) -> Generator[str, None, None]:
|
||||
def _get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
|
||||
response = self.session.get(url)
|
||||
response.raise_for_status()
|
||||
return BeautifulSoup(response.text, features="html.parser")
|
||||
|
||||
def get_pages(self) -> Iterator[Repositories]:
|
||||
"""Generate git 'project' URLs found on the current CGit server
|
||||
|
||||
"""
|
||||
next_page = self.url
|
||||
next_page: Optional[str] = self.url
|
||||
while next_page:
|
||||
bs_idx = self.get_and_parse(next_page)
|
||||
bs_idx = self._get_and_parse(next_page)
|
||||
page_results = []
|
||||
|
||||
for tr in bs_idx.find("div", {"class": "content"}).find_all(
|
||||
"tr", {"class": ""}
|
||||
):
|
||||
yield urljoin(self.url, tr.find("a")["href"])
|
||||
page_results.append(urljoin(self.url, tr.find("a")["href"]))
|
||||
|
||||
yield page_results
|
||||
|
||||
try:
|
||||
pager = bs_idx.find("ul", {"class": "pager"})
|
||||
|
||||
current_page = pager.find("a", {"class": "current"})
|
||||
if current_page:
|
||||
next_page = current_page.parent.next_sibling.a["href"]
|
||||
|
@ -116,11 +90,33 @@ class CGitLister(ListerBase):
|
|||
# no pager, or no next page
|
||||
next_page = None
|
||||
|
||||
def build_model(self, repo_url: str) -> Optional[Dict[str, Any]]:
|
||||
"""Given the URL of a git repo project page on a CGit server,
|
||||
return the repo description (dict) suitable for insertion in the db.
|
||||
"""
|
||||
bs = self.get_and_parse(repo_url)
|
||||
def get_origins_from_page(
|
||||
self, repositories: Repositories
|
||||
) -> Iterator[ListedOrigin]:
|
||||
"""Convert a page of cgit repositories into a list of ListedOrigins."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for repository_url in repositories:
|
||||
origin_url = self._get_origin_from_repository_url(repository_url)
|
||||
if not origin_url:
|
||||
continue
|
||||
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=origin_url,
|
||||
visit_type="git",
|
||||
last_update=None,
|
||||
)
|
||||
|
||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
|
||||
"""Extract the git url from the repository page"""
|
||||
bs = self._get_and_parse(repository_url)
|
||||
|
||||
# origin urls are listed on the repository page
|
||||
# TODO check if forcing https is better or not ?
|
||||
# <link rel='vcs-git' href='git://...' title='...'/>
|
||||
# <link rel='vcs-git' href='http://...' title='...'/>
|
||||
# <link rel='vcs-git' href='https://...' title='...'/>
|
||||
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
|
||||
|
||||
if not urls:
|
||||
|
@ -134,15 +130,4 @@ class CGitLister(ListerBase):
|
|||
else:
|
||||
# otherwise, choose the first one
|
||||
origin_url = urls[0]
|
||||
|
||||
return {
|
||||
"uid": repo_url,
|
||||
"name": bs.find("a", title=re.compile(".+"))["title"],
|
||||
"origin_type": "git",
|
||||
"instance": self.instance,
|
||||
"origin_url": origin_url,
|
||||
}
|
||||
|
||||
def get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"Get the given url and parse the retrieved HTML using BeautifulSoup"
|
||||
return BeautifulSoup(self.session.get(url).text, features="html.parser")
|
||||
return origin_url
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String
|
||||
|
||||
from ..core.models import ModelBase
|
||||
|
||||
|
||||
class CGitModel(ModelBase):
|
||||
"""a CGit repository representation
|
||||
|
||||
"""
|
||||
|
||||
__tablename__ = "cgit_repo"
|
||||
|
||||
uid = Column(String, primary_key=True)
|
||||
instance = Column(String, index=True)
|
|
@ -1,16 +1,19 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from typing import Dict, Optional
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from .lister import CGitLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".CGitListerTask")
|
||||
def list_cgit(**lister_args):
|
||||
def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]:
|
||||
"""Lister task for CGit instances"""
|
||||
return CGitLister(**lister_args).run()
|
||||
lister = CGitLister.from_configfile(url=url, instance=instance)
|
||||
return lister.run().dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
# Copyright (C) 2019-2020 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lister_under_test():
|
||||
return "cgit"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lister_cgit(swh_lister):
|
||||
for task_type in [
|
||||
{
|
||||
"type": "load-git",
|
||||
"description": "Load git repository",
|
||||
"backend_name": "swh.loader.git.tasks.UpdateGitRepository",
|
||||
"default_interval": "1 day",
|
||||
},
|
||||
]:
|
||||
swh_lister.scheduler.create_task_type(task_type)
|
||||
|
||||
return swh_lister
|
|
@ -1,71 +1,65 @@
|
|||
# Copyright (C) 2019-2020 the Software Heritage developers
|
||||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from typing import List
|
||||
|
||||
from swh.lister import __version__
|
||||
from swh.lister.cgit.lister import CGitLister
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_lister_cgit_no_page(requests_mock_datadir, lister_cgit):
|
||||
assert lister_cgit.url == "https://git.savannah.gnu.org/cgit/"
|
||||
def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
|
||||
url = "https://git.savannah.gnu.org/cgit/"
|
||||
lister_cgit = CGitLister(swh_scheduler, url=url)
|
||||
|
||||
repos = list(lister_cgit.get_repos())
|
||||
assert len(repos) == 977
|
||||
repos: List[List[str]] = list(lister_cgit.get_pages())
|
||||
flattened_repos = sum(repos, [])
|
||||
assert len(flattened_repos) == 977
|
||||
|
||||
assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
|
||||
assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
|
||||
# note the url below is NOT a subpath of /cgit/
|
||||
assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa
|
||||
assert (
|
||||
flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/"
|
||||
) # noqa
|
||||
# note the url below is NOT on the same server
|
||||
assert repos[-2] == "http://example.org/cgit/xstarcastle.git/"
|
||||
assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/"
|
||||
|
||||
|
||||
def test_lister_cgit_model(requests_mock_datadir, lister_cgit):
|
||||
repo = next(lister_cgit.get_repos())
|
||||
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
|
||||
url = "https://git.tizen/cgit/"
|
||||
lister_cgit = CGitLister(swh_scheduler, url=url)
|
||||
|
||||
model = lister_cgit.build_model(repo)
|
||||
assert model == {
|
||||
"uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/",
|
||||
"name": "elisp-es.git",
|
||||
"origin_type": "git",
|
||||
"instance": "git.savannah.gnu.org",
|
||||
"origin_url": "https://git.savannah.gnu.org/git/elisp-es.git",
|
||||
}
|
||||
|
||||
|
||||
def test_lister_cgit_with_pages(requests_mock_datadir, lister_cgit):
|
||||
lister_cgit.url = "https://git.tizen/cgit/"
|
||||
|
||||
repos = list(lister_cgit.get_repos())
|
||||
repos: List[List[str]] = list(lister_cgit.get_pages())
|
||||
flattened_repos = sum(repos, [])
|
||||
# we should have 16 repos (listed on 3 pages)
|
||||
assert len(repos) == 16
|
||||
assert len(repos) == 3
|
||||
assert len(flattened_repos) == 16
|
||||
|
||||
|
||||
def test_lister_cgit_run(requests_mock_datadir, lister_cgit):
|
||||
lister_cgit.url = "https://git.tizen/cgit/"
|
||||
lister_cgit.run()
|
||||
def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
|
||||
"""cgit lister supports pagination"""
|
||||
|
||||
r = lister_cgit.scheduler.search_tasks(task_type="load-git")
|
||||
assert len(r) == 16
|
||||
url = "https://git.tizen/cgit/"
|
||||
lister_cgit = CGitLister(swh_scheduler, url=url)
|
||||
|
||||
for row in r:
|
||||
assert row["type"] == "load-git"
|
||||
# arguments check
|
||||
args = row["arguments"]["args"]
|
||||
assert len(args) == 0
|
||||
stats = lister_cgit.run()
|
||||
|
||||
# kwargs
|
||||
kwargs = row["arguments"]["kwargs"]
|
||||
assert len(kwargs) == 1
|
||||
url = kwargs["url"]
|
||||
assert url.startswith("https://git.tizen")
|
||||
expected_nb_origins = 16
|
||||
assert stats == ListerStats(pages=3, origins=expected_nb_origins)
|
||||
|
||||
assert row["policy"] == "recurring"
|
||||
assert row["priority"] is None
|
||||
# test page parsing
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(
|
||||
lister_cgit.lister_obj.id
|
||||
).origins
|
||||
assert len(scheduler_origins) == expected_nb_origins
|
||||
|
||||
# test listed repositories
|
||||
for listed_origin in scheduler_origins:
|
||||
assert listed_origin.visit_type == "git"
|
||||
assert listed_origin.url.startswith("https://git.tizen")
|
||||
|
||||
def test_lister_cgit_requests(requests_mock_datadir, lister_cgit):
|
||||
lister_cgit.url = "https://git.tizen/cgit/"
|
||||
lister_cgit.run()
|
||||
|
||||
# test user agent content
|
||||
assert len(requests_mock_datadir.request_history) != 0
|
||||
for request in requests_mock_datadir.request_history:
|
||||
assert "User-Agent" in request.headers
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
# Copyright (C) 2019-2020 The Software Heritage developers
|
||||
# Copyright (C) 2019-2021 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
||||
|
||||
def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
||||
res = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping")
|
||||
assert res
|
||||
res.wait()
|
||||
|
@ -15,19 +17,21 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
|||
|
||||
|
||||
@patch("swh.lister.cgit.tasks.CGitLister")
|
||||
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
||||
def test_cgit_lister_task(
|
||||
lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
|
||||
):
|
||||
# setup the mocked CGitLister
|
||||
lister.return_value = lister
|
||||
lister.run.return_value = None
|
||||
lister.from_configfile.return_value = lister
|
||||
lister.run.return_value = ListerStats(pages=10, origins=500)
|
||||
|
||||
kwargs = dict(url="https://git.kernel.org/", instance="kernel")
|
||||
|
||||
res = swh_scheduler_celery_app.send_task(
|
||||
"swh.lister.cgit.tasks.CGitListerTask",
|
||||
kwargs=dict(url="https://git.kernel.org/", instance="kernel"),
|
||||
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,
|
||||
)
|
||||
assert res
|
||||
res.wait()
|
||||
assert res.successful()
|
||||
|
||||
lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel")
|
||||
lister.db_last_index.assert_not_called()
|
||||
lister.from_configfile.assert_called_once_with(**kwargs)
|
||||
lister.run.assert_called_once_with()
|
||||
|
|
|
@ -39,35 +39,3 @@ def test_get_lister(swh_scheduler_config):
|
|||
**lister_args.get(lister_name, {}),
|
||||
)
|
||||
assert hasattr(lst, "run")
|
||||
|
||||
|
||||
def test_get_lister_override():
|
||||
"""Overriding the lister configuration should populate its config
|
||||
|
||||
"""
|
||||
db_url = init_db().url()
|
||||
|
||||
listers = {
|
||||
"cgit": "https://some.where/cgit",
|
||||
}
|
||||
|
||||
# check the override ends up defined in the lister
|
||||
for lister_name, url in listers.items():
|
||||
lst = get_lister(
|
||||
lister_name, db_url, url=url, priority="high", policy="oneshot"
|
||||
)
|
||||
|
||||
assert lst.url == url
|
||||
assert lst.config["priority"] == "high"
|
||||
assert lst.config["policy"] == "oneshot"
|
||||
|
||||
# check the default urls are used and not the override (since it's not
|
||||
# passed)
|
||||
for lister_name, url in listers.items():
|
||||
lst = get_lister(lister_name, db_url)
|
||||
|
||||
# no override so this does not end up in lister's configuration
|
||||
assert "url" not in lst.config
|
||||
assert "priority" not in lst.config
|
||||
assert "oneshot" not in lst.config
|
||||
assert lst.url == lst.DEFAULT_URL
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue