Added GiteaLister

Summary: Lister implementation for Gitea, work for T2313. For now, because of https://github.com/go-gitea/gitea/issues/9165, it would require setting its `limit` parameter to 50.

Reviewers: #reviewers, ardumont

Reviewed By: #reviewers, ardumont

Subscribers: ardumont

Differential Revision: https://forge.softwareheritage.org/D3107
This commit is contained in:
Léni Gauffier 2020-06-10 17:03:30 +02:00 committed by leni
parent 566294749e
commit 1408517c08
12 changed files with 752 additions and 0 deletions

View file

@ -67,6 +67,7 @@ setup(
lister.phabricator=swh.lister.phabricator:register
lister.pypi=swh.lister.pypi:register
lister.launchpad=swh.lister.launchpad:register
lister.gitea=swh.lister.gitea:register
""",
classifiers=[
"Programming Language :: Python :: 3",

View file

@ -0,0 +1,14 @@
# Copyright (C) 2020 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the Gitea lister plugin.

    This is the callable referenced by the ``lister.gitea`` entry point. The
    imports are done lazily, inside the function, so that merely importing the
    package does not pull in the model and lister modules.

    Returns:
        A dict with the model classes (``models``), the lister class
        (``lister``) and the dotted names of the task modules
        (``task_modules``).
    """
    from .models import GiteaModel
    from .lister import GiteaLister

    tasks_module = "%s.tasks" % __name__
    return dict(
        models=[GiteaModel],
        lister=GiteaLister,
        task_modules=[tasks_module],
    )

View file

@ -0,0 +1,91 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from ..core.page_by_page_lister import PageByPageHttpLister
from .models import GiteaModel
from typing import Any, Dict, List, Tuple, MutableMapping, Optional
from requests import Response
import re
from urllib3.util import parse_url
class GiteaLister(PageByPageHttpLister):
    """Page-by-page lister for the repositories of a Gitea instance.

    Repositories are fetched from the ``repos/search`` endpoint, sorted by id.
    The ordering direction and the page size are appended to the path template
    when the lister is instantiated.
    """

    # Template path expecting an integer that represents the page id
    PATH_TEMPLATE = "repos/search?page=%d&sort=id"
    DEFAULT_URL = "https://try.gitea.io/api/v1/"
    MODEL = GiteaModel
    LISTER_NAME = "gitea"

    # Captures the page number of a ``.../search?...page=<n>`` URL. Compiled
    # once for the class rather than on every get_page_from_url call.
    _PAGE_RE = re.compile(r"^.*/search\?.*page=(\d+)")

    def __init__(
        self, url=None, instance=None, override_config=None, order="asc", limit=3
    ):
        """Set up the lister.

        Args:
            url: base API url, forwarded to the parent lister.
            instance: name identifying the Gitea instance; when omitted, the
                host part of ``self.url`` is used.
            override_config: forwarded to the parent lister.
            order: value of the ``order`` query parameter.
            limit: value of the ``limit`` query parameter (page size).
        """
        super().__init__(url=url, override_config=override_config)
        if instance is None:
            instance = parse_url(self.url).host
        self.instance = instance
        # Shadow the class-level template with an instance-level one carrying
        # the ordering and page size; the "%d" page placeholder is preserved.
        self.PATH_TEMPLATE = "%s&order=%s&limit=%s" % (
            self.PATH_TEMPLATE,
            order,
            limit,
        )

    def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one repository entry of the API response into model fields.

        Args:
            repo: repository dict; the keys ``id``, ``name``, ``full_name``,
                ``html_url`` and ``clone_url`` are read.

        Returns:
            A dict of model field values for that repository.
        """
        return {
            "instance": self.instance,
            "uid": repo["id"],
            "name": repo["name"],
            "full_name": repo["full_name"],
            "html_url": repo["html_url"],
            "origin_url": repo["clone_url"],
            "origin_type": "git",
        }

    def uid(self, id: str) -> str:
        """Return ``id`` prefixed with the instance name (``<instance>/<id>``)."""
        return f"{self.instance}/{id}"

    def get_next_target_from_response(self, response: Response) -> Optional[int]:
        """Determine the next page identifier.

        Returns:
            The page number found in the ``next`` link of the response, or
            None when the response has no such link.
        """
        if "next" in response.links:
            next_url = response.links["next"]["url"]
            return self.get_page_from_url(next_url)
        return None

    def get_page_from_url(self, url: str) -> int:
        """Extract the page number from a search url.

        Args:
            url: a url containing ``/search?`` followed by a ``page=<n>``
                query parameter.

        Returns:
            The page number as an integer.

        Raises:
            ValueError: when no page number can be found in ``url``.
        """
        found = self._PAGE_RE.match(url)
        if found is None:
            # Fail with an explicit message rather than an AttributeError on
            # None further down.
            raise ValueError("Unable to find a page number in url: %s" % url)
        return int(found.group(1))

    def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]:
        """Turn the ``data`` list of a JSON response into a list of model dicts."""
        repos = response.json()["data"]
        return [self.get_model_from_repo(repo) for repo in repos]

    def get_pages_information(
        self,
    ) -> Tuple[Optional[int], Optional[int], Optional[int]]:
        """Determine pages information.

        Probes page 1 through ``transport_head`` and reads the pagination data
        off the response headers and links.

        Returns:
            A ``(total count, total pages, entries per page)`` tuple. The
            counts come from the ``x-total-count`` and ``x-per-page`` headers
            and are None when the header is missing or empty.

        Raises:
            ValueError: when the probe response is not ok.
        """
        response = self.transport_head(identifier=1)  # type: ignore
        if not response.ok:
            raise ValueError(
                "Problem during information fetch: %s" % response.status_code
            )
        headers = response.headers
        last_link = response.links.get("last")
        if last_link is not None:
            total_pages = self.get_page_from_url(last_link["url"])
        else:
            # No "last" relation on the probe of page 1: treat that page as the
            # only one instead of failing with a KeyError.
            # NOTE(review): assumes the server omits "last" only when page 1 is
            # already the final page - confirm against the Gitea API.
            total_pages = 1
        return (
            self._get_int(headers, "x-total-count"),
            total_pages,
            self._get_int(headers, "x-per-page"),
        )

    def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]:
        """Return header ``key`` as an int, or None when absent or empty."""
        _val = headers.get(key)
        if _val:
            return int(_val)
        return None

    def run(self, min_bound=1, max_bound=None, check_existence=False):
        """Run the parent lister, with ``min_bound`` defaulting to page 1."""
        return super().run(min_bound, max_bound, check_existence)

View file

@ -0,0 +1,18 @@
# Copyright (C) 2020 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, Integer, String
from ..core.models import ModelBase
class GiteaModel(ModelBase):
    """a Gitea repository from a gitea instance

    Rows are stored in the ``gitea_repo`` table. Only the two columns below
    are declared here; any other column presumably comes from ``ModelBase``
    (not visible in this module).
    """

    __tablename__ = "gitea_repo"

    # Integer primary key.
    # NOTE(review): GiteaLister.get_model_from_repo fills this with the raw
    # repository "id" of the API response; such ids are presumably unique
    # within a single instance only - confirm that rows coming from different
    # instances cannot collide on this key.
    uid = Column(Integer, primary_key=True)
    # Indexed string column holding the instance value set by the lister.
    instance = Column(String, index=True)

53
swh/lister/gitea/tasks.py Normal file
View file

@ -0,0 +1,53 @@
# Copyright (C) 2020 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from celery import group, shared_task
from .. import utils
from .lister import GiteaLister
# Size, in pages, of the ranges produced by utils.split_range when a full
# listing is split into RangeGiteaLister sub-tasks.
NBPAGES = 10
@shared_task(name=__name__ + ".IncrementalGiteaLister")
def list_gitea_incremental(**lister_args):
    """Incremental update of a Gitea instance.

    Lists the instance in descending order, from page 1 up to the last page
    reported by the lister.

    Args:
        **lister_args: keyword arguments forwarded to ``GiteaLister``; any
            ``order`` given by the caller is overridden with ``"desc"``.

    Returns:
        Whatever ``GiteaLister.run`` returns.
    """
    # GiteaLister.__init__ names its ordering keyword "order"; it has no
    # "sort" parameter, so passing "sort" raises TypeError on the real class.
    lister_args["order"] = "desc"
    lister = GiteaLister(**lister_args)
    total_pages = lister.get_pages_information()[1]
    # stopping as soon as existing origins for that instance are detected
    return lister.run(min_bound=1, max_bound=total_pages, check_existence=True)
@shared_task(name=__name__ + ".RangeGiteaLister")
def _range_gitea_lister(start, end, **lister_args):
    """List the pages of a Gitea instance bounded by ``start`` and ``end``.

    Args:
        start: passed to the lister as ``min_bound``.
        end: passed to the lister as ``max_bound``.
        **lister_args: keyword arguments forwarded to ``GiteaLister``.

    Returns:
        Whatever ``GiteaLister.run`` returns.
    """
    return GiteaLister(**lister_args).run(min_bound=start, max_bound=end)
@shared_task(name=__name__ + ".FullGiteaRelister", bind=True)
def list_gitea_full(self, **lister_args):
    """Full update of a Gitea instance.

    Splits the whole page span into ranges of NBPAGES pages, shuffles them and
    spawns one RangeGiteaLister sub-task per range as a celery group.

    Args:
        **lister_args: keyword arguments forwarded to ``GiteaLister`` and to
            every sub-task.

    Returns:
        The id of the spawned group result.
    """
    _count, total_pages, _per_page = GiteaLister(**lister_args).get_pages_information()

    page_ranges = list(utils.split_range(total_pages, NBPAGES))
    # Randomize the order in which the ranges get scheduled.
    random.shuffle(page_ranges)

    subtasks = group(
        _range_gitea_lister.s(first, last, **lister_args)
        for first, last in page_ranges
    )
    promise = subtasks()
    self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(page_ranges)))

    # Saving the group result is best effort: not every result backend
    # supports it.
    try:
        promise.save()
    except (NotImplementedError, AttributeError):
        self.log.info("Unable to call save_group with current result backend.")

    # FIXME: what to do in terms of return here?
    return promise.id
@shared_task(name=__name__ + ".ping")
def _ping():
    """Liveness task: always answers ``"OK"``."""
    answer = "OK"
    return answer

View file

View file

@ -0,0 +1,6 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.core.tests.conftest import * # noqa

View file

@ -0,0 +1,4 @@
{
"ok": true,
"data": []
}

View file

@ -0,0 +1,182 @@
{
"ok": true,
"data": [
{
"id": 5017,
"owner": {
"id": 1609,
"login": "JonasFranzDEV",
"full_name": "",
"email": "info@jonasfranz.software",
"avatar_url": "https://try.gitea.io/user/avatar/JonasFranzDEV/-1",
"language": "de-DE",
"is_admin": false,
"last_login": "2019-10-19T10:58:29Z",
"created": "2017-06-25T17:43:19Z",
"username": "JonasFranzDEV"
},
"name": "drone-gitea-release",
"full_name": "JonasFranzDEV/drone-gitea-release",
"description": "",
"empty": false,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 380,
"html_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release",
"ssh_url": "git@try.gitea.io:JonasFranzDEV/drone-gitea-release.git",
"clone_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 1,
"open_pr_counter": 0,
"release_counter": 2,
"default_branch": "master",
"archived": false,
"created_at": "2018-03-30T19:34:44Z",
"updated_at": "2018-05-29T20:09:40Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
},
{
"id": 5018,
"owner": {
"id": 4495,
"login": "nick.korsakov",
"full_name": "",
"email": "nick@korsakov.email",
"avatar_url": "https://try.gitea.io/user/avatar/nick.korsakov/-1",
"language": "ru-RU",
"is_admin": false,
"last_login": "2020-02-15T10:29:10Z",
"created": "2018-03-31T15:00:07Z",
"username": "nick.korsakov"
},
"name": "one",
"full_name": "nick.korsakov/one",
"description": "",
"empty": true,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 0,
"html_url": "https://try.gitea.io/nick.korsakov/one",
"ssh_url": "git@try.gitea.io:nick.korsakov/one.git",
"clone_url": "https://try.gitea.io/nick.korsakov/one.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"open_pr_counter": 0,
"release_counter": 0,
"default_branch": "master",
"archived": false,
"created_at": "2018-03-31T15:00:33Z",
"updated_at": "2018-03-31T15:00:33Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
},
{
"id": 5030,
"owner": {
"id": 1623,
"login": "xingshijun",
"full_name": "",
"email": "934302794@qq.com",
"avatar_url": "https://try.gitea.io/user/avatar/xingshijun/-1",
"language": "zh-CN",
"is_admin": false,
"last_login": "2019-06-15T12:28:43Z",
"created": "2017-06-28T02:19:23Z",
"username": "xingshijun"
},
"name": "lfzl",
"full_name": "xingshijun/lfzl",
"description": "",
"empty": false,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 10990,
"html_url": "https://try.gitea.io/xingshijun/lfzl",
"ssh_url": "git@try.gitea.io:xingshijun/lfzl.git",
"clone_url": "https://try.gitea.io/xingshijun/lfzl.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"open_pr_counter": 0,
"release_counter": 0,
"default_branch": "master",
"archived": false,
"created_at": "2018-04-02T08:34:08Z",
"updated_at": "2019-11-21T10:23:36Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
}
]
}

View file

@ -0,0 +1,182 @@
{
"ok": true,
"data": [
{
"id": 5017,
"owner": {
"id": 1609,
"login": "JonasFranzDEV",
"full_name": "",
"email": "info@jonasfranz.software",
"avatar_url": "https://try.gitea.io/user/avatar/JonasFranzDEV/-1",
"language": "de-DE",
"is_admin": false,
"last_login": "2019-10-19T10:58:29Z",
"created": "2017-06-25T17:43:19Z",
"username": "JonasFranzDEV"
},
"name": "drone-gitea-release",
"full_name": "JonasFranzDEV/drone-gitea-release",
"description": "",
"empty": false,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 380,
"html_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release",
"ssh_url": "git@try.gitea.io:JonasFranzDEV/drone-gitea-release.git",
"clone_url": "https://try.gitea.io/JonasFranzDEV/drone-gitea-release.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 1,
"open_pr_counter": 0,
"release_counter": 2,
"default_branch": "master",
"archived": false,
"created_at": "2018-03-30T19:34:44Z",
"updated_at": "2018-05-29T20:09:40Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
},
{
"id": 5018,
"owner": {
"id": 4495,
"login": "nick.korsakov",
"full_name": "",
"email": "nick@korsakov.email",
"avatar_url": "https://try.gitea.io/user/avatar/nick.korsakov/-1",
"language": "ru-RU",
"is_admin": false,
"last_login": "2020-02-15T10:29:10Z",
"created": "2018-03-31T15:00:07Z",
"username": "nick.korsakov"
},
"name": "one",
"full_name": "nick.korsakov/one",
"description": "",
"empty": true,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 0,
"html_url": "https://try.gitea.io/nick.korsakov/one",
"ssh_url": "git@try.gitea.io:nick.korsakov/one.git",
"clone_url": "https://try.gitea.io/nick.korsakov/one.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"open_pr_counter": 0,
"release_counter": 0,
"default_branch": "master",
"archived": false,
"created_at": "2018-03-31T15:00:33Z",
"updated_at": "2018-03-31T15:00:33Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
},
{
"id": 5030,
"owner": {
"id": 1623,
"login": "xingshijun",
"full_name": "",
"email": "934302794@qq.com",
"avatar_url": "https://try.gitea.io/user/avatar/xingshijun/-1",
"language": "zh-CN",
"is_admin": false,
"last_login": "2019-06-15T12:28:43Z",
"created": "2017-06-28T02:19:23Z",
"username": "xingshijun"
},
"name": "lfzl",
"full_name": "xingshijun/lfzl",
"description": "",
"empty": false,
"private": false,
"fork": false,
"template": false,
"parent": null,
"mirror": false,
"size": 10990,
"html_url": "https://try.gitea.io/xingshijun/lfzl",
"ssh_url": "git@try.gitea.io:xingshijun/lfzl.git",
"clone_url": "https://try.gitea.io/xingshijun/lfzl.git",
"original_url": "",
"website": "",
"stars_count": 0,
"forks_count": 0,
"watchers_count": 1,
"open_issues_count": 0,
"open_pr_counter": 0,
"release_counter": 0,
"default_branch": "master",
"archived": false,
"created_at": "2018-04-02T08:34:08Z",
"updated_at": "2019-11-21T10:23:36Z",
"permissions": {
"admin": false,
"push": false,
"pull": true
},
"has_issues": true,
"internal_tracker": {
"enable_time_tracker": true,
"allow_only_contributors_to_track_time": true,
"enable_issue_dependencies": true
},
"has_wiki": true,
"has_pull_requests": true,
"ignore_whitespace_conflicts": false,
"allow_merge_commits": false,
"allow_rebase": false,
"allow_rebase_explicit": true,
"allow_squash_merge": false,
"avatar_url": ""
}
]
}

View file

@ -0,0 +1,60 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import re
import unittest
from swh.lister.core.tests.test_lister import HttpListerTesterBase
from swh.lister.gitea.lister import GiteaLister
# Module-level logger, named after this test module.
logger = logging.getLogger(__name__)
class GiteaListerTester(HttpListerTesterBase, unittest.TestCase):
    """Run the generic HTTP lister test suite against ``GiteaLister``.

    The class attributes below configure the shared test base; only the
    response headers are customized here.
    """

    Lister = GiteaLister
    # NOTE(review): GiteaLister requests "repos/search?page=..." urls, which
    # hold no "/projects" segment - confirm this pattern is the intended one.
    test_re = re.compile(r"^.*/projects.*page=(\d+).*")
    lister_subdir = "gitea"
    good_api_response_file = "data/https_try.gitea.io/api_response.json"
    bad_api_response_file = "data/https_try.gitea.io/api_empty_response.json"
    first_index = 1
    last_index = 2
    entries_per_page = 3
    convert_type = int

    def response_headers(self, request):
        """Build the headers of the mocked response to ``request``.

        Only a request for the first index gets a ``Link`` header, whose
        ``next`` relation points at the last index; any other request gets no
        header at all.
        """
        if self.request_index(request) != self.first_index:
            return {}
        next_link = (
            "<https://try.gitea.io/api/v1"
            "/repos/search?&page=%s&sort=id>;"
            ' rel="next"' % self.last_index
        )
        return {"Link": next_link}
def test_lister_gitea(swh_listers, requests_mock_datadir):
    """Running the lister schedules three recurring ``load-git`` tasks."""
    lister: GiteaLister = swh_listers["gitea"]
    lister.run()

    scheduled = lister.scheduler.search_tasks(task_type="load-git")
    assert len(scheduled) == 3

    for task in scheduled:
        assert task["type"] == "load-git"

        # the loader gets no positional argument...
        positional = task["arguments"]["args"]
        assert len(positional) == 0

        # ...only keyword ones, among which the origin url
        named = task["arguments"]["kwargs"]
        origin_url = named["url"]
        assert origin_url.startswith("https://try.gitea.io")

        assert task["policy"] == "recurring"
        assert task["priority"] is None

View file

@ -0,0 +1,141 @@
from time import sleep
from celery.result import GroupResult
from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
    """The ping task completes successfully and answers ``"OK"``."""
    pending = swh_app.send_task("swh.lister.gitea.tasks.ping")
    assert pending
    pending.wait()
    assert pending.successful()
    assert pending.result == "OK"
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_incremental(lister, swh_app, celery_session_worker):
    """The incremental task runs one lister from page 1 to the last page."""
    # setup the mocked GiteaLister: instantiating it hands back the mock itself
    lister.return_value = lister
    lister.run.return_value = None
    lister.get_pages_information.return_value = (None, 10, None)

    pending = swh_app.send_task("swh.lister.gitea.tasks.IncrementalGiteaLister")
    assert pending
    pending.wait()
    assert pending.successful()

    # NOTE(review): GiteaLister is mocked, so this only pins the keyword name
    # forwarded by the task; the real GiteaLister.__init__ names its ordering
    # parameter "order" - keep the two in sync.
    lister.assert_called_once_with(sort="desc")
    lister.db_last_index.assert_not_called()
    lister.get_pages_information.assert_called_once_with()
    lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True)
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_range(lister, swh_app, celery_session_worker):
    """The range task forwards ``start``/``end`` as the lister run bounds."""
    # setup the mocked GiteaLister: instantiating it hands back the mock itself
    lister.return_value = lister
    lister.run.return_value = None

    bounds = {"start": 12, "end": 42}
    pending = swh_app.send_task(
        "swh.lister.gitea.tasks.RangeGiteaLister", kwargs=bounds
    )
    assert pending
    pending.wait()
    assert pending.successful()

    lister.assert_called_once_with()
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_relister(lister, swh_app, celery_session_worker):
    """The full relister splits 85 pages into nine range sub-tasks."""
    # setup the mocked GiteaLister: instantiating it hands back the mock itself
    lister.return_value = lister
    lister.run.return_value = None
    lister.get_pages_information.return_value = (None, 85, None)
    partitions = [(start, start + 9) for start in range(0, 80, 10)]
    partitions.append((80, 85))
    lister.db_partition_indices.return_value = partitions

    pending = swh_app.send_task("swh.lister.gitea.tasks.FullGiteaRelister")
    assert pending
    pending.wait()
    assert pending.successful()

    # retrieve the GroupResult for this task and wait for all the subtasks
    # to complete
    promise_id = pending.result
    assert promise_id
    promise = GroupResult.restore(promise_id, app=swh_app)
    # poll at most five times, one second apart
    waited = 0
    while waited < 5 and not promise.ready():
        sleep(1)
        waited += 1

    lister.assert_called_with()

    # one by the FullGiteaRelister task
    # + 9 for the RangeGiteaLister subtasks
    assert lister.call_count == 10

    lister.db_last_index.assert_not_called()
    lister.db_partition_indices.assert_not_called()
    lister.get_pages_information.assert_called_once_with()

    # lister.run should have been called once per partition interval
    recorded_runs = lister.run.call_args_list
    for lower in range(0, 80, 10):
        # XXX inconsistent behavior: max_bound is EXCLUDED here
        assert (dict(min_bound=lower, max_bound=lower + 10),) in recorded_runs
    assert (dict(min_bound=80, max_bound=85),) in recorded_runs
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_relister_instance(lister, swh_app, celery_session_worker):
    """The full relister forwards a custom ``url`` to every lister it builds."""
    # setup the mocked GiteaLister: instantiating it hands back the mock itself
    lister.return_value = lister
    lister.run.return_value = None
    lister.get_pages_information.return_value = (None, 85, None)
    partitions = [(start, start + 9) for start in range(0, 80, 10)]
    partitions.append((80, 85))
    lister.db_partition_indices.return_value = partitions

    pending = swh_app.send_task(
        "swh.lister.gitea.tasks.FullGiteaRelister",
        kwargs={"url": "https://0xacab.org/api/v4"},
    )
    assert pending
    pending.wait()
    assert pending.successful()

    # retrieve the GroupResult for this task and wait for all the subtasks
    # to complete
    promise_id = pending.result
    assert promise_id
    promise = GroupResult.restore(promise_id, app=swh_app)
    # poll at most five times, one second apart
    waited = 0
    while waited < 5 and not promise.ready():
        sleep(1)
        waited += 1

    lister.assert_called_with(url="https://0xacab.org/api/v4")

    # one by the FullGiteaRelister task
    # + 9 for the RangeGiteaLister subtasks
    assert lister.call_count == 10

    lister.db_last_index.assert_not_called()
    lister.db_partition_indices.assert_not_called()
    lister.get_pages_information.assert_called_once_with()

    # lister.run should have been called once per partition interval
    recorded_runs = lister.run.call_args_list
    for lower in range(0, 80, 10):
        # XXX inconsistent behavior: max_bound is EXCLUDED here
        assert (dict(min_bound=lower, max_bound=lower + 10),) in recorded_runs
    assert (dict(min_bound=80, max_bound=85),) in recorded_runs