Add gitiles lister
Gitiles instances deliberately return a malformed json output (the json is prefixed with ``)]}'\n``) [2]. The lister deals with it so that it can properly parse the json response nonetheless: it drops the prefix and then parses the json. If at some point Gitiles drops this prefix and returns json directly, the lister will be able to deal with that too. There are 2 tests, one with the 'standard' gitiles format and another with standard json, to account for both cases. Refs. swh/meta#5045 [2] https://github.com/google/gitiles/issues/263
This commit is contained in:
parent
573958ce64
commit
3ab856288c
12 changed files with 307 additions and 0 deletions
1
setup.py
1
setup.py
|
@ -68,6 +68,7 @@ setup(
|
|||
lister.fedora=swh.lister.fedora:register
|
||||
lister.gitea=swh.lister.gitea:register
|
||||
lister.github=swh.lister.github:register
|
||||
lister.gitiles=swh.lister.gitiles:register
|
||||
lister.gitlab=swh.lister.gitlab:register
|
||||
lister.gitweb=swh.lister.gitweb:register
|
||||
lister.gnu=swh.lister.gnu:register
|
||||
|
|
12
swh/lister/gitiles/__init__.py
Normal file
12
swh/lister/gitiles/__init__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Copyright (C) 2023 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
def register():
    """Describe the gitiles lister plugin.

    Returns:
        A dict holding the lister class under ``"lister"`` and the list of
        celery task module names under ``"task_modules"``.

    """
    # Imported here rather than at module level, as in the original code.
    from .lister import GitilesLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": GitilesLister, "task_modules": [tasks_module]}
|
82
swh/lister/gitiles/lister.py
Normal file
82
swh/lister/gitiles/lister.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
# Copyright (C) 2023 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from json import loads
|
||||
import logging
|
||||
from typing import Iterator, Optional
|
||||
|
||||
from swh.lister.pattern import CredentialsType, StatelessLister
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Origin = str
|
||||
|
||||
|
||||
class GitilesLister(StatelessLister[Origin]):
    """Lister for the git repositories published by a Gitiles instance.

    The repositories are discovered by fetching and parsing the json document
    served at `<url>?format=json`.

    """

    LISTER_NAME = "gitiles"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Initialize the Gitiles lister.

        All arguments are forwarded unchanged to the parent lister constructor.

        Args:
            url: (Optional) Root URL of the Gitiles instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of gitiles instance. Defaults to url's network location
                if unset.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # Ask the server for a json answer on every request of this session.
        self.session.headers.update({"Accept": "application/json"})

    def get_pages(self) -> Iterator[Origin]:
        """Yield the clone url of each repository listed by the Gitiles server."""
        # current gitiles' json is returned with a specific prefix
        # See. https://github.com/google/gitiles/issues/263
        prefix = ")]}'\n"

        payload = self.http_request(f"{self.url}?format=json").text
        if payload.startswith(prefix):
            # Drop the prefix so that the remainder can be parsed as json.
            payload = payload[len(prefix) :]

        repositories = loads(payload)
        for repository in repositories.values():
            yield repository["clone_url"]

    def get_origins_from_page(self, origin: Origin) -> Iterator[ListedOrigin]:
        """Yield the single git ListedOrigin matching the given clone url."""
        lister_id = self.lister_obj.id
        assert lister_id is not None

        yield ListedOrigin(lister_id=lister_id, url=origin, visit_type="git")
|
16
swh/lister/gitiles/tasks.py
Normal file
16
swh/lister/gitiles/tasks.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# Copyright (C) 2023 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from .lister import GitilesLister
|
||||
|
||||
|
||||
@shared_task(name=f"{__name__}.GitilesListerTask")
def list_gitiles(**lister_args) -> Dict[str, str]:
    """Celery task listing a Gitiles instance.

    Args:
        **lister_args: keyword arguments forwarded to
            ``GitilesLister.from_configfile``.

    Returns:
        The result of calling ``.dict()`` on what the lister's ``run()`` returned.

    """
    stats = GitilesLister.from_configfile(**lister_args).run()
    return stats.dict()
|
0
swh/lister/gitiles/tests/__init__.py
Normal file
0
swh/lister/gitiles/tests/__init__.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
)]}'
|
||||
{
|
||||
"accessories/manifest": {
|
||||
"name": "accessories/manifest",
|
||||
"clone_url": "https://android.googlesource.com/accessories/manifest"
|
||||
},
|
||||
"device/google/gs201": {
|
||||
"name": "device/google/gs201",
|
||||
"clone_url": "https://android.googlesource.com/device/google/gs201",
|
||||
"description": "Bug: 244231765"
|
||||
},
|
||||
"device/google/sunfish-kernel": {
|
||||
"name": "device/google/sunfish-kernel",
|
||||
"clone_url": "https://android.googlesource.com/device/google/sunfish-kernel",
|
||||
"description": "Bug: 160260413"
|
||||
},
|
||||
"device/lge/mako-kernel": {
|
||||
"name": "device/lge/mako-kernel",
|
||||
"clone_url": "https://android.googlesource.com/device/lge/mako-kernel"
|
||||
},
|
||||
"kernel/msm-extra/camera-devicetree": {
|
||||
"name": "kernel/msm-extra/camera-devicetree",
|
||||
"clone_url": "https://android.googlesource.com/kernel/msm-extra/camera-devicetree",
|
||||
"description": "Bug: 167236823"
|
||||
},
|
||||
"platform/external/google-fonts/rubik": {
|
||||
"name": "platform/external/google-fonts/rubik",
|
||||
"clone_url": "https://android.googlesource.com/platform/external/google-fonts/rubik",
|
||||
"description": "Bug: 122303069"
|
||||
},
|
||||
"trusty/vendor/google/aosp": {
|
||||
"name": "trusty/vendor/google/aosp",
|
||||
"clone_url": "https://android.googlesource.com/trusty/vendor/google/aosp"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
These files are a partial dump of https://android.googlesource.com/?format=json.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"apps/analytics-etl": {
|
||||
"name": "apps/analytics-etl",
|
||||
"clone_url": "https://gerrit.googlesource.com/apps/analytics-etl",
|
||||
"description": "Spark ETL to extra analytics data from Gerrit Projects using the Analytics plugin"
|
||||
},
|
||||
"apps/kibana-dashboard": {
|
||||
"name": "apps/kibana-dashboard",
|
||||
"clone_url": "https://gerrit.googlesource.com/apps/kibana-dashboard"
|
||||
},
|
||||
"apps/reviewit": {
|
||||
"name": "apps/reviewit",
|
||||
"clone_url": "https://gerrit.googlesource.com/apps/reviewit",
|
||||
"description": "The \u0027Review It!?\u0027 app is an Android application for Gerrit that allows sorting of incoming changes and review of small/trivial changes.\n\nThis is not an official Google product."
|
||||
}
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
These files are a partial dump of https://gerrit.googlesource.com/?format=json.
|
||||
|
110
swh/lister/gitiles/tests/test_lister.py
Normal file
110
swh/lister/gitiles/tests/test_lister.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
# Copyright (C) 2023 The Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from swh.lister import __version__
|
||||
from swh.lister.gitiles.lister import GitilesLister
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
MAIN_INSTANCE = "android.googlesource.com"
|
||||
MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}"
|
||||
|
||||
|
||||
def test_lister_gitiles_instantiate(swh_scheduler):
    """Building a lister from either its url or its instance name is supported."""
    url = MAIN_INSTANCE_URL

    # Built from the url: the url is kept as provided.
    lister = GitilesLister(swh_scheduler, url=url)
    assert lister is not None
    assert lister.url == url

    # Built from the instance name only. The assertions below must target this
    # second lister (the original code re-checked the first one, so the
    # instance-based construction was never actually verified).
    lister_from_instance = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)
    assert lister_from_instance is not None
    # The url is expected to default to https://{instance} when unset.
    assert lister_from_instance.url == url
|
||||
|
||||
|
||||
def test_lister_gitiles_fail_to_instantiate(swh_scheduler):
    """Building a lister with neither an url nor an instance should raise."""
    # Without any of those, a ValueError mentioning both parameters is expected
    expect_value_error = pytest.raises(ValueError, match="'url' or 'instance'")
    with expect_value_error:
        GitilesLister(swh_scheduler)
|
||||
|
||||
|
||||
def test_lister_gitiles_get_pages(requests_mock_datadir, swh_scheduler):
    """Check the number of pages scraped during a listing and their urls."""
    lister_gitiles = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)

    repos: List[str] = list(lister_gitiles.get_pages())

    # The mocked instance index lists 7 repositories
    assert len(repos) == 7

    # Every listed clone url lives under the instance url
    assert all(listed_url.startswith(MAIN_INSTANCE_URL) for listed_url in repos)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url,expected_nb_origins",
    [(MAIN_INSTANCE_URL, 7), ("https://gerrit.googlesource.com", 3)],
)
def test_lister_gitiles_run(
    requests_mock_datadir, swh_scheduler, url, expected_nb_origins
):
    """Nominal listing of a Gitiles instance (with and without the json prefix)."""
    lister_gitiles = GitilesLister(swh_scheduler, url=url)

    stats = lister_gitiles.run()

    # One page is produced per repository, each holding exactly one origin
    expected_stats = ListerStats(
        pages=expected_nb_origins, origins=expected_nb_origins
    )
    assert stats == expected_stats

    # Check what got recorded in the scheduler
    lister_id = lister_gitiles.lister_obj.id
    scheduler_origins = swh_scheduler.get_listed_origins(lister_id).results
    assert len(scheduler_origins) == expected_nb_origins

    assert url.startswith("https://")

    # Check each listed repository
    for listed_origin in scheduler_origins:
        origin_url = listed_origin.url
        assert listed_origin.visit_type == "git"
        assert origin_url.startswith(url)
        assert origin_url.startswith("https://")
        assert listed_origin.last_update is None

    # Check the user agent sent along every request
    for request in requests_mock_datadir.request_history:
        headers = request.headers
        assert "User-Agent" in headers
        user_agent = headers["User-Agent"]
        assert "Software Heritage gitiles lister" in user_agent
        assert __version__ in user_agent
|
||||
|
||||
|
||||
def test_lister_gitiles_get_pages_with_pages_and_retry(
    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
    """Rate-limited requests are retried so the listing can eventually proceed."""
    url = MAIN_INSTANCE_URL

    page_path = os.path.join(datadir, f"https_{MAIN_INSTANCE}/,format=json")
    with open(page_path, "rb") as page:
        page_content = page.read()

    # Answer twice with a 429 status before finally serving the index page
    responses = [
        {"content": None, "status_code": 429},
        {"content": None, "status_code": 429},
        {"content": page_content, "status_code": 200},
    ]
    requests_mock.get(url, responses)

    lister_gitiles = GitilesLister(swh_scheduler, url=url)

    # Patch the retry sleep so the test does not actually wait between attempts
    mocker.patch.object(lister_gitiles.http_request.retry, "sleep")

    pages: List[str] = list(lister_gitiles.get_pages())
    assert len(pages) == 7
|
28
swh/lister/gitiles/tests/test_tasks.py
Normal file
28
swh/lister/gitiles/tests/test_tasks.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Copyright (C) 2023 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_gitweb_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Check the Gitiles celery task builds the lister from its config and runs it.

    NOTE(review): the function name still mentions "gitweb" (apparent copy-paste
    leftover). It is kept unchanged here so selecting the test by name keeps
    working; consider renaming it to ``test_gitiles_lister_task``.

    """
    # setup the mocked GitilesLister (the original code patched the gitweb
    # lister, so the gitiles task was never exercised by this test)
    lister = mocker.patch("swh.lister.gitiles.tasks.GitilesLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(url="https://android.googlesource.com", max_pages=1)

    # Trigger the gitiles task through its registered celery name
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.gitiles.tasks.GitilesListerTask",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
|
|
@ -45,6 +45,9 @@ lister_args = {
|
|||
"gitweb": {
|
||||
"url": "https://git.distorted.org.uk/~mdw/",
|
||||
},
|
||||
"gitiles": {
|
||||
"instance": "gerrit.googlesource.com",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue