cgit: Compute origin urls out of a base git url when provided.
This adds a second behavior to the cgit lister to actually compute origin urls instead of parsing them out of another http request on git detailed page. This new behavior is expected to be the default behavior. The old behavior is kept for now and is expected to be used as fallback if too much false negatives are returned. Related to T2999
This commit is contained in:
parent
4cf0c7f765
commit
2e22073558
7 changed files with 191 additions and 15 deletions
|
@ -28,12 +28,19 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
This lister will retrieve the list of published git repositories by
|
||||
parsing the HTML page(s) of the index retrieved at `url`.
|
||||
|
||||
For each found git repository, a query is made at the given url found
|
||||
in this index to gather published "Clone" URLs to be used as origin
|
||||
URL for that git repo.
|
||||
The lister currently defines 2 listing behaviors:
|
||||
|
||||
- If the `base_git_url` is provided, the listed origin urls are computed out of the
|
||||
base git url link and the one listed in the main listed page (resulting in less
|
||||
HTTP queries than the 2nd behavior below). This is expected to be the main
|
||||
deployed behavior.
|
||||
|
||||
- Otherwise (with no `base_git_url`), for each found git repository listed, one
|
||||
extra HTTP query is made at the given url found in the main listing page to gather
|
||||
published "Clone" URLs to be used as origin URL for that git repo. If several
|
||||
"Clone" urls are provided, prefer the http/https one, if any, otherwise fallback
|
||||
to the first one.
|
||||
|
||||
If several "Clone" urls are provided, prefer the http/https one, if
|
||||
any, otherwise fallback to the first one.
|
||||
"""
|
||||
|
||||
LISTER_NAME = "cgit"
|
||||
|
@ -44,14 +51,17 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
url: str,
|
||||
instance: Optional[str] = None,
|
||||
credentials: Optional[CredentialsType] = None,
|
||||
base_git_url: Optional[str] = None,
|
||||
):
|
||||
"""Lister class for CGit repositories.
|
||||
|
||||
Args:
|
||||
url (str): main URL of the CGit instance, i.e. url of the index
|
||||
url: main URL of the CGit instance, i.e. url of the index
|
||||
of published git repositories on this instance.
|
||||
instance (str): Name of cgit instance. Defaults to url's hostname
|
||||
instance: Name of cgit instance. Defaults to url's hostname
|
||||
if unset.
|
||||
base_git_url: Optional base git url which allows the origin url
|
||||
computations.
|
||||
|
||||
"""
|
||||
if not instance:
|
||||
|
@ -66,6 +76,7 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
self.session.headers.update(
|
||||
{"Accept": "application/html", "User-Agent": USER_AGENT}
|
||||
)
|
||||
self.base_git_url = base_git_url
|
||||
|
||||
def _get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
|
||||
|
@ -87,7 +98,19 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
for tr in bs_idx.find("div", {"class": "content"}).find_all(
|
||||
"tr", {"class": ""}
|
||||
):
|
||||
url = urljoin(self.url, tr.find("a")["href"])
|
||||
repository_link = tr.find("a")["href"]
|
||||
repo_url = None
|
||||
git_url = None
|
||||
|
||||
base_url = urljoin(self.url, repository_link)
|
||||
if self.base_git_url: # mapping provided
|
||||
# computing git url
|
||||
git_url = base_url.replace(self.url, self.base_git_url)
|
||||
else:
|
||||
# we compute the git detailed page url from which we will retrieve
|
||||
# the git url (cf. self.get_origins_from_page)
|
||||
repo_url = base_url
|
||||
|
||||
span = tr.find("span", {"class": re.compile("age-")})
|
||||
if span:
|
||||
last_updated_date = span["title"]
|
||||
|
@ -95,7 +118,11 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
last_updated_date = None
|
||||
|
||||
page_results.append(
|
||||
{"url": url, "last_updated_date": last_updated_date}
|
||||
{
|
||||
"url": repo_url,
|
||||
"git_url": git_url,
|
||||
"last_updated_date": last_updated_date,
|
||||
}
|
||||
)
|
||||
|
||||
yield page_results
|
||||
|
@ -117,8 +144,10 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
"""Convert a page of cgit repositories into a list of ListedOrigins."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for repository in repositories:
|
||||
origin_url = self._get_origin_from_repository_url(repository["url"])
|
||||
for repo in repositories:
|
||||
origin_url = repo["git_url"] or self._get_origin_from_repository_url(
|
||||
repo["url"]
|
||||
)
|
||||
if origin_url is None:
|
||||
continue
|
||||
|
||||
|
@ -126,7 +155,7 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
lister_id=self.lister_obj.id,
|
||||
url=origin_url,
|
||||
visit_type="git",
|
||||
last_update=_parse_last_updated_date(repository),
|
||||
last_update=_parse_last_updated_date(repo),
|
||||
)
|
||||
|
||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
|
||||
|
|
|
@ -10,9 +10,13 @@ from .lister import CGitLister
|
|||
|
||||
|
||||
@shared_task(name=__name__ + ".CGitListerTask")
|
||||
def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]:
|
||||
def list_cgit(
|
||||
url: str, instance: Optional[str] = None, base_git_url: Optional[str] = None
|
||||
) -> Dict[str, str]:
|
||||
"""Lister task for CGit instances"""
|
||||
lister = CGitLister.from_configfile(url=url, instance=instance)
|
||||
lister = CGitLister.from_configfile(
|
||||
url=url, instance=instance, base_git_url=base_git_url
|
||||
)
|
||||
return lister.run().dict()
|
||||
|
||||
|
||||
|
|
33
swh/lister/cgit/tests/data/https_git.baserock.org/cgit
Normal file
33
swh/lister/cgit/tests/data/https_git.baserock.org/cgit
Normal file
|
@ -0,0 +1,33 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang='en'>
|
||||
<head>
|
||||
<title>Lorry Depot</title>
|
||||
<meta name='generator' content='cgit v1.2.1'/>
|
||||
<meta name='robots' content='index, nofollow'/>
|
||||
<link rel='stylesheet' type='text/css' href='/cgit-css/cgit.css'/>
|
||||
<link rel='shortcut icon' href='/favicon.ico'/>
|
||||
</head>
|
||||
<body>
|
||||
<div id='cgit'><table id='header'>
|
||||
<tr>
|
||||
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit-css/cgit.png' alt='cgit logo'/></a></td>
|
||||
<td class='main'>Lorry Depot</td></tr>
|
||||
<tr><td class='sub'>Mirror Repositories</td></tr></table>
|
||||
<table class='tabs'><tr><td>
|
||||
<a class='active' href='/cgit/'>index</a></td><td class='form'><form method='get' action='/cgit/'>
|
||||
<input type='search' name='q' size='10' value=''/>
|
||||
<input type='submit' value='search'/>
|
||||
</form></td></tr></table>
|
||||
<div class='content'>
|
||||
<table summary='repository list' class='list nowrap'><tr class='nohover'><th class='left'><a href='/cgit/?s=name'>Name</a></th><th class='left'><a href='/cgit/?s=desc'>Description</a></th><th class='left'><a href='/cgit/?s=idle'>Idle</a></th><th class='left'>Links</th></tr>
|
||||
<tr><td class='toplevel-repo'><a title='baserock/baserock/baserock-chroot.git' href='/cgit/baserock/baserock/baserock-chroot.git/'>baserock/baserock/baserock-chroot.git</a></td><td><a href='/cgit/baserock/baserock/baserock-chroot.git/'>Tools for working with Baserock chroots
|
||||
</a></td><td><span class='age-years' title='2015-05-20 09:59:01 +0000'>6 years</span></td><td><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/tree/'>tree</a></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='baserock/baserock/bsp-support.git' href='/cgit/baserock/baserock/bsp-support.git/'>baserock/baserock/bsp-support.git</a></td><td><a href='/cgit/baserock/baserock/bsp-support.git/'>Programs and configuration needed to support specific devices
|
||||
</a></td><td><span class='age-years' title='2015-10-25 19:53:10 +0000'>5 years</span></td><td><a class='button' href='/cgit/baserock/baserock/bsp-support.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/bsp-support.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/bsp-support.git/tree/'>tree</a></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='baserock/baserock/definitions.git' href='/cgit/baserock/baserock/definitions.git/'>baserock/baserock/definitions.git</a></td><td><a href='/cgit/baserock/baserock/definitions.git/'>gitlab.com: baserock/definitions.git
|
||||
</a></td><td><span class='age-months' title='2019-05-15 17:58:47 +0000'>21 months</span></td><td><a class='button' href='/cgit/baserock/baserock/definitions.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/definitions.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/definitions.git/tree/'>tree</a></td></tr>
|
||||
</table>
|
||||
</div> <!-- class=content -->
|
||||
</div> <!-- id=cgit -->
|
||||
</body>
|
||||
</html>
|
42
swh/lister/cgit/tests/data/https_git.eclipse.org/c
Normal file
42
swh/lister/cgit/tests/data/https_git.eclipse.org/c
Normal file
|
@ -0,0 +1,42 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang='en'>
|
||||
<head>
|
||||
<title>Eclipse Git repositories</title>
|
||||
<meta name='generator' content='cgit v1.2.1'/>
|
||||
<meta name='robots' content='index, nofollow'/>
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta name="keywords" content="eclipse.org, Eclipse Foundation, Eclipse, Git, Source code, Source" />
|
||||
</head>
|
||||
<body>
|
||||
<!-- /#breadcrumb -->
|
||||
<main>
|
||||
<div class="container-fluid legacy-page" id="novaContent">
|
||||
<div class="col-md-24"><div id='cgit'><table id='header'>
|
||||
<tr>
|
||||
<td class='logo' rowspan='2'><a href='/c/'><img src='/git.png' alt='cgit logo'/></a></td>
|
||||
<td class='main'>Eclipse Git repositories</td></tr>
|
||||
<tr><td class='sub'>To use Git in Eclipse, check out the EGit project.</td></tr></table>
|
||||
<table class='tabs'><tr><td>
|
||||
<a class='active' href='/c/'>index</a><a href='/c/?p=about'>about</a></td><td class='form'><form method='get' action='/c/'>
|
||||
<input type='search' name='q' size='10' value=''/>
|
||||
<input type='submit' value='search'/>
|
||||
</form></td></tr></table>
|
||||
<div class='content'><table summary='repository list' class='list nowrap'><tr class='nohover'><th class='left'><a href='/c/?s=name'>Name</a></th><th class='left'><a href='/c/?s=desc'>Description</a></th><th class='left'><a href='/c/?s=idle'>Idle</a></th></tr>
|
||||
<tr class='nohover-highlight'><td colspan='3' class='reposection'>3p</td></tr>
|
||||
<tr><td class='sublevel-repo'><a title='3p.git' href='/c/3p/3p.git/'>3p.git</a></td><td><a href='/c/3p/3p.git/'>3P - PolarSys Packaging</a></td><td><span class='age-months' title='2020-01-17 23:52:00 +0000'>12 months</span></td></tr>
|
||||
<tr><td class='sublevel-repo'><a title='qvtd.git' href='/c/www.eclipse.org/qvtd.git/'>qvtd.git</a></td><td><a href='/c/www.eclipse.org/qvtd.git/'>www.eclipse.org project repository</a></td><td><span class='age-years' title='2018-07-20 23:51:54 +0000'>3 years</span></td></tr>
|
||||
<tr><td class='sublevel-repo'><a title='r4e.git' href='/c/www.eclipse.org/r4e.git/'>r4e.git</a></td><td><a href='/c/www.eclipse.org/r4e.git/'>www.eclipse.org project repository</a></td><td><span class='age-years' title='2018-11-17 00:57:51 +0000'>2 years</span></td></tr>
|
||||
<tr><td class='sublevel-repo'><a title='rap.git' href='/c/www.eclipse.org/rap.git/'>rap.git</a></td><td><a href='/c/www.eclipse.org/rap.git/'>RAP project website</a></td><td><span class='age-weeks' title='2020-12-20 01:16:37 +0000'>6 weeks</span></td></tr>
|
||||
<tr class='nohover-highlight'><td colspan='3' class='reposection'>zenoh</td></tr>
|
||||
<tr><td class='sublevel-repo'><a title='zenoh.git' href='/c/zenoh/zenoh.git/'>zenoh.git</a></td><td><a href='/c/zenoh/zenoh.git/'>Unnamed repository; edit this file 'description' to name the repository.</a></td><td><span class='age-months' title='2020-01-25 05:07:25 +0000'>12 months</span></td></tr>
|
||||
</table></div> <!-- class=content -->
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
<p id="back-to-top">
|
||||
<a class="visible-xs" href="#top">Back to the top</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
33
swh/lister/cgit/tests/data/https_jff.email/cgit
Normal file
33
swh/lister/cgit/tests/data/https_jff.email/cgit
Normal file
|
@ -0,0 +1,33 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang='en'>
|
||||
<head>
|
||||
<title>Jörgs Debian Repository</title>
|
||||
<meta name='generator' content='cgit v1.1'/>
|
||||
<meta name='robots' content='index, nofollow'/>
|
||||
<link rel='stylesheet' type='text/css' href='/cgit-css/cgit.css'/>
|
||||
<link rel='shortcut icon' href='/favicon.ico'/>
|
||||
</head>
|
||||
<body>
|
||||
<div id='cgit'><table id='header'>
|
||||
<tr>
|
||||
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit-css/cgit.png' alt='cgit logo'/></a></td>
|
||||
<td class='main'>Jörgs Debian Repository</td></tr>
|
||||
<tr><td class='sub'>ask debian-cgit@jff.email</td></tr></table>
|
||||
<table class='tabs'><tr><td>
|
||||
<a class='active' href='/cgit/'>index</a></td><td class='form'><form method='get' action='/cgit/'>
|
||||
<input type='text' name='q' size='10' value=''/>
|
||||
<input type='submit' value='search'/>
|
||||
</form></td></tr></table>
|
||||
<div class='content'><table summary='repository list' class='list nowrap'>
|
||||
<tr class='nohover'><th class='left'><a href='/cgit/?s=name'>Name</a></th><th class='left'><a href='/cgit/?s=desc'>Description</a></th><th class='left'><a href='/cgit/?s=owner'>Owner</a></th><th class='left'><a href='/cgit/?s=idle'>Idle</a></th></tr>
|
||||
<tr><td class='toplevel-repo'><a title='gnome-pie.git' href='/cgit/gnome-pie.git/'>gnome-pie.git</a></td><td><a href='/cgit/gnome-pie.git/'>Debian repo for gnome-pie</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-06-01 15:22:58 +0000'>8 months</span></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='gs5.git' href='/cgit/gs5.git/'>gs5.git</a></td><td><a href='/cgit/gs5.git/'>Clone of gs5</a></td><td><a href='/cgit/?q=jff@jff.email'>jff@jff.email</a></td><td><span class='age-years' title='2018-10-01 17:24:55 +0000'>2 years</span></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='libunistring.git' href='/cgit/libunistring.git/'>libunistring.git</a></td><td><a href='/cgit/libunistring.git/'>Debian repo for libunistring</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-06-02 06:00:00 +0000'>8 months</span></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='mailgraph.git' href='/cgit/mailgraph.git/'>mailgraph.git</a></td><td><a href='/cgit/mailgraph.git/'>Debian repo for mailgraph</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-years' title='2018-09-09 19:18:23 +0000'>2 years</span></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='mwc.git' href='/cgit/mwc.git/'>mwc.git</a></td><td><a href='/cgit/mwc.git/'>Debian repo for mwc</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2019-07-14 14:58:18 +0000'>19 months</span></td></tr>
|
||||
<tr><td class='toplevel-repo'><a title='xtrkcad.git' href='/cgit/xtrkcad.git/'>xtrkcad.git</a></td><td><a href='/cgit/xtrkcad.git/'>Debian repo for xtrkcad</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-08-24 19:27:40 +0000'>5 months</span></td></tr>
|
||||
</table></div> <!-- class=content -->
|
||||
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.1</a> at 2021-01-29 14:18:26 +0000</div>
|
||||
</div> <!-- id=cgit -->
|
||||
</body>
|
||||
</html>
|
|
@ -196,3 +196,38 @@ def test_lister_cgit_from_configfile(swh_scheduler_config, mocker):
|
|||
lister = CGitLister.from_configfile()
|
||||
assert lister.scheduler is not None
|
||||
assert lister.credentials is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url,base_git_url,expected_nb_origins",
|
||||
[
|
||||
("https://git.eclipse.org/c", "https://eclipse.org/r", 5),
|
||||
("https://git.baserock.org/cgit/", "https://git.baserock.org/git/", 3),
|
||||
("https://jff.email/cgit/", "git://jff.email/opt/git/", 6),
|
||||
],
|
||||
)
|
||||
def test_lister_cgit_with_base_git_url(
|
||||
url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler
|
||||
):
|
||||
"""With base git url provided, listed urls should be the computed origin urls
|
||||
|
||||
"""
|
||||
lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,)
|
||||
|
||||
stats = lister_cgit.run()
|
||||
|
||||
assert stats == ListerStats(pages=1, origins=expected_nb_origins)
|
||||
|
||||
# test page parsing
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(
|
||||
lister_cgit.lister_obj.id
|
||||
).results
|
||||
assert len(scheduler_origins) == expected_nb_origins
|
||||
|
||||
# test listed repositories
|
||||
for listed_origin in scheduler_origins:
|
||||
assert listed_origin.visit_type == "git"
|
||||
assert listed_origin.url.startswith(base_git_url)
|
||||
assert (
|
||||
listed_origin.url.startswith(url) is False
|
||||
), f"url should be mapped to {base_git_url}"
|
||||
|
|
|
@ -22,7 +22,7 @@ def test_cgit_lister_task(
|
|||
lister.from_configfile.return_value = lister
|
||||
lister.run.return_value = ListerStats(pages=10, origins=500)
|
||||
|
||||
kwargs = dict(url="https://git.kernel.org/", instance="kernel")
|
||||
kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None)
|
||||
|
||||
res = swh_scheduler_celery_app.send_task(
|
||||
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue