cgit: Compute origin urls out of a base git url when provided.

This adds a second behavior to the cgit lister to actually compute origin urls instead
of parsing them out of another http request on git detailed page.

This new behavior is expected to be the default behavior.

The old behavior is kept for now and is expected to be used as fallback if too much
false negatives are returned.

Related to T2999
This commit is contained in:
Antoine R. Dumont (@ardumont) 2021-01-29 11:38:26 +01:00
parent 4cf0c7f765
commit 2e22073558
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
7 changed files with 191 additions and 15 deletions

View file

@ -28,12 +28,19 @@ class CGitLister(StatelessLister[Repositories]):
This lister will retrieve the list of published git repositories by
parsing the HTML page(s) of the index retrieved at `url`.
For each found git repository, a query is made at the given url found
in this index to gather published "Clone" URLs to be used as origin
URL for that git repo.
The lister currently defines 2 listing behaviors:
- If the `base_git_url` is provided, the listed origin urls are computed out of the
base git url link and the one listed in the main listed page (resulting in less
HTTP queries than the 2nd behavior below). This is expected to be the main
deployed behavior.
- Otherwise (with no `base_git_url`), for each found git repository listed, one
extra HTTP query is made at the given url found in the main listing page to gather
published "Clone" URLs to be used as origin URL for that git repo. If several
"Clone" urls are provided, prefer the http/https one, if any, otherwise fallback
to the first one.
If several "Clone" urls are provided, prefer the http/https one, if
any, otherwise fallback to the first one.
"""
LISTER_NAME = "cgit"
@ -44,14 +51,17 @@ class CGitLister(StatelessLister[Repositories]):
url: str,
instance: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
base_git_url: Optional[str] = None,
):
"""Lister class for CGit repositories.
Args:
url (str): main URL of the CGit instance, i.e. url of the index
url: main URL of the CGit instance, i.e. url of the index
of published git repositories on this instance.
instance (str): Name of cgit instance. Defaults to url's hostname
instance: Name of cgit instance. Defaults to url's hostname
if unset.
base_git_url: Optional base git url which allows the origin url
computations.
"""
if not instance:
@ -66,6 +76,7 @@ class CGitLister(StatelessLister[Repositories]):
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
self.base_git_url = base_git_url
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
@ -87,7 +98,19 @@ class CGitLister(StatelessLister[Repositories]):
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
url = urljoin(self.url, tr.find("a")["href"])
repository_link = tr.find("a")["href"]
repo_url = None
git_url = None
base_url = urljoin(self.url, repository_link)
if self.base_git_url: # mapping provided
# computing git url
git_url = base_url.replace(self.url, self.base_git_url)
else:
# we compute the git detailed page url from which we will retrieve
# the git url (cf. self.get_origins_from_page)
repo_url = base_url
span = tr.find("span", {"class": re.compile("age-")})
if span:
last_updated_date = span["title"]
@ -95,7 +118,11 @@ class CGitLister(StatelessLister[Repositories]):
last_updated_date = None
page_results.append(
{"url": url, "last_updated_date": last_updated_date}
{
"url": repo_url,
"git_url": git_url,
"last_updated_date": last_updated_date,
}
)
yield page_results
@ -117,8 +144,10 @@ class CGitLister(StatelessLister[Repositories]):
"""Convert a page of cgit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
for repository in repositories:
origin_url = self._get_origin_from_repository_url(repository["url"])
for repo in repositories:
origin_url = repo["git_url"] or self._get_origin_from_repository_url(
repo["url"]
)
if origin_url is None:
continue
@ -126,7 +155,7 @@ class CGitLister(StatelessLister[Repositories]):
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
last_update=_parse_last_updated_date(repository),
last_update=_parse_last_updated_date(repo),
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:

View file

@ -10,9 +10,13 @@ from .lister import CGitLister
@shared_task(name=__name__ + ".CGitListerTask")
def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]:
def list_cgit(
url: str, instance: Optional[str] = None, base_git_url: Optional[str] = None
) -> Dict[str, str]:
"""Lister task for CGit instances"""
lister = CGitLister.from_configfile(url=url, instance=instance)
lister = CGitLister.from_configfile(
url=url, instance=instance, base_git_url=base_git_url
)
return lister.run().dict()

View file

@ -0,0 +1,33 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<title>Lorry Depot</title>
<meta name='generator' content='cgit v1.2.1'/>
<meta name='robots' content='index, nofollow'/>
<link rel='stylesheet' type='text/css' href='/cgit-css/cgit.css'/>
<link rel='shortcut icon' href='/favicon.ico'/>
</head>
<body>
<div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit-css/cgit.png' alt='cgit logo'/></a></td>
<td class='main'>Lorry Depot</td></tr>
<tr><td class='sub'>Mirror Repositories</td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/cgit/'>index</a></td><td class='form'><form method='get' action='/cgit/'>
<input type='search' name='q' size='10' value=''/>
<input type='submit' value='search'/>
</form></td></tr></table>
<div class='content'>
<table summary='repository list' class='list nowrap'><tr class='nohover'><th class='left'><a href='/cgit/?s=name'>Name</a></th><th class='left'><a href='/cgit/?s=desc'>Description</a></th><th class='left'><a href='/cgit/?s=idle'>Idle</a></th><th class='left'>Links</th></tr>
<tr><td class='toplevel-repo'><a title='baserock/baserock/baserock-chroot.git' href='/cgit/baserock/baserock/baserock-chroot.git/'>baserock/baserock/baserock-chroot.git</a></td><td><a href='/cgit/baserock/baserock/baserock-chroot.git/'>Tools for working with Baserock chroots
</a></td><td><span class='age-years' title='2015-05-20 09:59:01 +0000'>6 years</span></td><td><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/baserock-chroot.git/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='baserock/baserock/bsp-support.git' href='/cgit/baserock/baserock/bsp-support.git/'>baserock/baserock/bsp-support.git</a></td><td><a href='/cgit/baserock/baserock/bsp-support.git/'>Programs and configuration needed to support specific devices
</a></td><td><span class='age-years' title='2015-10-25 19:53:10 +0000'>5 years</span></td><td><a class='button' href='/cgit/baserock/baserock/bsp-support.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/bsp-support.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/bsp-support.git/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='baserock/baserock/definitions.git' href='/cgit/baserock/baserock/definitions.git/'>baserock/baserock/definitions.git</a></td><td><a href='/cgit/baserock/baserock/definitions.git/'>gitlab.com: baserock/definitions.git
</a></td><td><span class='age-months' title='2019-05-15 17:58:47 +0000'>21 months</span></td><td><a class='button' href='/cgit/baserock/baserock/definitions.git/'>summary</a><a class='button' href='/cgit/baserock/baserock/definitions.git/log/'>log</a><a class='button' href='/cgit/baserock/baserock/definitions.git/tree/'>tree</a></td></tr>
</table>
</div> <!-- class=content -->
</div> <!-- id=cgit -->
</body>
</html>

View file

@ -0,0 +1,42 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<title>Eclipse Git repositories</title>
<meta name='generator' content='cgit v1.2.1'/>
<meta name='robots' content='index, nofollow'/>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="keywords" content="eclipse.org, Eclipse Foundation, Eclipse, Git, Source code, Source" />
</head>
<body>
<!-- /#breadcrumb -->
<main>
<div class="container-fluid legacy-page" id="novaContent">
<div class="col-md-24"><div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/c/'><img src='/git.png' alt='cgit logo'/></a></td>
<td class='main'>Eclipse Git repositories</td></tr>
<tr><td class='sub'>To use Git in Eclipse, check out the EGit project.</td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/c/'>index</a><a href='/c/?p=about'>about</a></td><td class='form'><form method='get' action='/c/'>
<input type='search' name='q' size='10' value=''/>
<input type='submit' value='search'/>
</form></td></tr></table>
<div class='content'><table summary='repository list' class='list nowrap'><tr class='nohover'><th class='left'><a href='/c/?s=name'>Name</a></th><th class='left'><a href='/c/?s=desc'>Description</a></th><th class='left'><a href='/c/?s=idle'>Idle</a></th></tr>
<tr class='nohover-highlight'><td colspan='3' class='reposection'>3p</td></tr>
<tr><td class='sublevel-repo'><a title='3p.git' href='/c/3p/3p.git/'>3p.git</a></td><td><a href='/c/3p/3p.git/'>3P - PolarSys Packaging</a></td><td><span class='age-months' title='2020-01-17 23:52:00 +0000'>12 months</span></td></tr>
<tr><td class='sublevel-repo'><a title='qvtd.git' href='/c/www.eclipse.org/qvtd.git/'>qvtd.git</a></td><td><a href='/c/www.eclipse.org/qvtd.git/'>www.eclipse.org project repository</a></td><td><span class='age-years' title='2018-07-20 23:51:54 +0000'>3 years</span></td></tr>
<tr><td class='sublevel-repo'><a title='r4e.git' href='/c/www.eclipse.org/r4e.git/'>r4e.git</a></td><td><a href='/c/www.eclipse.org/r4e.git/'>www.eclipse.org project repository</a></td><td><span class='age-years' title='2018-11-17 00:57:51 +0000'>2 years</span></td></tr>
<tr><td class='sublevel-repo'><a title='rap.git' href='/c/www.eclipse.org/rap.git/'>rap.git</a></td><td><a href='/c/www.eclipse.org/rap.git/'>RAP project website</a></td><td><span class='age-weeks' title='2020-12-20 01:16:37 +0000'>6 weeks</span></td></tr>
<tr class='nohover-highlight'><td colspan='3' class='reposection'>zenoh</td></tr>
<tr><td class='sublevel-repo'><a title='zenoh.git' href='/c/zenoh/zenoh.git/'>zenoh.git</a></td><td><a href='/c/zenoh/zenoh.git/'>Unnamed repository; edit this file 'description' to name the repository.</a></td><td><span class='age-months' title='2020-01-25 05:07:25 +0000'>12 months</span></td></tr>
</table></div> <!-- class=content -->
</div>
</div>
</main>
<p id="back-to-top">
<a class="visible-xs" href="#top">Back to the top</a>
</p>
</body>
</html>

View file

@ -0,0 +1,33 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<title>Jörgs Debian Repository</title>
<meta name='generator' content='cgit v1.1'/>
<meta name='robots' content='index, nofollow'/>
<link rel='stylesheet' type='text/css' href='/cgit-css/cgit.css'/>
<link rel='shortcut icon' href='/favicon.ico'/>
</head>
<body>
<div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit-css/cgit.png' alt='cgit logo'/></a></td>
<td class='main'>Jörgs Debian Repository</td></tr>
<tr><td class='sub'>ask debian-cgit@jff.email</td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/cgit/'>index</a></td><td class='form'><form method='get' action='/cgit/'>
<input type='text' name='q' size='10' value=''/>
<input type='submit' value='search'/>
</form></td></tr></table>
<div class='content'><table summary='repository list' class='list nowrap'>
<tr class='nohover'><th class='left'><a href='/cgit/?s=name'>Name</a></th><th class='left'><a href='/cgit/?s=desc'>Description</a></th><th class='left'><a href='/cgit/?s=owner'>Owner</a></th><th class='left'><a href='/cgit/?s=idle'>Idle</a></th></tr>
<tr><td class='toplevel-repo'><a title='gnome-pie.git' href='/cgit/gnome-pie.git/'>gnome-pie.git</a></td><td><a href='/cgit/gnome-pie.git/'>Debian repo for gnome-pie</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-06-01 15:22:58 +0000'>8 months</span></td></tr>
<tr><td class='toplevel-repo'><a title='gs5.git' href='/cgit/gs5.git/'>gs5.git</a></td><td><a href='/cgit/gs5.git/'>Clone of gs5</a></td><td><a href='/cgit/?q=jff@jff.email'>jff@jff.email</a></td><td><span class='age-years' title='2018-10-01 17:24:55 +0000'>2 years</span></td></tr>
<tr><td class='toplevel-repo'><a title='libunistring.git' href='/cgit/libunistring.git/'>libunistring.git</a></td><td><a href='/cgit/libunistring.git/'>Debian repo for libunistring</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-06-02 06:00:00 +0000'>8 months</span></td></tr>
<tr><td class='toplevel-repo'><a title='mailgraph.git' href='/cgit/mailgraph.git/'>mailgraph.git</a></td><td><a href='/cgit/mailgraph.git/'>Debian repo for mailgraph</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-years' title='2018-09-09 19:18:23 +0000'>2 years</span></td></tr>
<tr><td class='toplevel-repo'><a title='mwc.git' href='/cgit/mwc.git/'>mwc.git</a></td><td><a href='/cgit/mwc.git/'>Debian repo for mwc</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2019-07-14 14:58:18 +0000'>19 months</span></td></tr>
<tr><td class='toplevel-repo'><a title='xtrkcad.git' href='/cgit/xtrkcad.git/'>xtrkcad.git</a></td><td><a href='/cgit/xtrkcad.git/'>Debian repo for xtrkcad</a></td><td><a href='/cgit/?q=debian@jff.email'>debian@jff.email</a></td><td><span class='age-months' title='2020-08-24 19:27:40 +0000'>5 months</span></td></tr>
</table></div> <!-- class=content -->
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.1</a> at 2021-01-29 14:18:26 +0000</div>
</div> <!-- id=cgit -->
</body>
</html>

View file

@ -196,3 +196,38 @@ def test_lister_cgit_from_configfile(swh_scheduler_config, mocker):
lister = CGitLister.from_configfile()
assert lister.scheduler is not None
assert lister.credentials is not None
@pytest.mark.parametrize(
"url,base_git_url,expected_nb_origins",
[
("https://git.eclipse.org/c", "https://eclipse.org/r", 5),
("https://git.baserock.org/cgit/", "https://git.baserock.org/git/", 3),
("https://jff.email/cgit/", "git://jff.email/opt/git/", 6),
],
)
def test_lister_cgit_with_base_git_url(
url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler
):
"""With base git url provided, listed urls should be the computed origin urls
"""
lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,)
stats = lister_cgit.run()
assert stats == ListerStats(pages=1, origins=expected_nb_origins)
# test page parsing
scheduler_origins = swh_scheduler.get_listed_origins(
lister_cgit.lister_obj.id
).results
assert len(scheduler_origins) == expected_nb_origins
# test listed repositories
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(base_git_url)
assert (
listed_origin.url.startswith(url) is False
), f"url should be mapped to {base_git_url}"

View file

@ -22,7 +22,7 @@ def test_cgit_lister_task(
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=10, origins=500)
kwargs = dict(url="https://git.kernel.org/", instance="kernel")
kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None)
res = swh_scheduler_celery_app.send_task(
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,