gitweb: Add optional base_git_url parameter to lister

Similar to cgit, there are cases where git clone URLs for projects hosted
on a gitweb instance cannot be found when scraping project pages or cannot
be easily derived from the gitweb instance root URL.

So add an optional base_git_url parameter that allows correct clone URLs to
be computed by appending project names to it.
This commit is contained in:
Antoine Lambert 2023-09-27 14:56:47 +02:00
parent 59a979642f
commit 7b932f46b5
2 changed files with 35 additions and 8 deletions

View file

@ -36,6 +36,7 @@ class GitwebLister(StatelessLister[Repositories]):
scheduler: SchedulerInterface,
url: Optional[str] = None,
instance: Optional[str] = None,
base_git_url: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -44,11 +45,14 @@ class GitwebLister(StatelessLister[Repositories]):
"""Lister class for Gitweb repositories.
Args:
url: (Optional) Root URL of the Gitweb instance, i.e. url of the index of
url: Root URL of the Gitweb instance, i.e. url of the index of
published git repositories on this instance. Defaults to
:file:`https://{instance}` if unset.
instance: Name of gitweb instance. Defaults to url's network location
if unset.
base_git_url: Base URL to clone a git project hosted on the Gitweb instance,
should only be used if the clone URLs cannot be found when scraping project
page or cannot be easily derived from the root URL of the instance
"""
super().__init__(
@ -63,6 +67,7 @@ class GitwebLister(StatelessLister[Repositories]):
self.session.headers.update({"Accept": "application/html"})
self.instance_scheme = urlparse(url).scheme
self.base_git_url = base_git_url
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
@ -144,7 +149,7 @@ class GitwebLister(StatelessLister[Repositories]):
urls.append(url)
if not urls:
repo = try_to_determine_git_repository(repository_url)
repo = try_to_determine_git_repository(repository_url, self.base_git_url)
if not repo:
logger.debug("No git urls found on %s", repository_url)
return repo
@ -165,7 +170,9 @@ class GitwebLister(StatelessLister[Repositories]):
return origin_url
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
def try_to_determine_git_repository(
repository_url: str, base_git_url: Optional[str] = None
) -> Optional[str]:
"""Some gitweb instances does not advertise the git urls.
This heuristic works on instances demonstrating this behavior.
@ -175,7 +182,10 @@ def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
parsed_url = urlparse(repository_url)
repo = parse_qs(parsed_url.query, separator=";").get("p")
if repo:
result = f"git://{parsed_url.netloc}/{repo[0]}"
if base_git_url:
result = f"{base_git_url.rstrip('/')}/{repo[0]}"
else:
result = f"git://{parsed_url.netloc}/{repo[0]}"
return result

View file

@ -128,25 +128,42 @@ def test_lister_gitweb_get_origin_from_repo_failing(
@pytest.mark.parametrize(
"url,expected_repo",
"url,base_git_url,expected_repo",
[
(
"https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
None,
"git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
),
(
"https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
None,
"git://git.shadowcat.co.uk/File-Slurp.git",
),
(
"https://git.example.org?p=baaaa;a=summary",
None,
"git://git.example.org/baaaa",
),
("https://domain.org/foobar", None),
(
"https://domain.org/foobar",
None,
None,
),
(
"https://gitweb.example.org?p=project.git;a=summary",
"https://example.org",
"https://example.org/project.git",
),
(
"https://example.org?p=project.git;a=summary",
"https://example.org/git/",
"https://example.org/git/project.git",
),
],
)
def test_try_to_determine_git_repository(url, expected_repo):
assert try_to_determine_git_repository(url) == expected_repo
def test_try_to_determine_git_repository(url, base_git_url, expected_repo):
assert try_to_determine_git_repository(url, base_git_url) == expected_repo
def test_parse_last_update():