gitweb: Add optional base_git_url parameter to lister
Similar to cgit, there are cases where git clone URLs for projects hosted on a gitweb instance cannot be found when scraping project pages or cannot be easily derived from the gitweb instance root URL. So add an optional base_git_url parameter that makes it possible to compute correct clone URLs by appending project names to it.
This commit is contained in:
parent
59a979642f
commit
7b932f46b5
2 changed files with 35 additions and 8 deletions
|
@ -36,6 +36,7 @@ class GitwebLister(StatelessLister[Repositories]):
|
|||
scheduler: SchedulerInterface,
|
||||
url: Optional[str] = None,
|
||||
instance: Optional[str] = None,
|
||||
base_git_url: Optional[str] = None,
|
||||
credentials: Optional[CredentialsType] = None,
|
||||
max_origins_per_page: Optional[int] = None,
|
||||
max_pages: Optional[int] = None,
|
||||
|
@ -44,11 +45,14 @@ class GitwebLister(StatelessLister[Repositories]):
|
|||
"""Lister class for Gitweb repositories.
|
||||
|
||||
Args:
|
||||
url: (Optional) Root URL of the Gitweb instance, i.e. url of the index of
|
||||
url: Root URL of the Gitweb instance, i.e. url of the index of
|
||||
published git repositories on this instance. Defaults to
|
||||
:file:`https://{instance}` if unset.
|
||||
instance: Name of gitweb instance. Defaults to url's network location
|
||||
if unset.
|
||||
base_git_url: Base URL to clone a git project hosted on the Gitweb instance,
|
||||
should only be used if the clone URLs cannot be found when scraping project
|
||||
page or cannot be easily derived from the root URL of the instance
|
||||
|
||||
"""
|
||||
super().__init__(
|
||||
|
@ -63,6 +67,7 @@ class GitwebLister(StatelessLister[Repositories]):
|
|||
|
||||
self.session.headers.update({"Accept": "application/html"})
|
||||
self.instance_scheme = urlparse(url).scheme
|
||||
self.base_git_url = base_git_url
|
||||
|
||||
def _get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
|
||||
|
@ -144,7 +149,7 @@ class GitwebLister(StatelessLister[Repositories]):
|
|||
urls.append(url)
|
||||
|
||||
if not urls:
|
||||
repo = try_to_determine_git_repository(repository_url)
|
||||
repo = try_to_determine_git_repository(repository_url, self.base_git_url)
|
||||
if not repo:
|
||||
logger.debug("No git urls found on %s", repository_url)
|
||||
return repo
|
||||
|
@ -165,7 +170,9 @@ class GitwebLister(StatelessLister[Repositories]):
|
|||
return origin_url
|
||||
|
||||
|
||||
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
|
||||
def try_to_determine_git_repository(
|
||||
repository_url: str, base_git_url: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
"""Some gitweb instances does not advertise the git urls.
|
||||
|
||||
This heuristic works on instances demonstrating this behavior.
|
||||
|
@ -175,7 +182,10 @@ def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
|
|||
parsed_url = urlparse(repository_url)
|
||||
repo = parse_qs(parsed_url.query, separator=";").get("p")
|
||||
if repo:
|
||||
result = f"git://{parsed_url.netloc}/{repo[0]}"
|
||||
if base_git_url:
|
||||
result = f"{base_git_url.rstrip('/')}/{repo[0]}"
|
||||
else:
|
||||
result = f"git://{parsed_url.netloc}/{repo[0]}"
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
@ -128,25 +128,42 @@ def test_lister_gitweb_get_origin_from_repo_failing(
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url,expected_repo",
|
||||
"url,base_git_url,expected_repo",
|
||||
[
|
||||
(
|
||||
"https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
|
||||
None,
|
||||
"git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
|
||||
),
|
||||
(
|
||||
"https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
|
||||
None,
|
||||
"git://git.shadowcat.co.uk/File-Slurp.git",
|
||||
),
|
||||
(
|
||||
"https://git.example.org?p=baaaa;a=summary",
|
||||
None,
|
||||
"git://git.example.org/baaaa",
|
||||
),
|
||||
("https://domain.org/foobar", None),
|
||||
(
|
||||
"https://domain.org/foobar",
|
||||
None,
|
||||
None,
|
||||
),
|
||||
(
|
||||
"https://gitweb.example.org?p=project.git;a=summary",
|
||||
"https://example.org",
|
||||
"https://example.org/project.git",
|
||||
),
|
||||
(
|
||||
"https://example.org?p=project.git;a=summary",
|
||||
"https://example.org/git/",
|
||||
"https://example.org/git/project.git",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_try_to_determine_git_repository(url, expected_repo):
|
||||
assert try_to_determine_git_repository(url) == expected_repo
|
||||
def test_try_to_determine_git_repository(url, base_git_url, expected_repo):
|
||||
assert try_to_determine_git_repository(url, base_git_url) == expected_repo
|
||||
|
||||
|
||||
def test_parse_last_update():
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue