diff --git a/mypy.ini b/mypy.ini
index 7f9436b..76468c2 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -43,6 +43,9 @@ ignore_missing_imports = True
[mypy-dulwich.*]
ignore_missing_imports = True
+[mypy-dateparser.*]
+ignore_missing_imports = True
+
[mypy-testing.postgresql.*]
ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
index 2614f0a..0e58806 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ beautifulsoup4
launchpadlib
tenacity >= 6.2
lxml
+dateparser
dulwich
testing.postgresql
psycopg2
diff --git a/setup.py b/setup.py
index 9c626a8..0ad6aa5 100755
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -69,6 +69,7 @@ setup(
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
+ lister.gitweb=swh.lister.gitweb:register
lister.gnu=swh.lister.gnu:register
lister.golang=swh.lister.golang:register
lister.gogs=swh.lister.gogs:register
diff --git a/swh/lister/gitweb/__init__.py b/swh/lister/gitweb/__init__.py
new file mode 100644
index 0000000..acb2af1
--- /dev/null
+++ b/swh/lister/gitweb/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
def register():
    """Describe the gitweb lister plugin.

    Returns:
        A dict holding the lister class under ``"lister"`` and, under
        ``"task_modules"``, the list of module names providing its tasks.

    """
    # The lister module is only imported when the plugin is actually registered.
    from .lister import GitwebLister

    tasks_module = f"{__name__}.tasks"
    return dict(lister=GitwebLister, task_modules=[tasks_module])
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
new file mode 100644
index 0000000..8d0d81b
--- /dev/null
+++ b/swh/lister/gitweb/lister.py
@@ -0,0 +1,188 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timezone
+import logging
+import re
+from typing import Any, Dict, Iterator, List, Optional
+from urllib.parse import parse_qs, urljoin, urlparse
+
+from bs4 import BeautifulSoup
+from dateparser import parse
+from requests.exceptions import HTTPError
+
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+Repositories = List[Dict[str, Any]]
+
+
class GitwebLister(StatelessLister[Repositories]):
    """Lister class for Gitweb repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    """

    LISTER_NAME = "gitweb"

    # Patterns matched against the ``class`` attribute of the index page elements:
    # the table listing the projects, and the cell holding a project's age.
    _PROJECT_LIST_CLASS = re.compile("project_list")
    _AGE_CELL_CLASS = re.compile("age.*")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Gitweb repositories.

        Args:
            url: (Optional) Root URL of the Gitweb instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of gitweb instance. Defaults to url's network location
                if unset.

        The remaining arguments are forwarded unchanged to the base lister class.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # NOTE(review): "application/html" is not a registered media type
        # ("text/html" is). Left untouched because it changes what is sent to the
        # remote servers -- confirm before switching.
        self.session.headers.update({"Accept": "application/html"})
        # Read the scheme from ``self.url`` rather than from the ``url`` argument:
        # the argument is None when the lister is built from ``instance`` only.
        self.instance_scheme = urlparse(self.url).scheme

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Gitweb server.

        Yields:
            A single page: the list of repositories found in the project table of
            the index, each one a dict with the keys ``url`` (repository page url)
            and ``last_update_interval`` (raw text of the age cell, or None).
            Nothing is yielded when the index holds no project table.

        """
        bs_idx = self._get_and_parse(self.url)

        project_table = bs_idx.find("table", {"class": self._PROJECT_LIST_CLASS})
        if project_table is None:
            # Not a gitweb index (or an unexpected layout): nothing to list
            logger.warning("No project list found on %s", self.url)
            return

        page_results = []

        for tr in project_table.find_all("tr"):
            link = tr.find("a")
            if not link:
                continue

            href = link.get("href")
            if not href:
                # Anchor without target, it cannot lead to a repository page
                continue

            repo_url = urljoin(self.url, href).strip("/")

            # Skip this description page which is listed but won't yield any origins
            # to list
            if repo_url.endswith("?o=descr"):
                continue

            # The age cell holds an interval in natural language (e.g. '9 years
            # ago'). Its raw text is kept here and converted into a datetime when
            # the origins are built.
            span = tr.find("td", {"class": self._AGE_CELL_CLASS})
            page_results.append(
                {"url": repo_url, "last_update_interval": span.text if span else None}
            )

        yield page_results

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of gitweb repositories into a list of ListedOrigins.

        Repositories for which no origin url can be determined are skipped.

        """
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=parse_last_update(repo.get("last_update_interval")),
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page.

        Args:
            repository_url: url of the repository page to fetch and parse

        Returns:
            The origin url to use for the repository, or None when the page could
            not be fetched or no url could be determined.

        """
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        urls: List[str] = []
        for row in bs.find_all("tr", {"class": "metadata_url"}):
            if not row.contents:
                continue
            # ``.string`` is None when the last element of the row is not a single
            # piece of text; there is nothing to read from such a row.
            cell_text = row.contents[-1].string
            if not cell_text:
                continue
            # A cell may advertise several comma-separated urls; drop the empty
            # fragments so they can never end up being used as an origin url.
            for candidate in cell_text.split(","):
                candidate = candidate.strip()
                if candidate:
                    urls.append(candidate)

        if not urls:
            repo = try_to_determine_git_repository(repository_url)
            if not repo:
                logger.debug("No git urls found on %s", repository_url)
            return repo

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            scheme = urlparse(url).scheme
            if scheme == "https":
                return url
            if scheme == "http" and self.instance_scheme == "https":
                # workaround for non-working listed http origins
                return url.replace("http://", "https://", 1)

        # otherwise, choose the first one
        return urls[0]
+
+
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
    """Guess the git url of a repository out of the url of its gitweb page.

    Some gitweb instances do not advertise the git urls of their repositories.
    This heuristic works on instances demonstrating this behavior: the repository
    path is read from the ``p`` query parameter of the page url and exposed with
    the ``git://`` scheme on the same network location.

    Args:
        repository_url: url of a repository page, e.g.
            ``https://example.org/?p=project.git;a=summary``

    Returns:
        The guessed ``git://`` url, or None when the url holds no usable ``p``
        parameter.

    """
    summary_suffix = ";a=summary"

    parsed_url = urlparse(repository_url)
    params = parse_qs(parsed_url.query).get("p")
    if not params:
        return None

    repo = params[0]
    if repo.endswith(summary_suffix):
        # Drop the exact suffix. str.rstrip() must not be used for this: it removes
        # any trailing run of the given *characters*, hence would also eat the end
        # of repository names such as "yarasum".
        repo = repo[: -len(summary_suffix)]

    if not repo:
        # Nothing left to build a repository url from
        return None

    return f"git://{parsed_url.netloc}/{repo}"
+
+
def parse_last_update(last_update_interval: Optional[str]) -> Optional[datetime]:
    """Turn the last update text of a repository into a datetime.

    Args:
        last_update_interval: text describing the last update (the tests use
            values such as "6 months ago"), possibly None or empty

    Returns:
        The parsed date with its timezone information set to UTC, or None when
        there is no text or when it cannot be parsed.

    """
    if last_update_interval:
        parsed_date = parse(last_update_interval)
        if parsed_date is not None:
            return parsed_date.replace(tzinfo=timezone.utc)
    return None
diff --git a/swh/lister/gitweb/tasks.py b/swh/lister/gitweb/tasks.py
new file mode 100644
index 0000000..5ff5439
--- /dev/null
+++ b/swh/lister/gitweb/tasks.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+from celery import shared_task
+
+from .lister import GitwebLister
+
+
@shared_task(name=f"{__name__}.GitwebListerTask")
def list_gitweb(**lister_args) -> Dict[str, str]:
    """Lister task for Gitweb instances.

    The keyword arguments are handed over to ``GitwebLister.from_configfile``; the
    outcome of the listing is returned as a dict.

    """
    stats = GitwebLister.from_configfile(**lister_args).run()
    return stats.dict()
diff --git a/swh/lister/gitweb/tests/__init__.py b/swh/lister/gitweb/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README
new file mode 100644
index 0000000..fb47a1d
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README
@@ -0,0 +1,5 @@
+These files are a partial dump of https://git.distorted.org.uk/~mdw/.
+
+To ease testing, each page is stored under a simplified file name (for instance, the
+index page is the file `~mdw`). These names do not reflect how a real gitweb instance
+serves its pages.
+
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar
new file mode 100644
index 0000000..3288094
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar
@@ -0,0 +1,123 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git
+
+
+
+
+
+
+
+
+
+
+
+ hello
+
+
+
+
+
+
+
+
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw
new file mode 100644
index 0000000..e9c5019
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw
@@ -0,0 +1,145 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git
+
+
+
+
+
+
+
+
+
+
+
+
These are the GIT repositories for some of my various free software
+ projects, and some other projects I hack on or just find useful to
+ have local copies of. Feel free to browse them here.
+
+
The primary source for browsing these projects is
+ https://git.distorted.org.uk/~mdw/ .
+ There's a similar browser at https://www.chiark.greenend.org.uk/ucgi/~mdw/git/
+ which might be faster, or more available, but very slightly less
+ up-to-date.
+
+
Project foo can be cloned using any of the following URLs:
+
+ https://git.distorted.org.uk/~mdw/git/ foo
+ git://git.distorted.org.uk/~mdw/ foo
+ https://www.chiark.greenend.org.uk/ucgi/~mdw/git/ foo
+ git://git.chiark.greenend.org.uk/~mdw/ foo
+
+ The
https:// … URLs are recommended if you can use
+ them, because they provide a measure of authenticity (as well as the
+ obvious privacy benefits).
+
+
+
In order to build many of these projects, you'll need to build and
+ install cfd , and quite possibly one or more of the
+ libraries mLib and catacomb . You'll also need
+ recent-ish Autoconf, Automake and Libtool, and the Autoconf archive.
+ General procedure is as follows:
+
+ Run mdw-setup . This will run the appropriate
+ autotools.
+ Say mkdir build to make a build directory.
+ Say cd build to change to the build directory.
+ Say ../configure , maybe with some options to control
+ the configuration process.
+ Say make .
+ Now start hacking on things.
+
+ If you wanted to build Debian packages, run
mdw-setup
+ –d instead. This will skip making a
build
+ directory, which is good because otherwise it interferes with the
+ Debian build process. The various
debian/rules targets
+ should work OK after that.
+
+
Please mail me patches!
+
+
+
+
+
+
+
+
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips
new file mode 100644
index 0000000..83a9065
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips
@@ -0,0 +1,179 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - doc/ips/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description Introduction to Provable Security slides and notes
+owner Mark Wooding
+last change Wed, 1 Nov 2006 14:32:34 +0000 (14:32 +0000)
+URL https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall
new file mode 100644
index 0000000..6113b2a
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall
@@ -0,0 +1,167 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - firewall/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description Firewall scripts for distorted.org.uk.
+owner Mark Wooding
+last change Thu, 16 Mar 2023 18:09:32 +0000 (18:09 +0000)
+URL https://git.distorted.org.uk/~mdw/firewall
+git://git.distorted.org.uk/~mdw/firewall
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools
new file mode 100644
index 0000000..6ab1686
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools
@@ -0,0 +1,204 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - mdwtools/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description Various LaTeX packages
+owner Mark Wooding
+last change Mon, 8 Jun 2020 15:59:38 +0000 (16:59 +0100)
+URL https://git.distorted.org.uk/~mdw/mdwtools
+git://git.distorted.org.uk/~mdw/mdwtools
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad
new file mode 100644
index 0000000..af9c7dc
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad
@@ -0,0 +1,91 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - scad/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description OpenSCAD models that I've designed.
+owner Mark Wooding
+last change Wed, 15 Mar 2023 00:57:55 +0000 (00:57 +0000)
+URL https://git.distorted.org.uk/~mdw/scad
+git://git.distorted.org.uk/~mdw/scad
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman
new file mode 100644
index 0000000..f13ea8b
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman
@@ -0,0 +1,210 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - strayman/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description LaTeX document class for various documents
+owner Mark Wooding
+last change Mon, 8 Jun 2020 16:00:49 +0000 (17:00 +0100)
+URL https://git.distorted.org.uk/~mdw/strayman
+git://git.distorted.org.uk/~mdw/strayman
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey
new file mode 100644
index 0000000..f4524e5
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey
@@ -0,0 +1,190 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - udpkey/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description Transmit and receive cryptographic keys over UDP; useful during boot.
+owner Mark Wooding
+last change Thu, 18 Feb 2016 17:53:27 +0000 (17:53 +0000)
+URL https://git.distorted.org.uk/~mdw/udpkey
+git://git.distorted.org.uk/~mdw/udpkey
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl
new file mode 100644
index 0000000..7232d60
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl
@@ -0,0 +1,91 @@
+
+
+
+
+
+
+
+
+
+mdw@git.distorted.org.uk Git - vmctl/summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+description Constrained VM management, via SSH.
+owner Mark Wooding
+last change Sat, 4 Apr 2015 12:36:59 +0000 (13:36 +0100)
+URL http://git.distorted.org.uk/~mdw/vmctl
+git://git.distorted.org.uk/~mdw/vmctl
+
+
+
+
+
+
+
+
+
+
diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py
new file mode 100644
index 0000000..1c83e7d
--- /dev/null
+++ b/swh/lister/gitweb/tests/test_lister.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import List
+
+import pytest
+
+from swh.lister import __version__
+from swh.lister.gitweb.lister import (
+ GitwebLister,
+ parse_last_update,
+ try_to_determine_git_repository,
+)
+from swh.lister.pattern import ListerStats
+
+MAIN_INSTANCE = "git.distorted.org.uk"
+MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}/~mdw"
+
+
def test_lister_gitweb_instantiate(swh_scheduler):
    """Build a lister with either an url or an instance is supported."""
    url = MAIN_INSTANCE_URL
    lister = GitwebLister(swh_scheduler, url=url)
    assert lister is not None
    assert lister.url == url

    # Check the lister built out of the instance name itself, instead of checking
    # the url-built lister a second time.
    lister_from_instance = GitwebLister(swh_scheduler, instance=MAIN_INSTANCE)
    assert lister_from_instance is not None
    assert MAIN_INSTANCE in lister_from_instance.url
+
+
def test_lister_gitweb_fail_to_instantiate(swh_scheduler):
    """A lister built with neither an url nor an instance must be rejected."""
    with pytest.raises(ValueError) as exc_info:
        GitwebLister(swh_scheduler)

    # The error message names the two parameters one of which is required
    exc_info.match("'url' or 'instance'")
+
+
def test_lister_gitweb_get_pages(requests_mock_datadir, swh_scheduler):
    """Count the repositories scraped from the index page and check their urls."""
    instance_url = MAIN_INSTANCE_URL
    lister = GitwebLister(swh_scheduler, url=instance_url)

    pages = list(lister.get_pages())
    listed_repos = [repo for page in pages for repo in page]

    assert len(listed_repos) == 7
    for repo in listed_repos:
        assert repo["url"].startswith(instance_url)
+
+
def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
    """Gitweb lister nominal listing case.

    Checks the listing stats, the origins recorded in the scheduler and the user
    agent of the requests which were sent.

    """
    instance_url = MAIN_INSTANCE_URL
    nb_origins = 7  # main page will get filtered out

    lister = GitwebLister(swh_scheduler, url=instance_url)
    assert lister.run() == ListerStats(pages=1, origins=nb_origins)

    # origins recorded in the scheduler
    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed_origins) == nb_origins

    assert instance_url.startswith("https://")

    for origin in listed_origins:
        assert origin.visit_type == "git"
        assert origin.url.startswith(instance_url)
        assert origin.url.startswith("https://")
        assert origin.last_update is not None
        assert "," not in origin.url

    # every request sent identifies the lister and its version
    for sent_request in requests_mock_datadir.request_history:
        headers = sent_request.headers
        assert "User-Agent" in headers
        agent = headers["User-Agent"]
        assert "Software Heritage gitweb lister" in agent
        assert __version__ in agent
+
+
def test_lister_gitweb_get_pages_with_pages_and_retry(
    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
    """The index page is requested again after 429 responses, until it is served."""
    instance_url = MAIN_INSTANCE_URL

    index_path = os.path.join(datadir, f"https_{MAIN_INSTANCE}/~mdw")
    with open(index_path, "rb") as index_file:
        index_content = index_file.read()

    # Two rate-limited responses, then the actual index page
    responses = [{"content": None, "status_code": 429} for _ in range(2)]
    responses.append({"content": index_content, "status_code": 200})
    requests_mock.get(instance_url, responses)

    lister = GitwebLister(swh_scheduler, url=instance_url)

    # Do not actually wait between two attempts
    mocker.patch.object(lister.http_request.retry, "sleep")

    pages = list(lister.get_pages())
    listed_repos = [repo for page in pages for repo in page]
    assert len(pages) == 1
    assert len(listed_repos) == 7
+
+
def test_lister_gitweb_get_origin_from_repo_failing(
    swh_scheduler, requests_mock_datadir
):
    """Repositories whose summary page cannot be retrieved are filtered out."""
    # The index of this instance lists some more repositories, whose summary page
    # is no longer returned
    index_url = f"https://{MAIN_INSTANCE}/foobar"
    lister = GitwebLister(swh_scheduler, url=index_url)

    # ... so only the 7 repositories we know about end up listed
    assert lister.run() == ListerStats(pages=1, origins=7)
+
+
@pytest.mark.parametrize(
    ("url", "expected_repo"),
    [
        (
            "https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
            "git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
        ),
        (
            "https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
            "git://git.shadowcat.co.uk/File-Slurp.git",
        ),
        ("https://domain.org/foobar", None),
    ],
)
def test_try_to_determine_git_repository(url, expected_repo):
    """The git url is guessed from the ``p`` parameter, None is returned without it."""
    guessed_repo = try_to_determine_git_repository(url)
    assert guessed_repo == expected_repo
+
+
def test_parse_last_update():
    """Unusable texts give None, a relative interval gives a timezone-aware date."""
    for unusable_text in (None, "No commits"):
        assert parse_last_update(unusable_text) is None

    last_update = parse_last_update("6 months ago")
    assert last_update is not None
    assert last_update.tzinfo is not None
diff --git a/swh/lister/gitweb/tests/test_tasks.py b/swh/lister/gitweb/tests/test_tasks.py
new file mode 100644
index 0000000..1e5cc34
--- /dev/null
+++ b/swh/lister/gitweb/tests/test_tasks.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2023 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
def test_gitweb_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The celery task builds the lister out of its arguments, then runs it."""
    # setup the mocked GitwebLister
    lister = mocker.patch("swh.lister.gitweb.tasks.GitwebLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    # Only pass arguments the real GitwebLister accepts: the class being mocked
    # here, an unsupported argument would otherwise go unnoticed.
    kwargs = dict(url="https://git.gentoo.org/", instance="kernel", max_pages=1)

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.gitweb.tasks.GitwebListerTask",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
index 19411c9..29a9a01 100644
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -42,6 +42,9 @@ lister_args = {
"url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
},
"pagure": {"instance": "pagure.io"},
+ "gitweb": {
+ "url": "https://git.distorted.org.uk/~mdw/",
+ },
}