From 573958ce64fe1cde2cfe98582e80468708e5bd67 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Sat, 8 Jul 2023 14:57:24 +0200 Subject: [PATCH] Add Gitweb lister Depending on the instance, specific heuristics apply. Some instances: - have summary pages which do not list metadata_url (so some computation happens to list git:// origins which are cloneable) - have summary pages which reference metadata_url as multiple comma-separated urls - list relative urls of the repository, so we need to join them with the main instance url to get complete cloneable origins (or summary pages) - list "down" http origins (cloning those won't work), so those are listed as cloneable https ones (when the main url is behind https). Refs. swh/devel/swh-lister#1800 --- mypy.ini | 3 + requirements.txt | 1 + setup.py | 3 +- swh/lister/gitweb/__init__.py | 12 + swh/lister/gitweb/lister.py | 188 ++++++++++++++++ swh/lister/gitweb/tasks.py | 16 ++ swh/lister/gitweb/tests/__init__.py | 0 .../data/https_git.distorted.org.uk/README | 5 + .../data/https_git.distorted.org.uk/foobar | 123 ++++++++++ .../data/https_git.distorted.org.uk/~mdw | 145 ++++++++++++ .../https_git.distorted.org.uk/~mdw_doc_ips | 179 +++++++++++++++ .../https_git.distorted.org.uk/~mdw_firewall | 167 ++++++++++++++ .../https_git.distorted.org.uk/~mdw_mdwtools | 204 +++++++++++++++++ .../data/https_git.distorted.org.uk/~mdw_scad | 91 ++++++++ .../https_git.distorted.org.uk/~mdw_strayman | 210 ++++++++++++++++++ .../https_git.distorted.org.uk/~mdw_udpkey | 190 ++++++++++++++++ .../https_git.distorted.org.uk/~mdw_vmctl | 91 ++++++++ swh/lister/gitweb/tests/test_lister.py | 154 +++++++++++++ swh/lister/gitweb/tests/test_tasks.py | 30 +++ swh/lister/tests/test_cli.py | 3 + 20 files changed, 1814 insertions(+), 1 deletion(-) create mode 100644 swh/lister/gitweb/__init__.py create mode 100644 swh/lister/gitweb/lister.py create mode 100644 swh/lister/gitweb/tasks.py create mode 100644
swh/lister/gitweb/tests/__init__.py create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey create mode 100644 swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl create mode 100644 swh/lister/gitweb/tests/test_lister.py create mode 100644 swh/lister/gitweb/tests/test_tasks.py diff --git a/mypy.ini b/mypy.ini index 7f9436b..76468c2 100644 --- a/mypy.ini +++ b/mypy.ini @@ -43,6 +43,9 @@ ignore_missing_imports = True [mypy-dulwich.*] ignore_missing_imports = True +[mypy-dateparser.*] +ignore_missing_imports = True + [mypy-testing.postgresql.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 2614f0a..0e58806 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ beautifulsoup4 launchpadlib tenacity >= 6.2 lxml +dateparser dulwich testing.postgresql psycopg2 diff --git a/setup.py b/setup.py index 9c626a8..0ad6aa5 100755 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -69,6 
+69,7 @@ setup(
         lister.gitea=swh.lister.gitea:register
         lister.github=swh.lister.github:register
         lister.gitlab=swh.lister.gitlab:register
+        lister.gitweb=swh.lister.gitweb:register
         lister.gnu=swh.lister.gnu:register
         lister.golang=swh.lister.golang:register
         lister.gogs=swh.lister.gogs:register
diff --git a/swh/lister/gitweb/__init__.py b/swh/lister/gitweb/__init__.py
new file mode 100644
index 0000000..acb2af1
--- /dev/null
+++ b/swh/lister/gitweb/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    from .lister import GitwebLister
+
+    return {
+        "lister": GitwebLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
new file mode 100644
index 0000000..8d0d81b
--- /dev/null
+++ b/swh/lister/gitweb/lister.py
@@ -0,0 +1,188 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timezone
+import logging
+import re
+from typing import Any, Dict, Iterator, List, Optional
+from urllib.parse import parse_qs, urljoin, urlparse
+
+from bs4 import BeautifulSoup
+from dateparser import parse
+from requests.exceptions import HTTPError
+
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+Repositories = List[Dict[str, Any]]
+
+
+class GitwebLister(StatelessLister[Repositories]):
+    """Lister class for Gitweb repositories.
+
+    This lister will retrieve the list of published git repositories by
+    parsing the HTML page(s) of the index retrieved at `url`.
+
+    """
+
+    LISTER_NAME = "gitweb"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        url: Optional[str] = None,
+        instance: Optional[str] = None,
+        credentials: Optional[CredentialsType] = None,
+        max_origins_per_page: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        enable_origins: bool = True,
+    ):
+        """Lister class for Gitweb repositories.
+
+        Args:
+            url: (Optional) Root URL of the Gitweb instance, i.e. url of the index of
+                published git repositories on this instance. Defaults to
+                :file:`https://{instance}` if unset.
+            instance: Name of gitweb instance. Defaults to url's network location
+                if unset.
+
+        """
+        super().__init__(
+            scheduler=scheduler,
+            url=url,
+            instance=instance,
+            credentials=credentials,
+            max_origins_per_page=max_origins_per_page,
+            max_pages=max_pages,
+            enable_origins=enable_origins,
+        )
+
+        self.session.headers.update({"Accept": "application/html"})
+        self.instance_scheme = urlparse(self.url).scheme
+
+    def _get_and_parse(self, url: str) -> BeautifulSoup:
+        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
+        response = self.http_request(url)
+        return BeautifulSoup(response.text, features="html.parser")
+
+    def get_pages(self) -> Iterator[Repositories]:
+        """Generate git 'project' URLs found on the current Gitweb server."""
+        bs_idx = self._get_and_parse(self.url)
+
+        page_results = []
+
+        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
+            "tr"
+        ):
+            link = tr.find("a")
+            if not link:
+                continue
+
+            repo_url = urljoin(self.url, link["href"]).strip("/")
+
+            # Skip this description page which is listed but won't yield any origins to list
+            if repo_url.endswith("?o=descr"):
+                continue
+
+            # Keep the age cell text in natural language (e.g. '9 years ago');
+            # it is converted to a datetime later, in get_origins_from_page
+            span = tr.find("td", {"class": re.compile("age.*")})
+            page_results.append(
+                {"url": repo_url, "last_update_interval": span.text if span else None}
+            )
+
+        yield page_results
+
+    def get_origins_from_page(
+        self, repositories: Repositories
+    ) -> Iterator[ListedOrigin]:
+        """Convert a page of gitweb repositories into a list of ListedOrigins."""
+        assert self.lister_obj.id is not None
+
+        for repo in repositories:
+            origin_url = self._get_origin_from_repository_url(repo["url"])
+            if origin_url is None:
+                continue
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=origin_url,
+                visit_type="git",
+                last_update=parse_last_update(repo.get("last_update_interval")),
+            )
+
+    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
+        """Extract the git url from the repository page"""
+        try:
+            bs = self._get_and_parse(repository_url)
+        except HTTPError as e:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s",
+                e.response.status_code,
+                e.response.url,
+            )
+            return None
+
+        urls = []
+        for row in bs.find_all("tr", {"class": "metadata_url"}):
+            url = row.contents[-1].string.strip()
+
+            if "," in url:
+                urls_ = [s.strip() for s in url.split(",") if s.strip()]
+                urls.extend(urls_)
+            else:
+                urls.append(url)
+
+        if not urls:
+            repo = try_to_determine_git_repository(repository_url)
+            if not repo:
+                logger.debug("No git urls found on %s", repository_url)
+            return repo
+
+        # look for the http/https url, if any, and use it as origin_url
+        for url in urls:
+            parsed_url = urlparse(url)
+            if parsed_url.scheme == "https":
+                origin_url = url
+                break
+            elif parsed_url.scheme == "http" and self.instance_scheme == "https":
+                # workaround for non-working listed http origins
+                origin_url = url.replace("http://", "https://")
+                break
+        else:
+            # otherwise, choose the first one
+            origin_url = urls[0]
+        return origin_url
+
+
+def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
+    """Some gitweb instances do not advertise the git urls.
+
+    This heuristic works on instances demonstrating this behavior.
+
+    """
+    result = None
+    parsed_url = urlparse(repository_url)
+    params = parse_qs(parsed_url.query).get("p")
+    if params:
+        repo = params[0]
+        if repo and repo.endswith(";a=summary"):
+            repo = repo[: -len(";a=summary")]  # rstrip() strips chars, not a suffix
+
+        result = f"git://{parsed_url.netloc}/{repo}"
+    return result
+
+
+def parse_last_update(last_update_interval: Optional[str]) -> Optional[datetime]:
+    """Parse the last update string into a datetime."""
+    if not last_update_interval:
+        return None
+    last_update_date = parse(last_update_interval)
+    last_update = None
+    if last_update_date is not None:
+        last_update = last_update_date.replace(tzinfo=timezone.utc)
+    return last_update
diff --git a/swh/lister/gitweb/tasks.py b/swh/lister/gitweb/tasks.py
new file mode 100644
index 0000000..5ff5439
--- /dev/null
+++ b/swh/lister/gitweb/tasks.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+from celery import shared_task
+
+from .lister import GitwebLister
+
+
+@shared_task(name=f"{__name__}.GitwebListerTask")
+def list_gitweb(**lister_args) -> Dict[str, str]:
+    """Lister task for Gitweb instances"""
+    lister = GitwebLister.from_configfile(**lister_args)
+    return lister.run().dict()
diff --git a/swh/lister/gitweb/tests/__init__.py b/swh/lister/gitweb/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README
new file mode 100644
index 0000000..fb47a1d
--- /dev/null
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/README
@@ -0,0 +1,5 @@
+These files are a partial dump of https://git.distorted.org.uk/~mdw/.
+ +To ease testing, the page is named index.html. It does not represent the reality of +those gitweb instances. + diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar new file mode 100644 index 0000000..3288094 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/foobar @@ -0,0 +1,123 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git + + + + + + + + + +
+ + hello +
+
+
+ + + +
+List all projects
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ProjectDescriptionOwnerLast Change
adnsGNU ADNS, an asynchronous... Mark Wooding6 years ago
anagSimple word-game solverMark Wooding3 years ago
atomsAmusing computer-mediated... Mark Wooding10 years ago
firewallFirewall scripts for distorted... Mark Wooding3 months ago
doc/ipsIntroduction to Provable Secur... Mark Wooding16 years ago
mdwtoolsVarious LaTeX packagesMark Wooding7 weeks ago
scadOpenSCAD models that I've... Mark Wooding3 months ago
straymanLaTeX document class for vario... Mark Wooding2 months ago
udpkeyTransmit and receive cryptogra... Mark Wooding7 years ago
vmctlConstrained VM management... Mark Wooding8 years ago
+ + + + + diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw new file mode 100644 index 0000000..e9c5019 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw @@ -0,0 +1,145 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git + + + + + + + + + +
+ +

These are the GIT repositories for some of my various free software + projects, and some other projects I hack on or just find useful to + have local copies of. Feel free to browse them here.

+ +

The primary source for browsing these projects is + https://git.distorted.org.uk/~mdw/. + There's a similar browser at https://www.chiark.greenend.org.uk/ucgi/~mdw/git/ + which might be faster, or more available, but very slightly less + up-to-date.

+ +

Project foo can be cloned using any of the following URLs: +

+ The https://… URLs are recommended if you can use + them, because they provide a measure of authenticity (as well as the + obvious privacy benefits). +

+ +

In order to build many of these projects, you'll need to build and + install cfd, and quite possibly one or more of the + libraries mLib and catacomb. You'll also need + recent-ish Autoconf, Automake and Libtool, and the Autoconf archive. + General procedure is as follows: +

+ If you wanted to build Debian packages, run mdw-setup + –d instead. This will skip making a build + directory, which is good because otherwise it interferes with the + Debian build process. The various debian/rules targets + should work OK after that.

+ +

Please mail me patches!

+
+
+
+ + + +
+List all projects
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ProjectDescriptionOwnerLast Change
firewallFirewall scripts for distorted... Mark Wooding3 months ago
doc/ipsIntroduction to Provable Secur... Mark Wooding16 years ago
mdwtoolsVarious LaTeX packagesMark Wooding7 weeks ago
scadOpenSCAD models that I've... Mark Wooding3 months ago
straymanLaTeX document class for vario... Mark Wooding2 months ago
udpkeyTransmit and receive cryptogra... Mark Wooding7 years ago
vmctlConstrained VM management... Mark Wooding8 years ago
+ + + + + diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips new file mode 100644 index 0000000..83a9065 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips @@ -0,0 +1,179 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - doc/ips/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + +
descriptionIntroduction to Provable Security slides and notes
ownerMark Wooding
last changeWed, 1 Nov 2006 14:32:34 +0000 (14:32 +0000)
+
+shortlog +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
2006-11-01Mark Woodingips.cls: Fix the page size of the PDF output. master
2006-11-01Mark WoodingMerge ponder:doc/ips
2006-11-01Mark Woodingauth-mac: Rewrite the stuff about universal hashing.
2006-09-12Mark WoodingRemove unnecessary $2^{-L}$ term from the AXU-hashing...
2006-03-02Mark WoodingKill obsolete setup script; set up gitiginore.
2006-03-02Mark WoodingMakefile: Do subdirectory builds correctly.
2006-03-02Mark WoodingExpunge revision histories.
2006-03-02mdwenc-ies: Various tweakings and tidyings. svn
2006-03-02mdwcls: Move amssymb earlier to prevent interference.
2004-09-04mdwThe Great Upheaval -- step 1. 1.1.1
2002-07-17mdwVarious small fixes.
2002-02-24mdwNew build system.
2002-02-24mdwIgnore new files.
2002-02-24mdwPut bibliography database list in one place.
2002-02-24mdwMove most of the hacking into `mdwslides.dtx'.
2002-02-24mdwNew build system.
...
+
+tags +
+ + + + + + + + + + + + + +
unknown1.1.1
unknown1.1.0
+
+heads +
+ + + + + + + + + +
16 years agomaster
17 years agosvn
+ + + + + diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall new file mode 100644 index 0000000..6113b2a --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall @@ -0,0 +1,167 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - firewall/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionFirewall scripts for distorted.org.uk.
ownerMark Wooding
last changeThu, 16 Mar 2023 18:09:32 +0000 (18:09 +0000)
+
+shortlog +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
2023-03-16Mark Woodinglocal.m4: Fix the IPv4 version of the `inbound-untruste... master
2023-02-25Mark Woodinglocal.mk, roadstar.m4: Move lpr service to roadstar...
2022-05-30Mark Wooding*.m4: Actually allow NFS to untrusted hosts.
2022-05-30Mark Woodinglocal.m4, etc.: Establish `inbound-untrusted' chain...
2022-05-30Mark Woodingfender.m4, ibanez.m4, vampire.m4: Invoke `footables...
2022-05-09Mark WoodingMerge branch 'master' of git.distorted.org.uk:~mdw...
2022-05-09Mark Woodingnumbers.m4, artist.m4: Add a second DisOrder port for...
2022-05-09Mark Woodinglocal.m4: Add `mdwdev.upn'.
2021-11-01Mark Woodingjazz.m4, numbers.m4: Allow Privoxy access to SGO VPN.
2021-02-03Mark Woodinglocal.m4: Update external NTP servers.
2020-04-08Mark Woodinglocal.m4: Add entry for new laptop `spirit'.
2018-12-26Mark Woodinglocal.m4, precision.m4: Introduce `vpnnat' network...
2018-12-26Mark Woodinglocal.mk: Reinstate mango.
2017-10-02Mark Woodinglocal.m4: Filter out source routing in the firewall.
2017-10-02Mark Woodinglocal.m4: Don't expect `forbidden' to return.
2017-10-01Mark Woodinglocal.m4: Add the `hippotat' network.
...
+
+heads +
+ + + + + + + + + + + + + +
3 months agomaster
8 years agojaguar
11 years agoemergency
+ + + + + \ No newline at end of file diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools new file mode 100644 index 0000000..6ab1686 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_mdwtools @@ -0,0 +1,204 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - mdwtools/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionVarious LaTeX packages
ownerMark Wooding
last changeMon, 8 Jun 2020 15:59:38 +0000 (16:59 +0100)
+
+shortlog +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
2020-06-08Mark Wooding.mdw-build.conf: Don't try `vpath' builds with this... master
2020-06-08Mark WoodingRelease 1.8.2. 1.8.2
2020-06-08Mark Woodingdebian/changelog: Delete trailing blank line.
2020-06-07Mark Woodingsverb.dtx: Include the `\jobname' in demo filenames.
2020-06-07Mark WoodingMakefile.m4: Fix dependencies for parallel building.
2020-06-07Mark Woodingsyntax.dtx: Disable ligatures in `\readupto'.
2019-08-24Mark WoodingRelease 1.8.1. 1.8.1
2019-08-24Mark WoodingMakefile.m4: Collect version using `auto-version'.
2019-08-24Mark WoodingMakefile.m4: Build PDF versions of the documents.
2019-08-02Mark WoodingRelease 1.8.0. 1.8.0
2019-08-02Mark Woodingmdwref.dtx: Add a useful output-formatting hook.
2016-01-24Mark Woodingmdwtab.dtx: Fix group nesting in `smarray'.
2016-01-24Mark WoodingEliminate tabs from TeX input files.
2015-11-17Mark Woodingsyntax.dtx: Allow decorative material following nonterm...
2015-10-06Mark Woodingmdwtab.dtx: Cope when \if@leqno is frobbed dynamically.
2015-10-04Mark Woodingconfigure.in: Automake is now pickier about ordering.
...
+
+tags +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
3 years ago1.8.2Release 1.8.2.
3 years ago1.8.1Release 1.8.1.
3 years ago1.8.0Release 1.8.0.
10 years ago1.7.0Release 1.7.0.
unknown1.6.1
unknown1.6.0
+
+heads +
+ + + + + + + + + +
7 weeks agomdw/tangle
3 years agomaster
+ + + + + \ No newline at end of file diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad new file mode 100644 index 0000000..af9c7dc --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_scad @@ -0,0 +1,91 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - scad/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionOpenSCAD models that I've designed.
ownerMark Wooding
last changeWed, 15 Mar 2023 00:57:55 +0000 (00:57 +0000)
+
+shortlog +
+ + + + + + + + + + + + + + + + +
2023-03-15Mark Woodingdiscpick-tensioner.scad: Improved tensioning component... master
2022-10-28Mark WoodingAdd a build system.
2022-09-30Mark Woodingdiscpick-collar.scad: Successful print.
+
+heads +
+ + + + + +
3 months agomaster
+ + + + + \ No newline at end of file diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman new file mode 100644 index 0000000..f13ea8b --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_strayman @@ -0,0 +1,210 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - strayman/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionLaTeX document class for various documents
ownerMark Wooding
last changeMon, 8 Jun 2020 16:00:49 +0000 (17:00 +0100)
+
+shortlog +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
2020-06-08Mark Wooding.mdw-build.conf: Don't try `vpath' builds with this... master
2019-08-24Mark WoodingRelease 1.1.5. 1.1.5
2019-08-24Mark Woodingdebian/copyright: Switch to machine-readable copyright...
2019-08-24Mark Woodingstrayman.dtx: Move left-side headers and footers into...
2019-08-07Mark WoodingRelease 1.1.4. 1.1.4
2018-06-30Mark Woodingstrayman.dtx: Forbid page breaks before lists attached...
2018-06-22Mark WoodingRelease 1.1.3.2. 1.1.3.2
2018-06-21Mark Woodingdebian/control: Add Build-Depends: ghostscript because...
2018-06-21Mark Woodingstrayman.dtx (\part): Don't establish the label referen...
2016-05-05Mark Woodingstrayman.dtx: Whitespace fixes.
2013-07-08Mark WoodingActually arrange to distribute the Automake helper... 1.1.3.1
2013-07-08Mark Wooding.gitignore: Ignore `auto-version' script.
2012-05-05Mark WoodingDebianization! 1.1.3
2012-04-10Mark WoodingBuild: version from Git.
2012-04-10Mark WoodingWhitespace fixups.
2012-04-10Mark WoodingGenerate Postscript and PDF versions of documents.
...
+
+tags +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
3 years ago1.1.5Release 1.1.5.
3 years ago1.1.4Release 1.1.4.
5 years ago1.1.3.2Release 1.1.3.2.
10 years ago1.1.3.1Release 1.1.3.1.
11 years ago1.1.3Release 1.1.3.
11 years ago1.1.2Release 1.1.2.
11 years ago1.1.0Release 1.1.0.
+
+heads +
+ + + + + + + + + +
2 months agomdw/tangle
3 years agomaster
+ + + + + \ No newline at end of file diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey new file mode 100644 index 0000000..f4524e5 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_udpkey @@ -0,0 +1,190 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - udpkey/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionTransmit and receive cryptographic keys over UDP; useful during boot.
ownerMark Wooding
last changeThu, 18 Feb 2016 17:53:27 +0000 (17:53 +0000)
+
+shortlog +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
2016-02-18Mark WoodingRelease 1.0.2. master 1.0.2
2016-02-18Mark Woodingdebian/control: Fix the Build-Depends.
2016-02-18Mark Woodingdebian/source/format: Apparently I'm meant to have...
2013-06-29Mark Woodingudpkey.c: Fix typos in commentary.
2013-06-29Mark Woodingudpkey.1: Prepend a line telling man(1)'s that it shoul...
2013-06-29Mark Woodingudpkey.1: Fix misformatting in a syntax display.
2013-06-29Mark WoodingRelease 1.0.1. 1.0.1
2013-06-29Mark Woodingdebian/udpkey.keyscript: Don't send network setup chatt...
2013-06-29Mark Woodingdebian/udpkey.initramfs-hook: Ensure seed is not public...
2013-06-29Mark WoodingMakefile.am: Distribute the extra Debian files.
2013-06-29Mark WoodingAnnotate `printf'-like functions for better warnings.
2013-06-29Mark Woodingudpkey.c: Missing newline in version string.
2013-06-29Mark WoodingUse constant-time comparison for checking MAC tags.
2013-06-29Mark Woodingudpkey.c: Fix format-string error.
2013-06-29Mark Woodingudpkey.1: Some more tweaks to the manpage.
2013-06-28Mark Wooding.gitignore: Ignore the `autom4te.cache' directory.
...
+
+tags +
+ + + + + + + + + + + + + + + + + + + +
7 years ago1.0.2Release 1.0.2.
10 years ago1.0.1Release 1.0.1.
10 years ago1.0.0Release 1.0.0.
+
+heads +
+ + + + + + + + + + + + + +
7 years agomdw/wip.crybaby.2016-05-05
7 years agomaster
9 years agomdw/fwd-sec
+ + + + + \ No newline at end of file diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl new file mode 100644 index 0000000..7232d60 --- /dev/null +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_vmctl @@ -0,0 +1,91 @@ + + + + + + + + + +mdw@git.distorted.org.uk Git - vmctl/summary + + + + + + + + + + + +
+
+ +
 
+ + + + + + +
descriptionConstrained VM management, via SSH.
ownerMark Wooding
last changeSat, 4 Apr 2015 12:36:59 +0000 (13:36 +0100)
+
+shortlog +
+ + + + + + + + + + + + + + + + +
2015-04-04Mark Wooding.ssh/Makefile: Add dependency on sshsvc.conf. master
2013-09-03Mark Wooding.ssh: Generate the awful authorized_keys file.
2012-08-26Mark WoodingInitial version.
+
+heads +
+ + + + + +
8 years agomaster
+ + + + +
diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py
new file mode 100644
index 0000000..1c83e7d
--- /dev/null
+++ b/swh/lister/gitweb/tests/test_lister.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import Any, Dict, List
+
+import pytest
+
+from swh.lister import __version__
+from swh.lister.gitweb.lister import (
+    GitwebLister,
+    parse_last_update,
+    try_to_determine_git_repository,
+)
+from swh.lister.pattern import ListerStats
+
+MAIN_INSTANCE = "git.distorted.org.uk"
+MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}/~mdw"
+
+
+def test_lister_gitweb_instantiate(swh_scheduler):
+    """Build a lister with either an url or an instance is supported."""
+    url = MAIN_INSTANCE_URL
+    lister = GitwebLister(swh_scheduler, url=url)
+    assert lister is not None
+    assert lister.url == url
+
+    lister = GitwebLister(swh_scheduler, instance=MAIN_INSTANCE)
+    assert lister is not None
+    assert lister.url == f"https://{MAIN_INSTANCE}"
+
+
+def test_lister_gitweb_fail_to_instantiate(swh_scheduler):
+    """Build a lister without its url nor its instance should raise"""
+    # ... It will raise without any of those
+    with pytest.raises(ValueError, match="'url' or 'instance'"):
+        GitwebLister(swh_scheduler)
+
+
+def test_lister_gitweb_get_pages(requests_mock_datadir, swh_scheduler):
+    """Computing the number of pages scrapped during a listing."""
+    url = MAIN_INSTANCE_URL
+    lister_gitweb = GitwebLister(swh_scheduler, url=url)
+
+    expected_nb_origins = 7
+
+    repos: List[List[Dict[str, Any]]] = list(lister_gitweb.get_pages())
+    flattened_repos = sum(repos, [])
+    assert len(flattened_repos) == expected_nb_origins
+
+    for listed_url in flattened_repos:
+        assert listed_url["url"].startswith(url)
+
+
+def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
+    """Gitweb lister nominal listing case."""
+
+    url = MAIN_INSTANCE_URL
+    lister_gitweb = GitwebLister(swh_scheduler, url=url)
+
+    stats = lister_gitweb.run()
+
+    expected_nb_origins = 7  # main page will get filtered out
+    assert stats == ListerStats(pages=1, origins=expected_nb_origins)
+
+    # test page parsing
+    scheduler_origins = swh_scheduler.get_listed_origins(
+        lister_gitweb.lister_obj.id
+    ).results
+    assert len(scheduler_origins) == expected_nb_origins
+
+    assert url.startswith("https://")
+
+    # test listed repositories
+    for listed_origin in scheduler_origins:
+        assert listed_origin.visit_type == "git"
+        assert listed_origin.url.startswith(url)
+        assert listed_origin.url.startswith("https://")
+        assert listed_origin.last_update is not None
+        assert "," not in listed_origin.url
+
+    # test user agent content
+    for request in requests_mock_datadir.request_history:
+        assert "User-Agent" in request.headers
+        user_agent = request.headers["User-Agent"]
+        assert "Software Heritage gitweb lister" in user_agent
+        assert __version__ in user_agent
+
+
+def test_lister_gitweb_get_pages_with_pages_and_retry(
+    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
+):
+    """Rate limited page are tested back after some time so ingestion can proceed."""
+    url = MAIN_INSTANCE_URL
+    with open(os.path.join(datadir, f"https_{MAIN_INSTANCE}/~mdw"), "rb") as page:
+        requests_mock.get(
+            url,
+            [
+                {"content": None, "status_code": 429},
+                {"content": None, "status_code": 429},
+                {"content": page.read(), "status_code": 200},
+            ],
+        )
+
+        lister_gitweb = GitwebLister(swh_scheduler, url=url)
+
+        mocker.patch.object(lister_gitweb.http_request.retry, "sleep")
+
+        pages: List[List[Dict[str, Any]]] = list(lister_gitweb.get_pages())
+        flattened_repos = sum(pages, [])
+        assert len(pages) == 1
+        assert len(flattened_repos) == 7
+
+
+def test_lister_gitweb_get_origin_from_repo_failing(
+    swh_scheduler, requests_mock_datadir
+):
+    """Instances whose summary does not return anything are filtered out."""
+    # This instance has some more origins which no longer returns their summary
+    lister_gitweb = GitwebLister(swh_scheduler, url=f"https://{MAIN_INSTANCE}/foobar")
+
+    stats = lister_gitweb.run()
+
+    # so they are filtered out, only the 7 we know are thus listed
+    expected_nb_origins = 7
+    assert stats == ListerStats(pages=1, origins=expected_nb_origins)
+
+
+@pytest.mark.parametrize(
+    "url,expected_repo",
+    [
+        (
+            "https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
+            "git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
+        ),
+        (
+            "https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
+            "git://git.shadowcat.co.uk/File-Slurp.git",
+        ),
+        ("https://domain.org/foobar", None),
+    ],
+)
+def test_try_to_determine_git_repository(url, expected_repo):
+    assert try_to_determine_git_repository(url) == expected_repo
+
+
+def test_parse_last_update():
+    assert parse_last_update(None) is None
+    assert parse_last_update("No commits") is None
+
+    date = parse_last_update("6 months ago")
+    assert date is not None
+    assert date.tzinfo is not None
diff --git a/swh/lister/gitweb/tests/test_tasks.py b/swh/lister/gitweb/tests/test_tasks.py
new file mode 100644
index 0000000..1e5cc34
--- /dev/null
+++ b/swh/lister/gitweb/tests/test_tasks.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2023 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_gitweb_lister_task(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    # setup the mocked GitwebLister
+    lister = mocker.patch("swh.lister.gitweb.tasks.GitwebLister")
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict(
+        url="https://git.gentoo.org/", instance="kernel", base_git_url=None, max_pages=1
+    )
+
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.gitweb.tasks.GitwebListerTask",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.from_configfile.assert_called_once_with(**kwargs)
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
index 19411c9..29a9a01 100644
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -42,6 +42,9 @@ lister_args = {
         "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
     },
     "pagure": {"instance": "pagure.io"},
+    "gitweb": {
+        "url": "https://git.distorted.org.uk/~mdw/",
+    },
 }