Use beautifulsoup4 CSS selectors to simplify code and type checking

Because the types-beautifulsoup4 package gets installed in the swh virtualenv
(it is a swh-scanner test dependency), mypy reported some errors
related to beautifulsoup4 typing.

The return type of the bs4 find method is the following union:
Tag | NavigableString | None, so isinstance calls must be used to ensure
proper typing, which is not great.

So prefer the select_one method instead: as it returns Optional[Tag], a
simple None check is enough to ensure typing is correct.
In a similar manner, replace uses of the find_all method with the select method.

It also has the advantage of simplifying the code.
This commit is contained in:
Antoine Lambert 2024-04-15 16:58:46 +02:00
parent e6a35c55b0
commit 41407e0eff
10 changed files with 100 additions and 100 deletions

View file

@ -1,10 +1,9 @@
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, urljoin, urlparse
@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):
page_results = []
for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
"tr"
):
link = tr.find("a")
for tr in bs_idx.select("table.project_list tr"):
link = tr.select_one("a")
if not link:
continue
repo_url = urljoin(self.url, link["href"]).strip("/")
repo_url = urljoin(self.url, link.attrs["href"]).strip("/")
# Skip this description page which is listed but won't yield any origins to list
if repo_url.endswith("?o=descr"):
@ -95,7 +92,7 @@ class GitwebLister(StatelessLister[Repositories]):
# This retrieves the date interval in natural language (e.g. '9 years ago')
# to actual python datetime interval so we can derive last update
span = tr.find("td", {"class": re.compile("age.*")})
span = tr.select_one('td[class^="age"]')
page_results.append(
{"url": repo_url, "last_update_interval": span.text if span else None}
)
@ -134,8 +131,8 @@ class GitwebLister(StatelessLister[Repositories]):
return None
urls = []
for row in bs.find_all("tr", {"class": "metadata_url"}):
url = row.contents[-1].string.strip()
for row in bs.select("tr.metadata_url"):
url = row.select("td")[-1].text.strip()
for scheme in ("http", "https", "git"):
# remove any string prefix before origin
pos = url.find(f"{scheme}://")