Use beautifulsoup4 CSS selectors to simplify code and type checking

As the types-beautifulsoup4 package gets installed in the swh virtualenv
as it is a swh-scanner test dependency, some mypy errors were reported
related to beautifulsoup4 typing.

As the returned type for the find method of bs4 is the following union:
Tag | NavigableString | None, isinstance calls must be used to ensure
proper typing which is not great.

So prefer to use the select_one method instead where a simple None check
must be done to ensure typing is correct as it is returning Optional[Tag].
In a similar manner, replace use of find_all method by select method.

It also has the advantage to simplify the code.
This commit is contained in:
Antoine Lambert 2024-04-15 16:58:46 +02:00
parent e6a35c55b0
commit 41407e0eff
10 changed files with 100 additions and 100 deletions

View file

@ -1,10 +1,9 @@
# Copyright (C) 2019-2023 The Software Heritage developers
# Copyright (C) 2019-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):
page_results = []
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
repository_link = tr.find("a")["href"]
for tr in bs_idx.select("div.content tr:not([class])"):
repository_link = tr.select_one("a")
if repository_link is None:
continue
repo_url = None
git_url = None
base_url = urljoin(self.url, repository_link).strip("/")
base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
if self.base_git_url: # mapping provided
# computing git url
git_url = base_url.replace(self.url, self.base_git_url)
@ -111,7 +112,7 @@ class CGitLister(StatelessLister[Repositories]):
# the git url (cf. self.get_origins_from_page)
repo_url = base_url
span = tr.find("span", {"class": re.compile("age-")})
span = tr.select_one('span[class^="age-"]')
last_updated_date = span.get("title") if span else None
page_results.append(
@ -125,12 +126,14 @@ class CGitLister(StatelessLister[Repositories]):
yield page_results
try:
pager = bs_idx.find("ul", {"class": "pager"})
current_page = pager.find("a", {"class": "current"})
if current_page:
next_page = current_page.parent.next_sibling.a["href"]
next_page = urljoin(self.url, next_page)
next_page_li = bs_idx.select_one(
"ul.pager li:has(> a.current) + li:has(> a)"
)
next_page = (
urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
if next_page_li
else None
)
except (AttributeError, KeyError):
# no pager, or no next page
next_page = None
@ -169,27 +172,26 @@ class CGitLister(StatelessLister[Repositories]):
return None
# check if we are on the summary tab, if not, go to this tab
tab = bs.find("table", {"class": "tabs"})
if tab:
summary_a = tab.find("a", string="summary")
if summary_a:
summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
if summary_a:
summary_path = summary_a.attrs["href"]
summary_url = urljoin(repository_url, summary_path).strip("/")
if summary_url != repository_url:
logger.debug(
"%s : Active tab is not the summary, trying to load the summary page",
repository_url,
)
return self._get_origin_from_repository_url(summary_url)
else:
logger.debug("No summary tab found on %s", repository_url)
if summary_url != repository_url:
logger.debug(
"%s : Active tab is not the summary, trying to load the summary page",
repository_url,
)
return self._get_origin_from_repository_url(summary_url)
else:
logger.debug("No summary tab found on %s", repository_url)
# origin urls are listed on the repository page
# TODO check if forcing https is better or not ?
# <link rel='vcs-git' href='git://...' title='...'/>
# <link rel='vcs-git' href='http://...' title='...'/>
# <link rel='vcs-git' href='https://...' title='...'/>
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]
if not urls:
logger.debug("No git urls found on %s", repository_url)