Use beautifulsoup4 CSS selectors to simplify code and type checking

Since the types-beautifulsoup4 package gets installed in the swh virtualenv
(it is a swh-scanner test dependency), mypy reported some errors
related to beautifulsoup4 typing.

The find method of bs4 returns the following union:
Tag | NavigableString | None, so isinstance calls must be used to ensure
proper typing, which is not great.

So prefer the select_one method instead: since it returns Optional[Tag],
a simple None check is enough to ensure typing is correct.
In a similar manner, replace uses of the find_all method with the select method.

It also has the advantage of simplifying the code.
This commit is contained in:
Antoine Lambert 2024-04-15 16:58:46 +02:00
parent e6a35c55b0
commit 41407e0eff
10 changed files with 100 additions and 100 deletions

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
)
response = self.http_request(url)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
links = soup.select("a[href]")
# drop the first line (used to go to up directory)
if links[0].attrs["href"] == "../":
if links and links[0].attrs["href"] == "../":
links.pop(0)
versions = []
@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
arch = m.group("arch")
version = m.group("version")
# Extract last_modified and an approximate file size
# Extract last_modified date
last_modified = None
raw_text = link.next_sibling
raw_text_rex = re.compile(
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
)
s = raw_text_rex.search(raw_text.strip())
if s is None:
logger.error(
"Can not find a match for 'last_modified' in '%(raw_text)s'",
dict(raw_text=raw_text),
if raw_text:
raw_text_rex = re.compile(
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
)
else:
values = s.groups()
assert values and len(values) == 1
last_modified_str = values[0]
s = raw_text_rex.search(raw_text.text.strip())
if s is None:
logger.error(
"Can not find a match for 'last_modified' in '%(raw_text)s'",
dict(raw_text=raw_text),
)
else:
values = s.groups()
assert values and len(values) == 1
last_modified_str = values[0]
# format as expected
last_modified = datetime.datetime.strptime(
last_modified_str, "%d-%b-%Y %H:%M"
).isoformat()
# format as expected
last_modified = datetime.datetime.strptime(
last_modified_str, "%d-%b-%Y %H:%M"
).isoformat()
# link url is relative, format a canonical one
url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(