Use beautifulsoup4 CSS selectors to simplify code and type checking
Because the types-beautifulsoup4 package is installed in the swh virtualenv (it is a swh-scanner test dependency), mypy reported some errors related to beautifulsoup4 typing. The find method of bs4 returns the union Tag | NavigableString | None, so isinstance calls are needed to narrow the type properly, which is not great. Prefer the select_one method instead: it returns Optional[Tag], so a simple None check is enough to ensure correct typing. In a similar manner, replace uses of the find_all method with the select method. This also has the advantage of simplifying the code.
This commit is contained in:
parent
e6a35c55b0
commit
41407e0eff
10 changed files with 100 additions and 100 deletions
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2022-2023 The Software Heritage developers
|
||||
# Copyright (C) 2022-2024 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
)
|
||||
response = self.http_request(url)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
links = soup.find_all("a", href=True)
|
||||
links = soup.select("a[href]")
|
||||
|
||||
# drop the first line (used to go to up directory)
|
||||
if links[0].attrs["href"] == "../":
|
||||
if links and links[0].attrs["href"] == "../":
|
||||
links.pop(0)
|
||||
|
||||
versions = []
|
||||
|
@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
arch = m.group("arch")
|
||||
version = m.group("version")
|
||||
|
||||
# Extract last_modified and an approximate file size
|
||||
# Extract last_modified date
|
||||
last_modified = None
|
||||
raw_text = link.next_sibling
|
||||
raw_text_rex = re.compile(
|
||||
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
|
||||
)
|
||||
s = raw_text_rex.search(raw_text.strip())
|
||||
if s is None:
|
||||
logger.error(
|
||||
"Can not find a match for 'last_modified' in '%(raw_text)s'",
|
||||
dict(raw_text=raw_text),
|
||||
if raw_text:
|
||||
raw_text_rex = re.compile(
|
||||
r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
|
||||
)
|
||||
else:
|
||||
values = s.groups()
|
||||
assert values and len(values) == 1
|
||||
last_modified_str = values[0]
|
||||
s = raw_text_rex.search(raw_text.text.strip())
|
||||
if s is None:
|
||||
logger.error(
|
||||
"Can not find a match for 'last_modified' in '%(raw_text)s'",
|
||||
dict(raw_text=raw_text),
|
||||
)
|
||||
else:
|
||||
values = s.groups()
|
||||
assert values and len(values) == 1
|
||||
last_modified_str = values[0]
|
||||
|
||||
# format as expected
|
||||
last_modified = datetime.datetime.strptime(
|
||||
last_modified_str, "%d-%b-%Y %H:%M"
|
||||
).isoformat()
|
||||
# format as expected
|
||||
last_modified = datetime.datetime.strptime(
|
||||
last_modified_str, "%d-%b-%Y %H:%M"
|
||||
).isoformat()
|
||||
|
||||
# link url is relative, format a canonical one
|
||||
url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue