Use beautifulsoup4 CSS selectors to simplify code and type checking

Because types-beautifulsoup4 is a swh-scanner test dependency, it gets installed in the swh virtualenv, and mypy then reported some errors related to beautifulsoup4 typing. The find method of bs4 returns the union Tag | NavigableString | None, so isinstance calls are needed to narrow the type properly, which is cumbersome. Prefer the select_one method instead: it returns Optional[Tag], so a simple None check is enough to ensure correct typing. Similarly, replace uses of the find_all method with the select method. This also has the advantage of simplifying the code.
2024-04-15 16:58:46 +02:00 · 2024-04-15 16:58:46 +02:00 · 41407e0eff
commit 41407e0eff
parent e6a35c55b0
10 changed files with 100 additions and 100 deletions
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
        )
        response = self.http_request(url)
        soup = BeautifulSoup(response.text, "html.parser")
-        links = soup.find_all("a", href=True)
+        links = soup.select("a[href]")

        # drop the first line (used to go to up directory)
-        if links[0].attrs["href"] == "../":
+        if links and links[0].attrs["href"] == "../":
            links.pop(0)

        versions = []
@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
                    arch = m.group("arch")
                    version = m.group("version")

-                # Extract last_modified and an approximate file size
+                # Extract last_modified date
+                last_modified = None
                raw_text = link.next_sibling
-                raw_text_rex = re.compile(
-                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
-                )
-                s = raw_text_rex.search(raw_text.strip())
-                if s is None:
-                    logger.error(
-                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
-                        dict(raw_text=raw_text),
+                if raw_text:
+                    raw_text_rex = re.compile(
+                        r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                    )
-                else:
-                    values = s.groups()
-                    assert values and len(values) == 1
-                    last_modified_str = values[0]
+                    s = raw_text_rex.search(raw_text.text.strip())
+                    if s is None:
+                        logger.error(
+                            "Can not find a match for 'last_modified' in '%(raw_text)s'",
+                            dict(raw_text=raw_text),
+                        )
+                    else:
+                        values = s.groups()
+                        assert values and len(values) == 1
+                        last_modified_str = values[0]

-                # format as expected
-                last_modified = datetime.datetime.strptime(
-                    last_modified_str, "%d-%b-%Y %H:%M"
-                ).isoformat()
+                    # format as expected
+                    last_modified = datetime.datetime.strptime(
+                        last_modified_str, "%d-%b-%Y %H:%M"
+                    ).isoformat()

                # link url is relative, format a canonical one
                url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(