Use beautifulsoup4 CSS selectors to simplify code and type checking
The types-beautifulsoup4 package gets installed in the swh virtualenv as a swh-scanner test dependency, which made mypy report errors related to beautifulsoup4 typing. The find method of bs4 is typed as returning the union Tag | NavigableString | None, so isinstance calls must be used to narrow the type, which is not great. Prefer the select_one method instead, which returns Optional[Tag]: a simple None check is enough to ensure the typing is correct. In a similar manner, replace uses of the find_all method with the select method, which returns Tag elements only. This also has the advantage of simplifying the code.
parent e6a35c55b0
commit 41407e0eff

10 changed files with 100 additions and 100 deletions
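To illustrate the typing difference described above, here is a minimal standalone sketch; it is not part of the commit, and the HTML snippet and variable names are invented:

    from typing import Optional

    from bs4 import BeautifulSoup, Tag

    soup = BeautifulSoup("<div class='content'><a href='/repo'>repo</a></div>", "html.parser")

    # find() is typed as returning Tag | NavigableString | None, so mypy
    # requires an isinstance check before Tag-only attributes can be accessed:
    node = soup.find("a")
    if isinstance(node, Tag):
        print(node.attrs["href"])

    # select_one() is typed as returning Optional[Tag]: a plain None check
    # is enough to narrow the type:
    link: Optional[Tag] = soup.select_one("a[href]")
    if link is not None:
        print(link.attrs["href"])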
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023 The Software Heritage developers
+# Copyright (C) 2022-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -128,10 +128,10 @@ class ArchLister(StatelessLister[ArchListerPage]):
         )
         response = self.http_request(url)
         soup = BeautifulSoup(response.text, "html.parser")
-        links = soup.find_all("a", href=True)
+        links = soup.select("a[href]")

         # drop the first line (used to go to up directory)
-        if links[0].attrs["href"] == "../":
+        if links and links[0].attrs["href"] == "../":
             links.pop(0)

         versions = []
@@ -156,26 +156,28 @@ class ArchLister(StatelessLister[ArchListerPage]):
                 arch = m.group("arch")
                 version = m.group("version")

-                # Extract last_modified and an approximate file size
+                # Extract last_modified date
                 last_modified = None
                 raw_text = link.next_sibling
-                raw_text_rex = re.compile(
-                    r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
-                )
-                s = raw_text_rex.search(raw_text.strip())
-                if s is None:
-                    logger.error(
-                        "Can not find a match for 'last_modified' in '%(raw_text)s'",
-                        dict(raw_text=raw_text),
+                if raw_text:
+                    raw_text_rex = re.compile(
+                        r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+.*$"
                     )
-                else:
-                    values = s.groups()
-                    assert values and len(values) == 1
-                    last_modified_str = values[0]
+                    s = raw_text_rex.search(raw_text.text.strip())
+                    if s is None:
+                        logger.error(
+                            "Can not find a match for 'last_modified' in '%(raw_text)s'",
+                            dict(raw_text=raw_text),
+                        )
+                    else:
+                        values = s.groups()
+                        assert values and len(values) == 1
+                        last_modified_str = values[0]

-                    # format as expected
-                    last_modified = datetime.datetime.strptime(
-                        last_modified_str, "%d-%b-%Y %H:%M"
-                    ).isoformat()
+                        # format as expected
+                        last_modified = datetime.datetime.strptime(
+                            last_modified_str, "%d-%b-%Y %H:%M"
+                        ).isoformat()

                 # link url is relative, format a canonical one
                 url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
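The find_all("a", href=True) to select("a[href]") change above is the general pattern applied throughout this commit: both keep only the <a> elements that carry an href attribute, as this standalone sketch (with invented HTML) suggests:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a href='pkg/'>pkg</a><a>no href</a>", "html.parser")

    # Same tags selected either way, but select() is typed as returning
    # Tag elements only, with no NavigableString in the union.
    assert soup.find_all("a", href=True) == soup.select("a[href]")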
@@ -1,4 +1,4 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -146,10 +146,11 @@ class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]
             f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
         ).text
         bs = BeautifulSoup(html, "html.parser")

         return [
-            tr.find_all("td")[0].text
-            for tr in reversed(bs.find("table").find("tbody").find_all("tr"))
-            if tr.find_all("td")[2].find("a")
+            tr.select("td")[0].text
+            for tr in reversed(bs.select("table tbody tr"))
+            if tr.select("td")[2].select("a")
         ]

     def parse_packages(self, text: str) -> Dict[str, Any]:
@@ -1,10 +1,9 @@
-# Copyright (C) 2019-2023 The Software Heritage developers
+# Copyright (C) 2019-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse

@@ -95,14 +94,16 @@ class CGitLister(StatelessLister[Repositories]):

         page_results = []

-        for tr in bs_idx.find("div", {"class": "content"}).find_all(
-            "tr", {"class": ""}
-        ):
-            repository_link = tr.find("a")["href"]
+        for tr in bs_idx.select("div.content tr:not([class])"):
+            repository_link = tr.select_one("a")
+
+            if repository_link is None:
+                continue

             repo_url = None
             git_url = None

-            base_url = urljoin(self.url, repository_link).strip("/")
+            base_url = urljoin(self.url, repository_link.attrs["href"]).strip("/")
             if self.base_git_url:  # mapping provided
                 # computing git url
                 git_url = base_url.replace(self.url, self.base_git_url)
@@ -111,7 +112,7 @@ class CGitLister(StatelessLister[Repositories]):
             # the git url (cf. self.get_origins_from_page)
             repo_url = base_url

-            span = tr.find("span", {"class": re.compile("age-")})
+            span = tr.select_one('span[class^="age-"]')
             last_updated_date = span.get("title") if span else None

             page_results.append(
@@ -125,12 +126,14 @@ class CGitLister(StatelessLister[Repositories]):
         yield page_results

         try:
-            pager = bs_idx.find("ul", {"class": "pager"})
-
-            current_page = pager.find("a", {"class": "current"})
-            if current_page:
-                next_page = current_page.parent.next_sibling.a["href"]
-                next_page = urljoin(self.url, next_page)
+            next_page_li = bs_idx.select_one(
+                "ul.pager li:has(> a.current) + li:has(> a)"
+            )
+            next_page = (
+                urljoin(self.url, next_page_li.select("a")[0].attrs["href"])
+                if next_page_li
+                else None
+            )
         except (AttributeError, KeyError):
             # no pager, or no next page
             next_page = None
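The pager rewrite just above is one spot where the selector does more than translate a find call: li:has(> a.current) + li:has(> a) walks from the current page's <li> to the next element sibling entirely inside the selector, whereas the old parent.next_sibling navigation could land on a whitespace text node. A standalone sketch, assuming a minimal invented cgit-like pager markup:

    from bs4 import BeautifulSoup

    html = """
    <ul class='pager'>
      <li><a class='current' href='?ofs=0'>[1]</a></li>
      <li><a href='?ofs=50'>[2]</a></li>
    </ul>
    """
    soup = BeautifulSoup(html, "html.parser")

    # li:has(> a.current) matches the <li> wrapping the current page link;
    # "+ li:has(> a)" then selects the immediately following <li>, and only
    # if it contains a link (i.e. there actually is a next page).
    next_page_li = soup.select_one("ul.pager li:has(> a.current) + li:has(> a)")
    if next_page_li is not None:
        print(next_page_li.select("a")[0].attrs["href"])  # ?ofs=50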
@@ -169,27 +172,26 @@ class CGitLister(StatelessLister[Repositories]):
             return None

         # check if we are on the summary tab, if not, go to this tab
-        tab = bs.find("table", {"class": "tabs"})
-        if tab:
-            summary_a = tab.find("a", string="summary")
-            if summary_a:
-                summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
+        summary_a = bs.select_one('table.tabs a:-soup-contains("summary")')
+        if summary_a:
+            summary_path = summary_a.attrs["href"]
+            summary_url = urljoin(repository_url, summary_path).strip("/")

-                if summary_url != repository_url:
-                    logger.debug(
-                        "%s : Active tab is not the summary, trying to load the summary page",
-                        repository_url,
-                    )
-                    return self._get_origin_from_repository_url(summary_url)
-            else:
-                logger.debug("No summary tab found on %s", repository_url)
+            if summary_url != repository_url:
+                logger.debug(
+                    "%s : Active tab is not the summary, trying to load the summary page",
+                    repository_url,
+                )
+                return self._get_origin_from_repository_url(summary_url)
+        else:
+            logger.debug("No summary tab found on %s", repository_url)

         # origin urls are listed on the repository page
         # TODO check if forcing https is better or not ?
         # <link rel='vcs-git' href='git://...' title='...'/>
         # <link rel='vcs-git' href='http://...' title='...'/>
         # <link rel='vcs-git' href='https://...' title='...'/>
-        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
+        urls = [x.attrs["href"] for x in bs.select('a[rel="vcs-git"]')]

         if not urls:
             logger.debug("No git urls found on %s", repository_url)
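Note that :-soup-contains() used above is a soupsieve extension rather than standard CSS, and it does substring matching on the element text, whereas the removed find("a", string="summary") matched the full string; for cgit's one-word tab labels the two behave the same. A standalone sketch with invented markup:

    from bs4 import BeautifulSoup

    html = "<table class='tabs'><tr><td><a href='/repo/'>summary</a><a href='/repo/log/'>log</a></td></tr></table>"
    soup = BeautifulSoup(html, "html.parser")

    # Matches the <a> whose text contains "summary", replacing the old
    # find("table", {"class": "tabs"}) / find("a", string="summary") chain.
    summary_a = soup.select_one('table.tabs a:-soup-contains("summary")')
    if summary_a is not None:
        print(summary_a.attrs["href"])  # /repo/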
@@ -1,10 +1,9 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import parse_qs, urljoin, urlparse

@@ -80,14 +79,12 @@ class GitwebLister(StatelessLister[Repositories]):

         page_results = []

-        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
-            "tr"
-        ):
-            link = tr.find("a")
+        for tr in bs_idx.select("table.project_list tr"):
+            link = tr.select_one("a")
             if not link:
                 continue

-            repo_url = urljoin(self.url, link["href"]).strip("/")
+            repo_url = urljoin(self.url, link.attrs["href"]).strip("/")

             # Skip this description page which is listed but won't yield any origins to list
             if repo_url.endswith("?o=descr"):
@@ -95,7 +92,7 @@ class GitwebLister(StatelessLister[Repositories]):

             # This retrieves the date interval in natural language (e.g. '9 years ago')
             # to actual python datetime interval so we can derive last update
-            span = tr.find("td", {"class": re.compile("age.*")})
+            span = tr.select_one('td[class^="age"]')
             page_results.append(
                 {"url": repo_url, "last_update_interval": span.text if span else None}
             )
@@ -134,8 +131,8 @@ class GitwebLister(StatelessLister[Repositories]):
             return None

         urls = []
-        for row in bs.find_all("tr", {"class": "metadata_url"}):
-            url = row.contents[-1].string.strip()
+        for row in bs.select("tr.metadata_url"):
+            url = row.select("td")[-1].text.strip()
             for scheme in ("http", "https", "git"):
                 # remove any string prefix before origin
                 pos = url.find(f"{scheme}://")
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2022 The Software Heritage developers
+# Copyright (C) 2021-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -226,32 +226,22 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
         logger.info("Found %s poms.", len(out_pom))

         # Now fetch pom files and scan them for scm info.

-        logger.info("Fetching poms..")
+        logger.info("Fetching poms ...")
         for pom_url in out_pom:
             try:
                 response = self.http_request(pom_url)
                 parsed_pom = BeautifulSoup(response.content, "xml")
-                project = parsed_pom.find("project")
-                if project is None:
-                    continue
-                scm = project.find("scm")
-                if scm is not None:
-                    connection = scm.find("connection")
-                    if connection is not None:
-                        artifact_metadata_d = {
-                            "type": "scm",
-                            "doc": out_pom[pom_url],
-                            "url": connection.text,
-                        }
-                        logger.debug(
-                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
-                        )
-                        yield artifact_metadata_d
-                    else:
-                        logger.debug("No scm.connection in pom %s", pom_url)
+                connection = parsed_pom.select_one("project scm connection")
+                if connection is not None:
+                    artifact_metadata_d = {
+                        "type": "scm",
+                        "doc": out_pom[pom_url],
+                        "url": connection.text,
+                    }
+                    logger.debug("* Yielding pom %s: %s", pom_url, artifact_metadata_d)
+                    yield artifact_metadata_d
                 else:
-                    logger.debug("No scm in pom %s", pom_url)
+                    logger.debug("No project.scm.connection in pom %s", pom_url)
             except requests.HTTPError:
                 logger.warning(
                     "POM info page could not be fetched, skipping project '%s'",
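In the Maven hunk above, one descendant selector stands in for three nested find calls: select_one("project scm connection") returns None as soon as any of the three levels is missing, so the separate project/scm None checks disappear. A standalone sketch with an invented minimal pom (parsing with the "xml" features requires lxml to be installed):

    from bs4 import BeautifulSoup

    pom = """<?xml version="1.0"?>
    <project>
      <scm>
        <connection>scm:git:git://example.org/demo.git</connection>
      </scm>
    </project>"""

    parsed_pom = BeautifulSoup(pom, "xml")

    # None if project, scm or connection is absent; a Tag otherwise.
    connection = parsed_pom.select_one("project scm connection")
    if connection is not None:
        print(connection.text)  # scm:git:git://example.org/demo.git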
@@ -146,7 +146,7 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
                )
                continue
            xml = BeautifulSoup(res_metadata.content, "xml")
-           repo = xml.find("repository")
+           repo = xml.select_one("repository")
            if repo and "url" in repo.attrs and "type" in repo.attrs:
                vcs_url = repo.attrs["url"]
                vcs_type = repo.attrs["type"]
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023 The Software Heritage developers
+# Copyright (C) 2022-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -82,8 +82,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
     def get_latest_dump_file(self) -> str:
         response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
         xml = BeautifulSoup(response.content, "xml")
-        contents = xml.find_all("Contents")
-        return contents[-1].find("Key").text
+        contents = xml.select("Contents")
+        return contents[-1].select("Key")[0].text

     def create_rubygems_db(
         self, postgresql: Postgresql
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023 The Software Heritage developers
+# Copyright (C) 2021-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -367,7 +367,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
         else:
             bs = BeautifulSoup(response.text, features="html.parser")
             cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
-            for text in [b.text for b in bs.find_all("b")]:
+            for text in [b.text for b in bs.select("b")]:
                 match = re.search(rf".*/cvsroot/{project} co -P (.+)", text)
                 if match is not None:
                     module = match.group(1)
@@ -1,10 +1,9 @@
-# Copyright (C) 2023 The Software Heritage developers
+# Copyright (C) 2023-2024 The Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 import logging
-import re
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urlparse

@@ -73,15 +72,19 @@ class StagitLister(StatelessLister[Repositories]):

         page_results = []

-        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
-            link = tr.find("a")
+        index_table = bs_idx.select_one("table#index")
+        if index_table is None:
+            return
+
+        for tr in index_table.select("tr"):
+            link = tr.select_one("a")
             if not link:
                 continue

-            repo_description_url = self.url + "/" + link["href"]
+            repo_description_url = self.url + "/" + link.attrs["href"]

             # This retrieves the date in format "%Y-%m-%d %H:%M"
-            tds = tr.find_all("td")
+            tds = tr.select("td")
             last_update = tds[-1].text if tds and tds[-1] else None

             page_results.append(
@@ -122,10 +125,14 @@ class StagitLister(StatelessLister[Repositories]):
             return None

         urls = [
-            td.find("a")["href"]
-            for row in bs.find_all("tr", {"class": "url"})
-            for td in row.find_all("td")
-            if td.text.startswith("git clone")
+            a.attrs["href"]
+            for a in [
+                td.select_one("a")
+                for row in bs.select("tr.url")
+                for td in row.select("td")
+                if td.text.startswith("git clone")
+            ]
+            if a is not None
         ]

         if not urls:
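The Stagit comprehension above shows the cost of the stricter typing: td.select_one("a") is Optional[Tag], so the rewrite nests the comprehension and filters out None values instead of indexing the result of find directly. A standalone sketch with invented markup:

    from bs4 import BeautifulSoup

    html = """
    <tr class='url'><td>git clone</td><td>git clone <a href='git://host/repo'>git://host/repo</a></td></tr>
    """
    soup = BeautifulSoup(html, "html.parser")

    urls = [
        a.attrs["href"]
        for a in [
            td.select_one("a")  # Optional[Tag]: the first <td> has no link
            for row in soup.select("tr.url")
            for td in row.select("td")
            if td.text.startswith("git clone")
        ]
        if a is not None
    ]
    print(urls)  # ['git://host/repo']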