pypi: Use BeautifulSoup for parsing HTML instead of xmltodict
xmltodict now raises an error when trying to parse the HTML content of the https://pypi.org/simple/ page. Use the BeautifulSoup HTML parser instead, as it is already a requirement of swh-lister and it does not fail when parsing the PyPI HTML page. Also drop the no-longer-used xmltodict from the requirements.
This commit is contained in:
parent
4245c5046f
commit
2461c97bbb
3 changed files with 4 additions and 7 deletions
3
mypy.ini
3
mypy.ini
|
@ -36,6 +36,3 @@ ignore_missing_imports = True
|
|||
|
||||
[mypy-urllib3.util.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-xmltodict.*]
|
||||
ignore_missing_imports = True
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
python_debian
|
||||
requests
|
||||
setuptools
|
||||
xmltodict
|
||||
iso8601
|
||||
beautifulsoup4
|
||||
launchpadlib
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
import logging
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]):
|
|||
|
||||
response.raise_for_status()
|
||||
|
||||
page_xmldict = xmltodict.parse(response.content)
|
||||
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
|
||||
page = BeautifulSoup(response.content, features="html.parser")
|
||||
|
||||
page_results = [p.text for p in page.find_all("a")]
|
||||
|
||||
yield page_results
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue