pypi: Use BeautifulSoup for parsing HTML instead of xmltodict

xmltodict now raises an error while trying to parse the HTML content
of the https://pypi.org/simple/ page.

So use the BeautifulSoup HTML parser instead, as it is already a requirement
of swh-lister and it does not fail parsing the PyPI HTML page.

Also drop the no-longer-used xmltodict from the requirements.
This commit is contained in:
Antoine Lambert 2021-02-05 14:17:32 +01:00
parent 4245c5046f
commit 2461c97bbb
3 changed files with 4 additions and 7 deletions

View file

@ -36,6 +36,3 @@ ignore_missing_imports = True
[mypy-urllib3.util.*]
ignore_missing_imports = True
[mypy-xmltodict.*]
ignore_missing_imports = True

View file

@ -1,7 +1,6 @@
python_debian
requests
setuptools
xmltodict
iso8601
beautifulsoup4
launchpadlib

View file

@ -6,8 +6,8 @@
import logging
from typing import Iterator, List, Optional
from bs4 import BeautifulSoup
import requests
import xmltodict
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]):
response.raise_for_status()
page_xmldict = xmltodict.parse(response.content)
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
page = BeautifulSoup(response.content, features="html.parser")
page_results = [p.text for p in page.find_all("a")]
yield page_results