pypi: Use BeautifulSoup for parsing HTML instead of xmltodict

xmltodict now raises an error while trying to parse the HTML content of the https://pypi.org/simple/ page. So use the BeautifulSoup HTML parser instead, as it is already a requirement of swh-lister and it does not fail when parsing the PyPI HTML page. Also drop the no-longer-used xmltodict from the requirements.
2021-02-05 14:17:32 +01:00 · 2021-02-05 14:17:32 +01:00 · 2461c97bbb
commit 2461c97bbb
parent 4245c5046f
3 changed files with 4 additions and 7 deletions
--- a/mypy.ini
+++ b/mypy.ini
@ -36,6 +36,3 @@ ignore_missing_imports = True

 [mypy-urllib3.util.*]
 ignore_missing_imports = True
-
-[mypy-xmltodict.*]
-ignore_missing_imports = True
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,6 @@
 python_debian
 requests
 setuptools
-xmltodict
 iso8601
 beautifulsoup4
 launchpadlib
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@ -6,8 +6,8 @@
 import logging
 from typing import Iterator, List, Optional

+from bs4 import BeautifulSoup
 import requests
-import xmltodict

 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]):

        response.raise_for_status()

-        page_xmldict = xmltodict.parse(response.content)
-        page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
+        page = BeautifulSoup(response.content, features="html.parser")
+
+        page_results = [p.text for p in page.find_all("a")]

        yield page_results