From 2461c97bbbc430f5119968fc10c97f7b0cc60417 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <antoine.lambert@inria.fr>
Date: Fri, 5 Feb 2021 14:17:32 +0100
Subject: [PATCH] pypi: Use BeautifulSoup for parsing HTML instead of xmltodict

xmltodict now raises an error while trying to parse the HTML content
of the https://pypi.org/simple/ page.

So use the BeautifulSoup HTML parser instead, as it is already a requirement
of swh-lister and it does not fail when parsing the PyPI HTML page.

Also drop the no-longer-used xmltodict from the requirements.
---
 mypy.ini                  | 3 ---
 requirements.txt          | 1 -
 swh/lister/pypi/lister.py | 7 ++++---
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index c84a7e7..8aab2fa 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -36,6 +36,3 @@ ignore_missing_imports = True
 
 [mypy-urllib3.util.*]
 ignore_missing_imports = True
-
-[mypy-xmltodict.*]
-ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
index 34bf339..4f6c24e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 python_debian
 requests
 setuptools
-xmltodict
 iso8601
 beautifulsoup4
 launchpadlib
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index ae9874b..ad52e22 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -6,8 +6,8 @@
 import logging
 from typing import Iterator, List, Optional
 
+from bs4 import BeautifulSoup
 import requests
-import xmltodict
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]):
 
         response.raise_for_status()
 
-        page_xmldict = xmltodict.parse(response.content)
-        page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
+        page = BeautifulSoup(response.content, features="html.parser")
+
+        page_results = [p.text for p in page.find_all("a")]
 
         yield page_results