From 2461c97bbbc430f5119968fc10c97f7b0cc60417 Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Fri, 5 Feb 2021 14:17:32 +0100 Subject: [PATCH] pypi: Use BeautifulSoup for parsing HTML instead of xmltodict xmltodict now raises an error while trying to parse the HTML content of https://pypi.org/simple/ page. So use BeautifulSoup HTML parser instead as it is already a requirement of swh-lister and it does not fail parsing the PyPI HTML page. Also drop no longer used xmltodict in requirements. --- mypy.ini | 3 --- requirements.txt | 1 - swh/lister/pypi/lister.py | 7 ++++--- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/mypy.ini b/mypy.ini index c84a7e7..8aab2fa 100644 --- a/mypy.ini +++ b/mypy.ini @@ -36,6 +36,3 @@ ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True - -[mypy-xmltodict.*] -ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 34bf339..4f6c24e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ python_debian requests setuptools -xmltodict iso8601 beautifulsoup4 launchpadlib diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index ae9874b..ad52e22 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -6,8 +6,8 @@ import logging from typing import Iterator, List, Optional +from bs4 import BeautifulSoup import requests -import xmltodict from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]): response.raise_for_status() - page_xmldict = xmltodict.parse(response.content) - page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]] + page = BeautifulSoup(response.content, features="html.parser") + + page_results = [p.text for p in page.find_all("a")] yield page_results