pypi: Use BeautifulSoup for parsing HTML instead of xmltodict
xmltodict now raises an error while trying to parse the HTML content of the https://pypi.org/simple/ page, so use the BeautifulSoup HTML parser instead, as it is already a requirement of swh-lister and does not fail when parsing the PyPI HTML page. Also drop the no-longer-used xmltodict from the requirements.
This commit is contained in:
parent
4245c5046f
commit
2461c97bbb
3 changed files with 4 additions and 7 deletions
|
@ -6,8 +6,8 @@
|
|||
import logging
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
@ -54,8 +54,9 @@ class PyPILister(StatelessLister[PackageListPage]):
|
|||
|
||||
response.raise_for_status()
|
||||
|
||||
page_xmldict = xmltodict.parse(response.content)
|
||||
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
|
||||
page = BeautifulSoup(response.content, features="html.parser")
|
||||
|
||||
page_results = [p.text for p in page.find_all("a")]
|
||||
|
||||
yield page_results
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue