swh.lister.pypi: Use PyPI's legacy HTML-based API to list packages

The XML-RPC API is marked as deprecated [1]. Even if it has not been
removed yet, the legacy API is not marked as deprecated, so moving
towards that one sounds more reasonable [2].

[1] https://warehouse.readthedocs.io/api-reference/xml-rpc/#pypi-s-xml-rpc-methods

[2] https://warehouse.readthedocs.io/api-reference/legacy/#simple-project-api

Related T422
This commit is contained in:
Antoine R. Dumont (@ardumont) 2018-07-27 15:12:00 +02:00
parent 6ff3b90859
commit 3a65fbb4c8
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
3 changed files with 47 additions and 20 deletions

View file

@ -2,21 +2,23 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import xmltodict
from .models import PyPiModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerXMLRPCTransport
from swh.lister.core.lister_transports import ListerOnePageApiTransport
class PyPiLister(ListerXMLRPCTransport, SimpleLister):
class PyPiLister(ListerOnePageApiTransport, SimpleLister):
# Template path expecting an integer that represents the page id
MODEL = PyPiModel
LISTER_NAME = 'pypi'
SERVER = 'https://pypi.org/pypi'
PAGE = 'https://pypi.org/simple/'
def __init__(self, override_config=None):
ListerXMLRPCTransport.__init__(self)
ListerOnePageApiTransport .__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def task_dict(self, origin_type, origin_url, **kwargs):
@ -33,11 +35,13 @@ class PyPiLister(ListerXMLRPCTransport, SimpleLister):
_type, _policy, origin_url,
project_metadata_url=project_metadata_url)
def list_packages(self, client):
"""(Override) List the actual pypi origins from the api.
def list_packages(self, response):
"""(Override) List the actual pypi origins from the response.
"""
return client.list_packages()
result = xmltodict.parse(response.content)
_all = result['html']['body']['a']
return [package['#text'] for package in _all]
def _compute_urls(self, repo_name):
"""Returns a tuple (project_url, project_metadata_url)