diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index 7f77449..a1f346c 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -7,6 +7,7 @@ import random from datetime import datetime from email.utils import parsedate from pprint import pformat +from xmlrpc import client import requests import xmltodict @@ -35,15 +36,8 @@ class ListerXMLRPCTransport(abc.ABC): """Initialize client to query for result """ - from xmlrpc import client return client.ServerProxy(path) - def list_packages(self, client): - """Listing method - - """ - pass - def request_uri(self, _): """Same uri called once @@ -64,24 +58,22 @@ class ListerXMLRPCTransport(abc.ABC): return False, 0 def transport_request(self, identifier): - """Implements SWHListerBase.transport_request for HTTP using Requests. + """Implements SWHListerBase.transport_request """ path = self.request_uri(identifier) - # params = self.request_params(identifier) # we cannot use this... - try: - _client = self.get_client(path) - return self.list_packages(_client) + return self.get_client(path) except Exception as e: raise FetchError(e) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for XMLRPC given responses. + """ s = pformat(self.SERVER) - s += '\n#\n' + pformat(response) + s += '\n#\n' + pformat(response) # Note: will potentially be big return s @@ -216,3 +208,25 @@ class SWHListerHttpTransport(abc.ABC): except Exception: # not xml s += pformat(response.text) return s + + +class ListerOnePageApiTransport(SWHListerHttpTransport): + """Use the request library for retrieving a basic html page and parse + the result. + + To be used in conjunction with SWHListerBase or a subclass of it. + + """ + PAGE = AbstractAttribute("The server api's unique page to retrieve and " + "parse for information") + PATH_TEMPLATE = None # we do not use it + + def __init__(self, api_baseurl=None): + self.session = requests.Session() + self.lister_version = __version__ + + def request_uri(self, _): + """Get the full request URI given the transport_request identifier. + + """ + return self.PAGE diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py index 42b707c..c9e9d6b 100644 --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -2,6 +2,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import abc import logging from .lister_base import SWHListerBase @@ -17,6 +18,13 @@ class SimpleLister(SWHListerBase): information and stores those in db """ + @abc.abstractmethod + def list_packages(self, *args): + """Listing packages method. + + """ + pass + def ingest_data(self, identifier, checks=False): """Rework the base ingest_data. Request server endpoint which gives all in one go. @@ -32,6 +40,7 @@ class SimpleLister(SWHListerBase): """ # Request (partial?) list of repositories info response = self.safely_issue_request(identifier) + response = self.list_packages(response) if not response: return response, [] models_list = self.transport_response_simplified(response) diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 857b951..294f0b4 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -2,21 +2,23 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import xmltodict + from .models import PyPiModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister -from swh.lister.core.lister_transports import ListerXMLRPCTransport +from swh.lister.core.lister_transports import ListerOnePageApiTransport -class PyPiLister(ListerXMLRPCTransport, SimpleLister): +class PyPiLister(ListerOnePageApiTransport, SimpleLister): # Template path expecting an integer that represents the page id MODEL = PyPiModel LISTER_NAME = 'pypi' - SERVER = 'https://pypi.org/pypi' + PAGE = 'https://pypi.org/simple/' def __init__(self, override_config=None): - ListerXMLRPCTransport.__init__(self) + ListerOnePageApiTransport .__init__(self) SimpleLister.__init__(self, override_config=override_config) def task_dict(self, origin_type, origin_url, **kwargs): @@ -33,11 +35,13 @@ class PyPiLister(ListerXMLRPCTransport, SimpleLister): _type, _policy, origin_url, project_metadata_url=project_metadata_url) - def list_packages(self, client): - """(Override) List the actual pypi origins from the api. + def list_packages(self, response): + """(Override) List the actual pypi origins from the response. """ - return client.list_packages() + result = xmltodict.parse(response.content) + _all = result['html']['body']['a'] + return [package['#text'] for package in _all] def _compute_urls(self, repo_name): """Returns a tuple (project_url, project_metadata_url)