From 6ff3b908595d72920c57e2c9fe37f778aec092fc Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Fri, 27 Jul 2018 13:54:08 +0200 Subject: [PATCH] swh.lister.pypi: Add a pypi lister implementation using xmlrpc api Based solely on pypi's deprecated xmlrpc api [1]. No other way of listing pypi.org is referenced (except for parsing an html page through a legacy api [2]) [1] https://warehouse.readthedocs.io/api-reference/xml-rpc/#pypi-s-xml-rpc-methods [2] https://pypi.python.org/simple/ Related T422 --- README.md | 39 ++++++++++++++++ swh/lister/cli.py | 7 ++- swh/lister/core/lister_transports.py | 67 +++++++++++++++++++++++++- swh/lister/core/simple_lister.py | 67 ++++++++++++++++++++++++++ swh/lister/pypi/__init__.py | 0 swh/lister/pypi/lister.py | 70 ++++++++++++++++++++++++++++ swh/lister/pypi/models.py | 16 +++++++ swh/lister/pypi/tasks.py | 20 ++++++++ 8 files changed, 284 insertions(+), 2 deletions(-) create mode 100644 swh/lister/core/simple_lister.py create mode 100644 swh/lister/pypi/__init__.py create mode 100644 swh/lister/pypi/lister.py create mode 100644 swh/lister/pypi/models.py create mode 100644 swh/lister/pypi/tasks.py diff --git a/README.md b/README.md index 3c3878b..274c311 100644 --- a/README.md +++ b/README.md @@ -160,3 +160,42 @@ Note: This expects storage (5002) and scheduler (5008) services to run locally DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... + + +## lister-pypi + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ +3. create configuration file ~/.config/swh/lister-pypi.yml +4. 
Bootstrap the db instance schema + + $ createdb lister-pypi + $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ + --lister pypi \ + --create-tables \ + --with-data + + Note: This bootstraps a minimum data set needed for the pypi + lister to run (for development) + +### Configuration file sample + + $ cat ~/.config/swh/lister-pypi.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-pypi + credentials: [] + cache_responses: True + cache_dir: /home/zack/.cache/swh/lister/pypi + +Note: This expects storage (5002) and scheduler (5008) services to run locally + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() + >>> diff --git a/swh/lister/cli.py b/swh/lister/cli.py index ee7ff09..4997fe6 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -6,7 +6,7 @@ import click -SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian'] +SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi'] @click.command() @@ -71,6 +71,11 @@ def cli(db_url, lister, create_tables, drop_tables, with_data): lister.db_session.add_all(areas) lister.db_session.commit() + elif lister == 'pypi': + from .pypi.models import ModelBase + from .pypi.lister import PyPiLister + _lister = PyPiLister(override_config=override_conf) + else: raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS) diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index d6e85b6..7f77449 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See 
top-level LICENSE file for more information @@ -20,6 +20,71 @@ from .abstractattribute import AbstractAttribute from .lister_base import FetchError +class ListerXMLRPCTransport(abc.ABC): + """Use the xmlrpc library for making Lister endpoint requests. + + To be used in conjunction with SWHListerBase or a subclass of it. + """ + SERVER = AbstractAttribute('string containing the server to contact for ' + 'information') + + def __init__(self): + self.lister_version = __version__ + + def get_client(self, path): + """Initialize the xmlrpc client (ServerProxy) used to query `path` + + """ + from xmlrpc import client + return client.ServerProxy(path) + + def list_packages(self, client): + """Listing method, meant to be overridden (no-op by default) + + """ + pass + + def request_uri(self, _): + """Same uri (SERVER) returned whatever the identifier is + + """ + return self.SERVER + + def request_params(self, identifier): + """Cannot pass any parameters to query to the xmlrpc client so cannot + even pass our user-agent specifics. + + """ + return {} + + def transport_quota_check(self, response): + """No rate limit handling is documented, always returns (False, 0). + + """ + return False, 0 + + def transport_request(self, identifier): + """Implements SWHListerBase.transport_request using the xmlrpc client. + + """ + path = self.request_uri(identifier) + # params = self.request_params(identifier) # we cannot use this... + + try: + _client = self.get_client(path) + return self.list_packages(_client) + except Exception as e: + raise FetchError(e) + + def transport_response_to_string(self, response): + """Implements SWHListerBase.transport_response_to_string for XMLRPC + given responses. + """ + s = pformat(self.SERVER) + s += '\n#\n' + pformat(response) + return s + + class SWHListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. 
diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py new file mode 100644 index 0000000..42b707c --- /dev/null +++ b/swh/lister/core/simple_lister.py @@ -0,0 +1,67 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + +from .lister_base import SWHListerBase + + +class SimpleLister(SWHListerBase): + """Lister* intermediate class for any service that follows the simple, + 'list all information in one shot' pattern. + + - Client sends a request to list repositories in one shot + + - Client receives structured (json/xml/etc) response with + information and stores those in db + + """ + def ingest_data(self, identifier, checks=False): + """Rework the base ingest_data. + Request server endpoint which gives all in one go. + + Simplify and filter response list of repositories. Inject + repo information into local db. Queue loader tasks for + linked repositories. + + Args: + identifier: Resource identifier (forwarded as is to the request) + checks (bool): Additional checks required (unused) + + """ + # Request the list of repositories info (expected in one go) + response = self.safely_issue_request(identifier) + if not response: + return response, [] + models_list = self.transport_response_simplified(response) + models_list = self.filter_before_inject(models_list) + from swh.core import utils + all_injected = [] + for models in utils.grouper(models_list, n=1000): + models = list(models) + logging.debug('models: %s' % len(models)) + # inject into local db + injected = self.inject_repo_data_into_db(models) + # queue workers + self.create_missing_origins_and_tasks(models, injected) + all_injected.append(injected) + # flush: commit this batch and continue with a fresh db session + self.db_session.commit() + self.db_session = self.mk_session() + + return response, all_injected + + def run(self): + """Query the server which answers in one query. 
Stores the + information, dropping actual redundant information we + already have. + + Returns: + nothing + + """ + dump_not_used_identifier = 0 + response, injected_repos = self.ingest_data(dump_not_used_identifier) + if not response and not injected_repos: + logging.info('No response from api server, stopping') diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py new file mode 100644 index 0000000..857b951 --- /dev/null +++ b/swh/lister/pypi/lister.py @@ -0,0 +1,70 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from .models import PyPiModel + +from swh.scheduler import utils +from swh.lister.core.simple_lister import SimpleLister +from swh.lister.core.lister_transports import ListerXMLRPCTransport + + +class PyPiLister(ListerXMLRPCTransport, SimpleLister): + # Lister settings: db model, lister name and XML-RPC server endpoint + MODEL = PyPiModel + LISTER_NAME = 'pypi' + SERVER = 'https://pypi.org/pypi' + + def __init__(self, override_config=None): + ListerXMLRPCTransport.__init__(self) + SimpleLister.__init__(self, override_config=override_config) + + def task_dict(self, origin_type, origin_url, **kwargs): + """(Override) Return task format dict + + This is overridden from the lister_base as more information is + needed for the ingestion task creation. + + """ + _type = 'origin-update-%s' % origin_type + _policy = 'recurring' + project_metadata_url = kwargs.get('html_url') + return utils.create_task_dict( + _type, _policy, origin_url, + project_metadata_url=project_metadata_url) + + def list_packages(self, client): + """(Override) List the actual pypi origins from the api. 
+ + """ + return client.list_packages() + + def _compute_urls(self, repo_name): + """Returns a tuple (project_url, project_metadata_url) + + """ + return ( + 'https://pypi.org/pypi/%s/' % repo_name, + 'https://pypi.org/pypi/%s/json' % repo_name + ) + + def get_model_from_repo(self, repo_name): + """(Override) Transform a repository (package) name to a model dict + + """ + project_url, project_url_meta = self._compute_urls(repo_name) + return { + 'uid': repo_name, + 'name': repo_name, + 'full_name': repo_name, + 'html_url': project_url_meta, + 'origin_url': project_url, + 'origin_type': 'pypi', + 'description': None, + } + + def transport_response_simplified(self, response): + """(Override) Transform response to list for model manipulation + + """ + return [self.get_model_from_repo(repo_name) for repo_name in response] diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py new file mode 100644 index 0000000..b035f4c --- /dev/null +++ b/swh/lister/pypi/models.py @@ -0,0 +1,16 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, String + +from ..core.models import ModelBase + + +class PyPiModel(ModelBase): + """a PyPi repository representation + + """ + __tablename__ = 'pypi_repo' + + uid = Column(String, primary_key=True) diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py new file mode 100644 index 0000000..d8b0e2c --- /dev/null +++ b/swh/lister/pypi/tasks.py @@ -0,0 +1,20 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from ..core.tasks import ListerTaskBase +from .lister import PyPiLister + + +class PyPiListerTask(ListerTaskBase): + """Full PyPi lister (list all available origins from the api). 
+ + """ + task_queue = 'swh_lister_pypi_refresh' + + def new_lister(self): + return PyPiLister() + + def run_task(self): + lister = self.new_lister() + lister.run()