swh.lister.pypi: Add a pypi lister implementation using xmlrpc api

Based solely on pypi's deprecated xmlrpc api [1].  No other way of listing
pypi.org is referenced (except for parsing an html page through a
legacy api [2])

[1] https://warehouse.readthedocs.io/api-reference/xml-rpc/#pypi-s-xml-rpc-methods

[2] https://pypi.python.org/simple/

Related T422
This commit is contained in:
Antoine R. Dumont (@ardumont) 2018-07-27 13:54:08 +02:00
parent 94913e2bf3
commit 6ff3b90859
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
8 changed files with 284 additions and 2 deletions

View file

@ -160,3 +160,42 @@ Note: This expects storage (5002) and scheduler (5008) services to run locally
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org
DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325
...
## lister-pypi
### preparation steps
1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing)
2. mkdir -p ~/.config/swh/ ~/.cache/swh/lister/pypi/
3. create configuration file ~/.config/swh/lister-pypi.yml
4. Bootstrap the db instance schema
$ createdb lister-pypi
$ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \
--lister pypi \
--create-tables \
--with-data
Note: This bootstraps a minimum data set needed for the pypi
lister to run (for development)
### Configuration file sample
$ cat ~/.config/swh/lister-pypi.yml
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
lister_db_url: postgres:///lister-pypi
credentials: []
cache_responses: True
cache_dir: /home/zack/.cache/swh/lister/pypi
Note: This expects storage (5002) and scheduler (5008) services to run locally
### Run
$ python3
Python 3.6.6 (default, Jun 27 2018, 14:44:17)
[GCC 8.1.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task()
>>>

View file

@ -6,7 +6,7 @@
import click
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian']
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi']
@click.command()
@ -71,6 +71,11 @@ def cli(db_url, lister, create_tables, drop_tables, with_data):
lister.db_session.add_all(areas)
lister.db_session.commit()
elif lister == 'pypi':
from .pypi.models import ModelBase
from .pypi.lister import PyPiLister
_lister = PyPiLister(override_config=override_conf)
else:
raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017 the Software Heritage developers
# Copyright (C) 2017-2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -20,6 +20,71 @@ from .abstractattribute import AbstractAttribute
from .lister_base import FetchError
class ListerXMLRPCTransport(abc.ABC):
    """Use the xmlrpc library for making Lister endpoint requests.

    To be used in conjunction with SWHListerBase or a subclass of it.

    """
    SERVER = AbstractAttribute('string containing the server to contact for '
                               'information')

    def __init__(self):
        self.lister_version = __version__

    def get_client(self, path):
        """Initialize the xmlrpc client used to query the server.

        Args:
            path (str): url of the xmlrpc endpoint

        Returns:
            an xmlrpc.client.ServerProxy instance built from path

        """
        from xmlrpc import client
        return client.ServerProxy(path)

    def list_packages(self, client):
        """Listing method.

        This default implementation does nothing and returns None.

        Args:
            client: the client returned by get_client

        """
        pass

    def request_uri(self, _):
        """Same uri called once.

        The identifier argument is ignored, SERVER is always returned.

        """
        return self.SERVER

    def request_params(self, identifier):
        """Cannot pass any parameters to query to the xmlrpc client so cannot
        even pass our user-agent specifics.

        Returns:
            an empty dict

        """
        return {}

    def transport_quota_check(self, response):
        """No rate limit handling is implemented for this transport.

        Returns:
            the constant tuple (False, 0), whatever the response

        """
        return False, 0

    def transport_request(self, identifier):
        """Implements SWHListerBase.transport_request for XMLRPC.

        Args:
            identifier: resource identifier, only forwarded to request_uri

        Returns:
            whatever list_packages returns for the client built on the
            request uri

        Raises:
            FetchError: when building the client or listing fails for any
                reason; the original exception is kept as its cause

        """
        path = self.request_uri(identifier)
        # request_params is deliberately not used: no parameter can be
        # passed along to the xmlrpc client.
        try:
            _client = self.get_client(path)
            return self.list_packages(_client)
        except Exception as e:
            raise FetchError(e) from e

    def transport_response_to_string(self, response):
        """Implements SWHListerBase.transport_response_to_string for XMLRPC
        given responses.

        Returns:
            the pretty-printed SERVER and the pretty-printed response,
            separated by a line holding a single '#'

        """
        return pformat(self.SERVER) + '\n#\n' + pformat(response)
class SWHListerHttpTransport(abc.ABC):
"""Use the Requests library for making Lister endpoint requests.

View file

@ -0,0 +1,67 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from .lister_base import SWHListerBase
class SimpleLister(SWHListerBase):
    """Lister* intermediate class for any service that follows the simple,
    'list in oneshot information' pattern.

    - Client sends a request to list repositories in oneshot

    - Client receives structured (json/xml/etc) response with
      information and stores those in db

    """
    def ingest_data(self, identifier, checks=False):
        """Rework the base ingest_data.

        A single request is issued. Its response is simplified and
        filtered into a list of models which is then processed chunk by
        chunk (utils.grouper with n=1000): each chunk is injected into
        the local db, the missing origins and tasks are created, then
        the db session is committed and replaced by a new one.

        Args:
            identifier: Resource identifier (unused)
            checks (bool): Additional checks required (unused)

        Returns:
            tuple (response, list of the per-chunk injection results);
            the list is empty when the response is falsy

        """
        response = self.safely_issue_request(identifier)
        if not response:
            return response, []

        models_list = self.filter_before_inject(
            self.transport_response_simplified(response))

        from swh.core import utils

        all_injected = []
        for chunk in utils.grouper(models_list, n=1000):
            chunk = list(chunk)
            logging.debug('models: %s', len(chunk))
            # store the chunk in the local db
            injected = self.inject_repo_data_into_db(chunk)
            # schedule the associated work
            self.create_missing_origins_and_tasks(chunk, injected)
            all_injected.append(injected)
            # flush and start over with a new session
            self.db_session.commit()
            self.db_session = self.mk_session()

        return response, all_injected

    def run(self):
        """Query the server which answers in one query, then store the
        resulting information through ingest_data.

        An info message is logged when there is neither a response nor
        injected repositories.

        Returns:
            nothing

        """
        unused_identifier = 0
        response, injected_repos = self.ingest_data(unused_identifier)
        if not (response or injected_repos):
            logging.info('No response from api server, stopping')

View file

70
swh/lister/pypi/lister.py Normal file
View file

@ -0,0 +1,70 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from .models import PyPiModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerXMLRPCTransport
class PyPiLister(ListerXMLRPCTransport, SimpleLister):
    """Lister of pypi origins, built on the xmlrpc transport and the
    simple (oneshot) lister.

    """
    MODEL = PyPiModel
    LISTER_NAME = 'pypi'
    SERVER = 'https://pypi.org/pypi'

    def __init__(self, override_config=None):
        ListerXMLRPCTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def task_dict(self, origin_type, origin_url, **kwargs):
        """(Override) Return task format dict

        This is overridden from the lister_base as more information is
        needed for the ingestion task creation.

        The task type is 'origin-update-<origin_type>', the policy is
        'recurring' and the 'html_url' keyword argument (None when
        absent) is passed along as project_metadata_url.

        """
        return utils.create_task_dict(
            'origin-update-%s' % origin_type,
            'recurring',
            origin_url,
            project_metadata_url=kwargs.get('html_url'))

    def list_packages(self, client):
        """(Override) List the actual pypi origins from the api.

        Delegates to the client's list_packages method.

        """
        return client.list_packages()

    def _compute_urls(self, repo_name):
        """Returns a tuple (project_url, project_metadata_url)

        """
        project_url = 'https://pypi.org/pypi/%s/' % repo_name
        project_metadata_url = 'https://pypi.org/pypi/%s/json' % repo_name
        return (project_url, project_metadata_url)

    def get_model_from_repo(self, repo_name):
        """(Override) Transform from repository representation to model

        The repository name is used as uid, name and full_name; the
        metadata url is stored as 'html_url' and the project url as
        'origin_url'.

        """
        origin_url, metadata_url = self._compute_urls(repo_name)
        return {
            'uid': repo_name,
            'name': repo_name,
            'full_name': repo_name,
            'html_url': metadata_url,
            'origin_url': origin_url,
            'origin_type': 'pypi',
            'description': None,
        }

    def transport_response_simplified(self, response):
        """(Override) Transform response to list for model manipulation

        Each element of the response is turned into a model dict by
        get_model_from_repo.

        """
        return list(map(self.get_model_from_repo, response))

16
swh/lister/pypi/models.py Normal file
View file

@ -0,0 +1,16 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from ..core.models import ModelBase
class PyPiModel(ModelBase):
    """Database model representing a PyPi repository (table 'pypi_repo').

    """
    __tablename__ = 'pypi_repo'

    # string primary key of the table
    uid = Column(String, primary_key=True)

20
swh/lister/pypi/tasks.py Normal file
View file

@ -0,0 +1,20 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from ..core.tasks import ListerTaskBase
from .lister import PyPiLister
class PyPiListerTask(ListerTaskBase):
    """Full PyPi lister (list all available origins from the api).

    """
    task_queue = 'swh_lister_pypi_refresh'

    def new_lister(self):
        """Build the PyPiLister instance this task runs."""
        return PyPiLister()

    def run_task(self):
        """Create a new lister and run it."""
        self.new_lister().run()