swh.lister.pypi: Add a PyPI lister implementation using the XML-RPC API
Based solely on PyPI's deprecated XML-RPC API [1]. No other way of listing pypi.org is referenced (except for parsing an HTML page through a legacy API [2]). [1] https://warehouse.readthedocs.io/api-reference/xml-rpc/#pypi-s-xml-rpc-methods [2] https://pypi.python.org/simple/ Related T422
This commit is contained in:
parent
94913e2bf3
commit
6ff3b90859
8 changed files with 284 additions and 2 deletions
|
@ -6,7 +6,7 @@
|
|||
import click
|
||||
|
||||
|
||||
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian']
|
||||
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi']
|
||||
|
||||
|
||||
@click.command()
|
||||
|
@ -71,6 +71,11 @@ def cli(db_url, lister, create_tables, drop_tables, with_data):
|
|||
lister.db_session.add_all(areas)
|
||||
lister.db_session.commit()
|
||||
|
||||
elif lister == 'pypi':
|
||||
from .pypi.models import ModelBase
|
||||
from .pypi.lister import PyPiLister
|
||||
_lister = PyPiLister(override_config=override_conf)
|
||||
|
||||
else:
|
||||
raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2017 the Software Heritage developers
|
||||
# Copyright (C) 2017-2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -20,6 +20,71 @@ from .abstractattribute import AbstractAttribute
|
|||
from .lister_base import FetchError
|
||||
|
||||
|
||||
class ListerXMLRPCTransport(abc.ABC):
    """Use the xmlrpc library for making Lister endpoint requests.

    To be used in conjunction with SWHListerBase or a subclass of it.

    Subclasses must define SERVER and override list_packages.

    """
    SERVER = AbstractAttribute('string containing the server to contact for '
                               'information')

    def __init__(self):
        self.lister_version = __version__

    def get_client(self, path):
        """Initialize the XML-RPC client used to query for results.

        Args:
            path: URL of the XML-RPC endpoint to contact.

        Returns:
            An xmlrpc.client.ServerProxy bound to ``path``.

        """
        from xmlrpc import client
        return client.ServerProxy(path)

    def list_packages(self, client):
        """Listing method, meant to be overridden by subclasses.

        Args:
            client: the client object returned by get_client.

        Returns:
            None in this base implementation.

        """
        pass

    def request_uri(self, _):
        """Return the uri to query.

        The same uri (SERVER) is called once, whatever the identifier is.

        """
        return self.SERVER

    def request_params(self, identifier):
        """Return the request parameters (always an empty dict).

        Cannot pass any parameters to query to the xmlrpc client so cannot
        even pass our user-agent specifics.

        """
        return {}

    def transport_quota_check(self, response):
        """No rate limit handling is implemented for this transport.

        Returns:
            Always the tuple (False, 0).
            NOTE(review): presumably (must_retry, delay) as expected by
            the lister base class -- confirm against SWHListerBase.

        """
        return False, 0

    def transport_request(self, identifier):
        """Implements SWHListerBase.transport_request for XMLRPC.

        Args:
            identifier: forwarded to request_uri (which ignores it here).

        Returns:
            Whatever list_packages returns for the client built on the
            request uri.

        Raises:
            FetchError: when building the client or listing the packages
                fails; the original exception is attached as its cause.

        """
        path = self.request_uri(identifier)
        # request_params is deliberately not used: no parameters can be
        # passed along to the xmlrpc client.

        try:
            _client = self.get_client(path)
            return self.list_packages(_client)
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise FetchError(e) from e

    def transport_response_to_string(self, response):
        """Implements SWHListerBase.transport_response_to_string for XMLRPC
        given responses.

        Returns:
            The pretty-printed SERVER, a line holding a single '#', then
            the pretty-printed response.

        """
        s = pformat(self.SERVER)
        s += '\n#\n' + pformat(response)
        return s
|
||||
|
||||
|
||||
class SWHListerHttpTransport(abc.ABC):
|
||||
"""Use the Requests library for making Lister endpoint requests.
|
||||
|
||||
|
|
67
swh/lister/core/simple_lister.py
Normal file
67
swh/lister/core/simple_lister.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
# Copyright (C) 2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
|
||||
from .lister_base import SWHListerBase
|
||||
|
||||
|
||||
class SimpleLister(SWHListerBase):
    """Intermediate lister class for services answering in one shot.

    Pattern followed:

    - the client sends a single request to list everything at once

    - the client receives a structured response and stores the
      information it holds in the local db

    """
    def ingest_data(self, identifier, checks=False):
        """Rework of the base ingest_data for one-shot listings.

        Issue the request, simplify then filter the response, and process
        the resulting models by batches of 1000: each batch is injected
        into the local db, passed to create_missing_origins_and_tasks,
        then committed before a fresh db session is opened.

        Args:
            identifier: Resource identifier, forwarded to the request
            checks (bool): Additional checks required (unused)

        Returns:
            A tuple (response, list of per-batch injection results). The
            list is empty when the response is falsy.

        """
        response = self.safely_issue_request(identifier)
        if not response:
            return response, []

        models_list = self.filter_before_inject(
            self.transport_response_simplified(response))

        from swh.core import utils

        all_injected = []
        for batch in utils.grouper(models_list, n=1000):
            batch = list(batch)
            logging.debug('models: %s' % len(batch))
            # store the batch in the local db
            injected = self.inject_repo_data_into_db(batch)
            # create the missing origins and tasks for that batch
            self.create_missing_origins_and_tasks(batch, injected)
            all_injected.append(injected)
            # flush, then continue with a new session
            self.db_session.commit()
            self.db_session = self.mk_session()

        return response, all_injected

    def run(self):
        """Query the server once and ingest what it answers.

        Logs an informational message when both the response and the
        injected results are empty.

        Returns:
            nothing

        """
        unused_identifier = 0
        response, injected_repos = self.ingest_data(unused_identifier)
        if not (response or injected_repos):
            logging.info('No response from api server, stopping')
|
0
swh/lister/pypi/__init__.py
Normal file
0
swh/lister/pypi/__init__.py
Normal file
70
swh/lister/pypi/lister.py
Normal file
70
swh/lister/pypi/lister.py
Normal file
|
@ -0,0 +1,70 @@
|
|||
# Copyright (C) 2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from .models import PyPiModel
|
||||
|
||||
from swh.scheduler import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.lister.core.lister_transports import ListerXMLRPCTransport
|
||||
|
||||
|
||||
class PyPiLister(ListerXMLRPCTransport, SimpleLister):
    """Lister for pypi.org, querying the server through XML-RPC."""
    MODEL = PyPiModel
    LISTER_NAME = 'pypi'
    SERVER = 'https://pypi.org/pypi'

    def __init__(self, override_config=None):
        ListerXMLRPCTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def task_dict(self, origin_type, origin_url, **kwargs):
        """(Override) Build the task dict for one origin.

        Overridden because the task also carries the project metadata
        url, which is read from the 'html_url' keyword argument.

        """
        return utils.create_task_dict(
            'origin-update-%s' % origin_type,
            'recurring',
            origin_url,
            project_metadata_url=kwargs.get('html_url'))

    def list_packages(self, client):
        """(Override) Ask the api, through client, for the pypi origins.

        """
        return client.list_packages()

    def _compute_urls(self, repo_name):
        """Build the urls of a project.

        Returns:
            A tuple (project_url, project_metadata_url)

        """
        project_url = 'https://pypi.org/pypi/%s/' % repo_name
        project_metadata_url = 'https://pypi.org/pypi/%s/json' % repo_name
        return (project_url, project_metadata_url)

    def get_model_from_repo(self, repo_name):
        """(Override) Turn a repository name into its model dict.

        The metadata url is stored under 'html_url', which is the key
        task_dict reads back.

        """
        origin_url, metadata_url = self._compute_urls(repo_name)
        model = {
            'uid': repo_name,
            'name': repo_name,
            'full_name': repo_name,
            'html_url': metadata_url,
            'origin_url': origin_url,
            'origin_type': 'pypi',
            'description': None,
        }
        return model

    def transport_response_simplified(self, response):
        """(Override) Map every name of the response to its model dict.

        """
        return list(map(self.get_model_from_repo, response))
|
16
swh/lister/pypi/models.py
Normal file
16
swh/lister/pypi/models.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# Copyright (C) 2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String
|
||||
|
||||
from ..core.models import ModelBase
|
||||
|
||||
|
||||
class PyPiModel(ModelBase):
    """Representation of a PyPi repository, stored in table 'pypi_repo'.

    """
    __tablename__ = 'pypi_repo'

    # String identifier of the repository; primary key of the table.
    uid = Column(String, primary_key=True)
|
20
swh/lister/pypi/tasks.py
Normal file
20
swh/lister/pypi/tasks.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Copyright (C) 2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from ..core.tasks import ListerTaskBase
|
||||
from .lister import PyPiLister
|
||||
|
||||
|
||||
class PyPiListerTask(ListerTaskBase):
    """Task running the full PyPi lister (lists all available origins
    from the api).

    """
    task_queue = 'swh_lister_pypi_refresh'

    def new_lister(self):
        """Instantiate a PyPiLister with its default configuration."""
        return PyPiLister()

    def run_task(self):
        """Create a new lister and run it."""
        self.new_lister().run()
|
Loading…
Add table
Add a link
Reference in a new issue