implement listers as plugins

Listers are declared as plugins via the `swh.workers` entry_point.

As such, the registry function is expected to return a dict with the
`task_modules` field (as for generic worker plugins), plus:

- `lister`: the lister class,
- `models`: list of SQLAlchemy models used by this lister,
- `init` (optional): hook (callable) used to initialize the lister's state
  (typically, create/initialize the database for this lister).
  If not set, the default implementation creates database tables (after
  optionally having deleted existing ones) according to models declared in
  the `models` register field.

There is no need to explicitly add lister task modules in the main
`conftest` module, but any new/extra lister to be tested must be registered
(the tested lister module must be properly installed in the test environment).

Also refactor the cli tools a bit:
- add support for the standard --config-file option at the 'lister' group
  level,
- move the --db-url to the 'lister' group,
- drop the --lister option for the `swh lister db-init` cli tool:
  initializing (especially with --drop-tables) the database for a single
  lister is unreliable, since all tables are created using a single MetaData
  (in the same namespace).
This commit is contained in:
David Douard 2019-09-03 15:01:58 +02:00
parent c67a926f26
commit e3c0ea9d90
18 changed files with 279 additions and 216 deletions

View file

@ -1,4 +1,3 @@
swh.core
swh.storage >= 0.0.122
swh.storage[schemata]
swh.scheduler >= 0.0.39
swh.storage[schemata] >= 0.0.122
swh.scheduler >= 0.0.58

View file

@ -1,4 +1,4 @@
pytest<4
pytest
pytest-postgresql
requests_mock
testing.postgresql

View file

@ -55,6 +55,18 @@ setup(
swh-lister=swh.lister.cli:cli
[swh.cli.subcommands]
lister=swh.lister.cli:lister
[swh.workers]
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
lister.debian=swh.lister.debian:register
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
lister.npm=swh.lister.npm:register
lister.packagist=swh.lister.packagist:register
lister.phabricator=swh.lister.phabricator:register
lister.pypi=swh.lister.pypi:register
''',
classifiers=[
"Programming Language :: Python :: 3",

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the BitBucket lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import BitBucketModel
    from .lister import BitBucketLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [BitBucketModel],
        'lister': BitBucketLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the CGit lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import CGitModel
    from .lister import CGitLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [CGitModel],
        'lister': CGitLister,
        'task_modules': task_modules,
    }

View file

@ -4,191 +4,103 @@
# See top-level LICENSE file for more information
import logging
import pkg_resources
from copy import deepcopy
import click
from sqlalchemy import create_engine
from swh.core.cli import CONTEXT_SETTINGS
from swh.lister.core.models import initialize
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist']
LISTERS = {entry_point.name.split('.', 1)[1]: entry_point
for entry_point in pkg_resources.iter_entry_points('swh.workers')
if entry_point.name.split('.', 1)[0] == 'lister'}
SUPPORTED_LISTERS = list(LISTERS)
# Base urls for most listers
DEFAULT_BASEURLS = {
'gitlab': 'https://gitlab.com/api/v4/',
'phabricator': 'https://forge.softwareheritage.org',
}
def get_lister(lister_name, db_url=None, **conf):
    """Instantiate a lister given its name.

    The lister class is looked up in the `LISTERS` registry, which maps
    lister names to their `swh.workers` entry point.

    Args:
        lister_name (str): Lister's name
        db_url (str): Optional db service url; when set, it replaces any
            'lister' entry of conf with a 'local' lister configuration
            using that url
        conf (dict): Configuration dict (lister db cnx, policy, priority...)

    Returns:
        The instantiated lister

    Raises:
        ValueError: if lister_name is not a registered lister

    """
    if lister_name not in LISTERS:
        raise ValueError(
            'Invalid lister %s: only supported listers are %s' %
            (lister_name, SUPPORTED_LISTERS))
    if db_url:
        conf['lister'] = {'cls': 'local', 'args': {'db': db_url}}
    # loading the entry point gives the plugin's registry function; calling
    # it returns the registry entry (a dict) describing the lister
    registry_entry = LISTERS[lister_name].load()()
    lister_cls = registry_entry['lister']
    return lister_cls(override_config=conf)
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@click.option('--config-file', '-C', default=None,
              type=click.Path(exists=True, dir_okay=False,),
              help="Configuration file.")
@click.option('--db-url', '-d', default=None,
              help='SQLAlchemy DB URL; see '
              '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>')  # noqa
@click.pass_context
def lister(ctx, config_file, db_url):
    '''Software Heritage Lister tools.'''
    # Group callback: loads the configuration once and shares it with the
    # subcommands through ctx.obj:
    # - ctx.obj['config']: configuration read from --config-file, with the
    #   --db-url override (if any) applied on top,
    # - ctx.obj['override_conf']: only the command-line overrides.
    from swh.core import config
    ctx.ensure_object(dict)

    override_conf = {}
    if db_url:
        override_conf['lister'] = {
            'cls': 'local',
            'args': {'db': db_url}
        }
    # NOTE(review): the second positional argument of config.read looks like
    # a set of *defaults* rather than overrides (confirm against
    # swh.core.config). The command-line value is therefore applied
    # explicitly after reading the file, so --db-url always takes precedence.
    conf = config.read(config_file) or {}
    conf.update(override_conf)
    ctx.obj['config'] = conf
    ctx.obj['override_conf'] = override_conf
@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
@click.option('--drop-tables', '-D', is_flag=True, default=False,
              help='Drop tables before creating the database schema')
@click.pass_context
def db_init(ctx, drop_tables):
    """Initialize the database model for given listers.

    """
    # The 'lister' section may be missing when neither --config-file nor
    # --db-url is given; report it the same way as a non-local configuration
    # instead of failing with a KeyError.
    lister_cfg = ctx.obj['config'].get('lister') or {}
    if lister_cfg.get('cls') != 'local':
        click.echo('A local lister configuration is required')
        ctx.exit(1)

    db_url = lister_cfg['args']['db']
    db_engine = create_engine(db_url)

    # Load every lister plugin before touching the database; the registry
    # entries are kept per lister so each one's own init hook can be called
    # below.
    registry = {}
    for lister_name, entrypoint in LISTERS.items():
        logger.info('Loading lister %s', lister_name)
        registry[lister_name] = entrypoint.load()()

    logger.info('Initializing database')
    initialize(db_engine, drop_tables)

    for lister_name, registry_entry in registry.items():
        init_hook = registry_entry.get('init')
        if callable(init_hook):
            logger.info('Calling init hook for %s', lister_name)
            init_hook(db_engine)
@lister.command(name='run', context_settings=CONTEXT_SETTINGS,
@ -196,9 +108,6 @@ def cli(ctx, db_url, listers, drop_tables):
'instance. The output of this listing results in '
'"oneshot" tasks in the scheduler db with a priority '
'defined by the user')
@click.option('--db-url', '-d', default='postgres:///lister',
help='SQLAlchemy DB URL; see '
'<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.option('--lister', '-l', help='Lister to run',
type=click.Choice(SUPPORTED_LISTERS))
@click.option('--priority', '-p', default='high',
@ -206,23 +115,19 @@ def cli(ctx, db_url, listers, drop_tables):
help='Task priority for the listed repositories to ingest')
@click.argument('options', nargs=-1)
@click.pass_context
def run(ctx, db_url, lister, priority, options):
def run(ctx, lister, priority, options):
from swh.scheduler.cli.utils import parse_options
config = deepcopy(ctx.obj['config'])
if options:
_, kwargs = parse_options(options)
else:
kwargs = {}
config.update(parse_options(options)[1])
override_config = {
'priority': priority,
'policy': 'oneshot',
**kwargs,
}
config['priority'] = priority
config['policy'] = 'oneshot'
lister, _, _, _ = get_lister(lister, db_url, **override_config)
lister.run()
get_lister(lister, **config).run()
if __name__ == '__main__':
cli()
lister()

View file

@ -4,12 +4,15 @@
import abc
from datetime import datetime
import logging
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from .abstractattribute import AbstractAttribute
logger = logging.getLogger(__name__)
SQLBase = declarative_base()
@ -46,3 +49,23 @@ class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta):
# The value used for sorting, segmenting, or api query paging,
# because uids aren't always sequential.
indexable = AbstractAttribute('Column(<indexable_type>, index=True)')
def initialize(db_engine, drop_tables=False, **kwargs):
    """Default database initialization function for a lister.

    Creates every table registered on the shared `SQLBase` metadata,
    optionally dropping them first.

    Args:
        db_engine (): the SQLAlchemy DB engine.
        drop_tables (bool): if True, tables will be dropped before
            (re)creating them.
        kwargs: accepted but not used by this function.

    """
    metadata = SQLBase.metadata

    if drop_tables:
        logger.info('Dropping tables')
        metadata.drop_all(db_engine, checkfirst=True)

    logger.info('Creating tables')
    metadata.create_all(db_engine, checkfirst=True)

View file

@ -1,19 +1 @@
import pytest
from swh.scheduler.tests.conftest import * # noqa
@pytest.fixture(scope='session')
def celery_includes():
return [
'swh.lister.bitbucket.tasks',
'swh.lister.cgit.tasks',
'swh.lister.cran.tasks',
'swh.lister.debian.tasks',
'swh.lister.github.tasks',
'swh.lister.gitlab.tasks',
'swh.lister.gnu.tasks',
'swh.lister.npm.tasks',
'swh.lister.packagist.tasks',
'swh.lister.phabricator.tasks',
'swh.lister.pypi.tasks',
]

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the CRAN lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import CRANModel
    from .lister import CRANLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [CRANModel],
        'lister': CRANLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,40 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def debian_init(db_engine, override_conf=None):
    """Insert the 'Debian' distribution and its areas if not yet present.

    Nothing is written when a distribution named 'Debian' already exists.

    Args:
        db_engine: database engine received from the caller of this hook.
            NOTE(review): it is not used here; the db session comes from a
            DebianLister built from override_conf -- confirm both target
            the same database.
        override_conf (dict): passed to DebianLister as override_config.

    """
    from swh.storage.schemata.distribution import (
        Distribution, Area)
    from .lister import DebianLister

    lister = DebianLister(override_config=override_conf)

    already_there = lister.db_session\
                          .query(Distribution)\
                          .filter(Distribution.name == 'Debian')\
                          .one_or_none()
    if already_there:
        return

    debian = Distribution(
        name='Debian',
        type='deb',
        mirror_uri='http://deb.debian.org/debian/')
    lister.db_session.add(debian)

    # one area per (suite, component) pair, e.g. 'stretch/main'
    areas = [
        Area(name='%s/%s' % (suite, component), distribution=debian)
        for suite in ['stretch', 'buster']
        for component in ['main', 'contrib', 'non-free']
    ]
    lister.db_session.add_all(areas)
    lister.db_session.commit()
def register():
    """Return the plugin registry entry of the Debian lister.

    Besides the usual fields, the entry declares `debian_init` as the
    `init` hook. The lister class is imported inside the function, so it is
    only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister`, `task_modules` and `init` fields.

    """
    from .lister import DebianLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [DebianLister.MODEL],
        'lister': DebianLister,
        'task_modules': task_modules,
        'init': debian_init,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the GitHub lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import GitHubModel
    from .lister import GitHubLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [GitHubModel],
        'lister': GitHubLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the GitLab lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import GitLabModel
    from .lister import GitLabLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [GitLabModel],
        'lister': GitLabLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the GNU lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import GNUModel
    from .lister import GNULister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [GNUModel],
        'lister': GNULister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the npm lister.

    This lister declares two models. The lister and model classes are
    imported inside the function, so they are only loaded when the entry
    point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import NpmVisitModel, NpmModel
    from .lister import NpmLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [NpmVisitModel, NpmModel],
        'lister': NpmLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the Packagist lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import PackagistModel
    from .lister import PackagistLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [PackagistModel],
        'lister': PackagistLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the Phabricator lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import PhabricatorModel
    from .lister import PhabricatorLister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [PhabricatorModel],
        'lister': PhabricatorLister,
        'task_modules': task_modules,
    }

View file

@ -0,0 +1,13 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the plugin registry entry of the PyPI lister.

    The lister and model classes are imported inside the function, so
    they are only loaded when the entry point is actually called.

    Returns:
        dict with the `models`, `lister` and `task_modules` fields.

    """
    from .models import PyPIModel
    from .lister import PyPILister

    # the celery tasks live in the `tasks` submodule of this package
    task_modules = ['%s.tasks' % __name__]
    return {
        'models': [PyPIModel],
        'lister': PyPILister,
        'task_modules': task_modules,
    }

View file

@ -6,7 +6,7 @@
import pytest
from swh.lister.core.lister_base import ListerBase
from swh.lister.cli import get_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS
from swh.lister.cli import get_lister, SUPPORTED_LISTERS
from .test_utils import init_db
@ -24,32 +24,9 @@ def test_get_lister():
"""
db_url = init_db().url()
supported_listers_with_init = {'npm', 'debian'}
supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init
for lister_name in supported_listers:
lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
for lister_name in SUPPORTED_LISTERS:
lst = get_lister(lister_name, db_url)
assert isinstance(lst, ListerBase)
assert drop_fn is None
assert init_fn is not None
assert insert_data_fn is None
for lister_name in supported_listers_with_init:
lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
assert isinstance(lst, ListerBase)
assert drop_fn is None
assert init_fn is not None
assert insert_data_fn is not None
for lister_name in supported_listers_with_init:
lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url,
drop_tables=True)
assert isinstance(lst, ListerBase)
assert drop_fn is not None
assert init_fn is not None
assert insert_data_fn is not None
def test_get_lister_override():
@ -67,9 +44,9 @@ def test_get_lister_override():
# check the override ends up defined in the lister
for lister_name, (url_key, url_value) in listers.items():
lst, drop_fn, init_fn, insert_data_fn = get_lister(
lst = get_lister(
lister_name, db_url, **{
'api_baseurl': url_value,
url_key: url_value,
'priority': 'high',
'policy': 'oneshot',
})
@ -81,14 +58,9 @@ def test_get_lister_override():
# check the default urls are used and not the override (since it's not
# passed)
for lister_name, (url_key, url_value) in listers.items():
lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)
lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert url_key not in lst.config
# then the default base url is used
default_url = DEFAULT_BASEURLS[lister_name]
assert getattr(lst, url_key) == default_url
assert 'priority' not in lst.config
assert 'oneshot' not in lst.config