Enable black

- reformat all the Python files with black,
- enable black in pre-commit,
- add a black tox environment.
This commit is contained in:
David Douard 2020-04-08 16:31:22 +02:00
parent 1ae75166c7
commit 93a4d8b784
97 changed files with 1734 additions and 1642 deletions

View file

@ -7,14 +7,15 @@ def register():
from .models import NpmVisitModel, NpmModel
from .lister import NpmLister
return {'models': [NpmVisitModel, NpmModel],
'lister': NpmLister,
'task_modules': ['%s.tasks' % __name__],
'task_types': {
'list-npm-full': {
'default_interval': '7 days',
'min_interval': '7 days',
'max_interval': '7 days',
},
},
}
return {
"models": [NpmVisitModel, NpmModel],
"lister": NpmLister,
"task_modules": ["%s.tasks" % __name__],
"task_types": {
"list-npm-full": {
"default_interval": "7 days",
"min_interval": "7 days",
"max_interval": "7 days",
},
},
}

View file

@ -14,15 +14,17 @@ class NpmListerBase(IndexingHttpLister):
"""List packages available in the npm registry in a paginated way
"""
MODEL = NpmModel
LISTER_NAME = 'npm'
instance = 'npm'
def __init__(self, url='https://replicate.npmjs.com',
per_page=1000, override_config=None):
MODEL = NpmModel
LISTER_NAME = "npm"
instance = "npm"
def __init__(
self, url="https://replicate.npmjs.com", per_page=1000, override_config=None
):
super().__init__(url=url, override_config=override_config)
self.per_page = per_page + 1
self.PATH_TEMPLATE += '&limit=%s' % self.per_page
self.PATH_TEMPLATE += "&limit=%s" % self.per_page
@property
def ADDITIONAL_CONFIG(self) -> Dict[str, Any]:
@ -30,22 +32,22 @@ class NpmListerBase(IndexingHttpLister):
"""
default_config = super().ADDITIONAL_CONFIG
default_config['loading_task_policy'] = ('str', 'recurring')
default_config["loading_task_policy"] = ("str", "recurring")
return default_config
def get_model_from_repo(self, repo_name: str) -> Dict[str, str]:
"""(Override) Transform from npm package name to model
"""
package_url = 'https://www.npmjs.com/package/%s' % repo_name
package_url = "https://www.npmjs.com/package/%s" % repo_name
return {
'uid': repo_name,
'indexable': repo_name,
'name': repo_name,
'full_name': repo_name,
'html_url': package_url,
'origin_url': package_url,
'origin_type': 'npm',
"uid": repo_name,
"indexable": repo_name,
"name": repo_name,
"full_name": repo_name,
"html_url": package_url,
"origin_url": package_url,
"origin_type": "npm",
}
def task_dict(self, origin_type: str, origin_url: str, **kwargs):
@ -56,10 +58,9 @@ class NpmListerBase(IndexingHttpLister):
needed for the ingestion task creation.
"""
task_type = 'load-%s' % origin_type
task_policy = self.config['loading_task_policy']
return create_task_dict(task_type, task_policy,
url=origin_url)
task_type = "load-%s" % origin_type
task_policy = self.config["loading_task_policy"]
return create_task_dict(task_type, task_policy, url=origin_url)
def request_headers(self) -> Dict[str, Any]:
"""(Override) Set requests headers to send when querying the npm
@ -67,7 +68,7 @@ class NpmListerBase(IndexingHttpLister):
"""
headers = super().request_headers()
headers['Accept'] = 'application/json'
headers["Accept"] = "application/json"
return headers
def string_pattern_check(self, inner: int, lower: int, upper: int = None):
@ -83,25 +84,24 @@ class NpmLister(NpmListerBase):
"""List all packages available in the npm registry in a paginated way
"""
PATH_TEMPLATE = '/_all_docs?startkey="%s"'
def get_next_target_from_response(
self, response: Response) -> Optional[str]:
def get_next_target_from_response(self, response: Response) -> Optional[str]:
"""(Override) Get next npm package name to continue the listing
"""
repos = response.json()['rows']
return repos[-1]['id'] if len(repos) == self.per_page else None
repos = response.json()["rows"]
return repos[-1]["id"] if len(repos) == self.per_page else None
def transport_response_simplified(
self, response: Response) -> List[Dict[str, str]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
"""(Override) Transform npm registry response to list for model manipulation
"""
repos = response.json()['rows']
repos = response.json()["rows"]
if len(repos) == self.per_page:
repos = repos[:-1]
return [self.get_model_from_repo(repo['id']) for repo in repos]
return [self.get_model_from_repo(repo["id"]) for repo in repos]
class NpmIncrementalLister(NpmListerBase):
@ -109,30 +109,29 @@ class NpmIncrementalLister(NpmListerBase):
update_seq value of the underlying CouchDB database, in a paginated way.
"""
PATH_TEMPLATE = '/_changes?since=%s'
PATH_TEMPLATE = "/_changes?since=%s"
@property
def CONFIG_BASE_FILENAME(self): # noqa: N802
return 'lister_npm_incremental'
return "lister_npm_incremental"
def get_next_target_from_response(
self, response: Response) -> Optional[str]:
def get_next_target_from_response(self, response: Response) -> Optional[str]:
"""(Override) Get next update sequence value to continue the listing.
"""
repos = response.json()['results']
return repos[-1]['seq'] if len(repos) == self.per_page else None
repos = response.json()["results"]
return repos[-1]["seq"] if len(repos) == self.per_page else None
def transport_response_simplified(
self, response: Response) -> List[Dict[str, str]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
"""(Override) Transform npm registry response to list for model
manipulation.
"""
repos = response.json()['results']
repos = response.json()["results"]
if len(repos) == self.per_page:
repos = repos[:-1]
return [self.get_model_from_repo(repo['id']) for repo in repos]
return [self.get_model_from_repo(repo["id"]) for repo in repos]
def filter_before_inject(self, models_list: List[Dict[str, Any]]):
"""(Override) Filter out documents in the CouchDB database
@ -141,9 +140,9 @@ class NpmIncrementalLister(NpmListerBase):
"""
models_filtered = []
for model in models_list:
package_name = model['name']
package_name = model["name"]
# document related to CouchDB internals
if package_name.startswith('_design/'):
if package_name.startswith("_design/"):
continue
models_filtered.append(model)
return models_filtered

View file

@ -11,9 +11,10 @@ class NpmVisitModel(SQLBase, metaclass=ABCSQLMeta):
"""Table to store the npm registry state at the time of a
content listing by Software Heritage
"""
__tablename__ = 'npm_visit'
uid = Column(Integer, Sequence('npm_visit_id_seq'), primary_key=True)
__tablename__ = "npm_visit"
uid = Column(Integer, Sequence("npm_visit_id_seq"), primary_key=True)
visit_date = Column(DateTime, nullable=False)
doc_count = Column(BigInteger)
doc_del_count = Column(BigInteger)
@ -29,7 +30,8 @@ class NpmModel(IndexingModelBase):
"""A npm package representation
"""
__tablename__ = 'npm_repo'
__tablename__ = "npm_repo"
uid = Column(String, primary_key=True)
indexable = Column(String, index=True)

View file

@ -13,15 +13,22 @@ from swh.lister.npm.models import NpmVisitModel
@contextmanager
def save_registry_state(lister):
params = {'headers': lister.request_headers()}
params = {"headers": lister.request_headers()}
registry_state = lister.session.get(lister.url, **params)
registry_state = registry_state.json()
keys = ('doc_count', 'doc_del_count', 'update_seq', 'purge_seq',
'disk_size', 'data_size', 'committed_update_seq',
'compacted_seq')
keys = (
"doc_count",
"doc_del_count",
"update_seq",
"purge_seq",
"disk_size",
"data_size",
"committed_update_seq",
"compacted_seq",
)
state = {key: registry_state[key] for key in keys}
state['visit_date'] = datetime.now()
state["visit_date"] = datetime.now()
yield
npm_visit = NpmVisitModel(**state)
lister.db_session.add(npm_visit)
@ -34,29 +41,31 @@ def get_last_update_seq(lister):
query = lister.db_session.query(NpmVisitModel.update_seq)
row = query.order_by(NpmVisitModel.uid.desc()).first()
if not row:
raise ValueError('No npm registry listing previously performed ! '
'This is required prior to the execution of an '
'incremental listing.')
raise ValueError(
"No npm registry listing previously performed ! "
"This is required prior to the execution of an "
"incremental listing."
)
return row[0]
@shared_task(name=__name__ + '.NpmListerTask')
@shared_task(name=__name__ + ".NpmListerTask")
def list_npm_full(**lister_args):
'Full lister for the npm (javascript) registry'
"Full lister for the npm (javascript) registry"
lister = NpmLister(**lister_args)
with save_registry_state(lister):
return lister.run()
@shared_task(name=__name__ + '.NpmIncrementalListerTask')
@shared_task(name=__name__ + ".NpmIncrementalListerTask")
def list_npm_incremental(**lister_args):
'Incremental lister for the npm (javascript) registry'
"Incremental lister for the npm (javascript) registry"
lister = NpmIncrementalLister(**lister_args)
update_seq_start = get_last_update_seq(lister)
with save_registry_state(lister):
return lister.run(min_bound=update_seq_start)
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_npm(swh_listers):
lister = swh_listers['npm']
lister = swh_listers["npm"]
# Add the load-npm task type in the scheduler backend
lister.scheduler.create_task_type({
'type': 'load-npm',
'description': 'Load npm package',
'backend_name': 'swh.loader.package.tasks.LoadNpm',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-npm",
"description": "Load npm package",
"backend_name": "swh.loader.package.tasks.LoadNpm",
"default_interval": "1 day",
}
)
return lister

View file

@ -21,10 +21,10 @@ logger = logging.getLogger(__name__)
class NpmListerTester(HttpListerTesterBase, unittest.TestCase):
Lister = NpmLister
test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*')
lister_subdir = 'npm'
good_api_response_file = 'data/replicate.npmjs.com/api_response.json'
bad_api_response_file = 'data/api_empty_response.json'
first_index = 'jquery'
lister_subdir = "npm"
good_api_response_file = "data/replicate.npmjs.com/api_response.json"
bad_api_response_file = "data/api_empty_response.json"
first_index = "jquery"
entries_per_page = 100
@requests_mock.Mocker()
@ -37,11 +37,11 @@ class NpmListerTester(HttpListerTesterBase, unittest.TestCase):
class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase):
Lister = NpmIncrementalLister
test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*')
lister_subdir = 'npm'
good_api_response_file = 'data/api_inc_response.json'
bad_api_response_file = 'data/api_inc_empty_response.json'
first_index = '6920642'
test_re = re.compile(r"^.*/_changes\?since=([0-9]+).*")
lister_subdir = "npm"
good_api_response_file = "data/api_inc_response.json"
bad_api_response_file = "data/api_inc_empty_response.json"
first_index = "6920642"
entries_per_page = 100
@requests_mock.Mocker()
@ -58,27 +58,27 @@ def check_tasks(tasks: List[Any]):
"""
for row in tasks:
logger.debug('row: %s', row)
assert row['type'] == 'load-npm'
logger.debug("row: %s", row)
assert row["type"] == "load-npm"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
package_url = kwargs['url']
package_name = package_url.split('/')[-1]
assert package_url == f'https://www.npmjs.com/package/{package_name}'
package_url = kwargs["url"]
package_name = package_url.split("/")[-1]
assert package_url == f"https://www.npmjs.com/package/{package_name}"
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None
def test_lister_npm_basic_listing(lister_npm, requests_mock_datadir):
lister_npm.run()
tasks = lister_npm.scheduler.search_tasks(task_type='load-npm')
tasks = lister_npm.scheduler.search_tasks(task_type="load-npm")
assert len(tasks) == 100
check_tasks(tasks)
@ -89,10 +89,11 @@ def test_lister_npm_listing_pagination(lister_npm, requests_mock_datadir):
# Patch per page pagination
lister.per_page = 10 + 1
lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace(
'&limit=1001', '&limit=%s' % lister.per_page)
"&limit=1001", "&limit=%s" % lister.per_page
)
lister.run()
tasks = lister.scheduler.search_tasks(task_type='load-npm')
tasks = lister.scheduler.search_tasks(task_type="load-npm")
assert len(tasks) == 2 * 10 # only 2 files with 10 results each
check_tasks(tasks)

View file

@ -8,23 +8,22 @@ def mock_save(lister):
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.npm.tasks.ping')
res = swh_app.send_task("swh.lister.npm.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.npm.tasks.save_registry_state')
@patch('swh.lister.npm.tasks.NpmLister')
@patch("swh.lister.npm.tasks.save_registry_state")
@patch("swh.lister.npm.tasks.NpmLister")
def test_lister(lister, save, swh_app, celery_session_worker):
# set up the mocked NpmLister
lister.return_value = lister
lister.run.return_value = None
save.side_effect = mock_save
res = swh_app.send_task('swh.lister.npm.tasks.NpmListerTask')
res = swh_app.send_task("swh.lister.npm.tasks.NpmListerTask")
assert res
res.wait()
assert res.successful()
@ -33,9 +32,9 @@ def test_lister(lister, save, swh_app, celery_session_worker):
lister.run.assert_called_once_with()
@patch('swh.lister.npm.tasks.save_registry_state')
@patch('swh.lister.npm.tasks.get_last_update_seq')
@patch('swh.lister.npm.tasks.NpmIncrementalLister')
@patch("swh.lister.npm.tasks.save_registry_state")
@patch("swh.lister.npm.tasks.get_last_update_seq")
@patch("swh.lister.npm.tasks.NpmIncrementalLister")
def test_incremental(lister, seq, save, swh_app, celery_session_worker):
# set up the mocked NpmIncrementalLister
lister.return_value = lister
@ -43,8 +42,7 @@ def test_incremental(lister, seq, save, swh_app, celery_session_worker):
seq.return_value = 42
save.side_effect = mock_save
res = swh_app.send_task(
'swh.lister.npm.tasks.NpmIncrementalListerTask')
res = swh_app.send_task("swh.lister.npm.tasks.NpmIncrementalListerTask")
assert res
res.wait()
assert res.successful()