diff --git a/debian/control b/debian/control index bc95993..22c40a0 100644 --- a/debian/control +++ b/debian/control @@ -13,7 +13,7 @@ Build-Depends: debhelper (>= 9), python3-sqlalchemy (>= 1.0), python3-swh.core, python3-swh.scheduler (>= 0.0.31~), - python3-swh.storage, + python3-swh.storage (>= 0.0.103~), python3-swh.storage.schemata, python3-testing.postgresql, python3-vcversioner, @@ -24,6 +24,7 @@ Homepage: https://forge.softwareheritage.org/source/swh-lister/ Package: python3-swh.lister Architecture: all Depends: python3-swh.scheduler (>= 0.0.31~), + python3-swh.storage (>= 0.0.103~), ${misc:Depends}, ${python3:Depends} Breaks: python3-swh.lister.github diff --git a/requirements-swh.txt b/requirements-swh.txt index e1ba1af..f39d241 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,4 @@ swh.core +swh.storage >= 0.0.103 swh.storage[schemata] >= 0.0.76 swh.scheduler >= 0.0.31 diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py index 60074dd..8bda609 100644 --- a/swh/lister/core/lister_base.py +++ b/swh/lister/core/lister_base.py @@ -5,6 +5,7 @@ import abc import datetime import gzip +import json import logging import os import re @@ -378,7 +379,7 @@ class SWHListerBase(abc.ABC, config.SWHConfig): return sql_repo - def origin_dict(self, origin_type, origin_url): + def origin_dict(self, origin_type, origin_url, **kwargs): """Return special dict format for the origins list Args: @@ -458,16 +459,37 @@ class SWHListerBase(abc.ABC, config.SWHConfig): Returns: Nothing. Modifies injected_repos. """ + origins = {} + tasks = {} + + def _origin_key(m): + _type = m.get('origin_type', m.get('type')) + _url = m.get('origin_url', m.get('url')) + return '%s-%s' % (_type, _url) + + def _task_key(m): + return '%s-%s' % (m['type'], json.dumps(m['arguments'])) + for m in models_list: ir = injected_repos[m['uid']] if not ir.origin_id: - ir.origin_id = self.storage.origin_add_one( - self.origin_dict(m['origin_type'], m['origin_url']) - ) + origin_dict = self.origin_dict(**m) + origins[_origin_key(m)] = (ir, m, origin_dict) if not ir.task_id: - ir.task_id = self.scheduler.create_tasks( - [self.task_dict(**m)] - )[0]['id'] + task_dict = self.task_dict(**m) + tasks[_task_key(task_dict)] = (ir, m, task_dict) + + new_origins = self.storage.origin_add( + (origin_dicts for (_, _, origin_dicts) in origins.values())) + for origin in new_origins: + ir, m, _ = origins[_origin_key(origin)] + ir.origin_id = origin['id'] + + new_tasks = self.scheduler.create_tasks( + (task_dicts for (_, _, task_dicts) in tasks.values())) + for task in new_tasks: + ir, m, _ = tasks[_task_key(task)] + ir.task_id = task['id'] def ingest_data(self, identifier, checks=False): """The core data fetch sequence. Request server endpoint. Simplify and