diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ab2d5e..46b4702 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,6 +23,11 @@ repos: language: system types: [python] +- repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black + # unfortunately, we are far from being able to enable this... # - repo: https://github.com/PyCQA/pydocstyle.git # rev: 4.0.0 @@ -34,14 +39,3 @@ repos: # language: python # types: [python] -# black requires py3.6+ #- repo: https://github.com/python/black # rev: 19.3b0 # hooks: # - id: black # language_version: python3 #- repo: https://github.com/asottile/blacken-docs # rev: v1.0.0-1 # hooks: # - id: blacken-docs # additional_dependencies: [black==19.3b0] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8d79b7e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[flake8] +# E203: whitespace before ':' +# E231: missing whitespace after ',' +# W503: line break before binary operator +ignore = E203,E231,W503 +max-line-length = 88 diff --git a/setup.py b/setup.py index 905d1f4..f3a07da 100755 --- a/setup.py +++ b/setup.py @@ -12,15 +12,15 @@ from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file -with open(path.join(here, 'README.md'), encoding='utf-8') as f: +with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: - reqf = 'requirements-%s.txt' % name + reqf = "requirements-%s.txt" % name else: - reqf = 'requirements.txt' + reqf = "requirements.txt" requirements = [] if not path.exists(reqf): @@ -29,28 +29,28 @@ def parse_requirements(name=None): with open(reqf) as f: for line in f.readlines(): line = line.strip() - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( - name='swh.lister', - description='Software Heritage lister', + name="swh.lister", + description="Software Heritage lister", long_description=long_description, - long_description_content_type='text/markdown', - author='Software Heritage developers', - author_email='swh-devel@inria.fr', - url='https://forge.softwareheritage.org/diffusion/DLSGH/', + long_description_content_type="text/markdown", + author="Software Heritage developers", + author_email="swh-devel@inria.fr", + url="https://forge.softwareheritage.org/diffusion/DLSGH/", packages=find_packages(), - install_requires=parse_requirements() + parse_requirements('swh'), - tests_require=parse_requirements('test'), - setup_requires=['vcversioner'], - extras_require={'testing': parse_requirements('test')}, + install_requires=parse_requirements() + parse_requirements("swh"), + tests_require=parse_requirements("test"), + setup_requires=["vcversioner"], + extras_require={"testing": parse_requirements("test")}, vcversioner={}, include_package_data=True, - entry_points=''' + entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli:lister [swh.workers] @@ -65,7 +65,7 @@ setup( lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pypi=swh.lister.pypi:register - ''', + """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", @@ -74,8 +74,8 @@ setup( "Development Status :: 5 - Production/Stable", ], project_urls={ - 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', - 'Funding': 'https://www.softwareheritage.org/donate', - 'Source': 
'https://forge.softwareheritage.org/source/swh-lister', + "Bug Reports": "https://forge.softwareheritage.org/maniphest", + "Funding": "https://www.softwareheritage.org/donate", + "Source": "https://forge.softwareheritage.org/source/swh-lister", }, ) diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index 840ceca..b37e946 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -11,17 +11,19 @@ logger = logging.getLogger(__name__) try: - __version__ = pkg_resources.get_distribution('swh.lister').version + __version__ = pkg_resources.get_distribution("swh.lister").version except pkg_resources.DistributionNotFound: - __version__ = 'devel' + __version__ = "devel" -USER_AGENT_TEMPLATE = 'Software Heritage Lister (%s)' +USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)" USER_AGENT = USER_AGENT_TEMPLATE % __version__ -LISTERS = {entry_point.name.split('.', 1)[1]: entry_point - for entry_point in pkg_resources.iter_entry_points('swh.workers') - if entry_point.name.split('.', 1)[0] == 'lister'} +LISTERS = { + entry_point.name.split(".", 1)[1]: entry_point + for entry_point in pkg_resources.iter_entry_points("swh.workers") + if entry_point.name.split(".", 1)[0] == "lister" +} SUPPORTED_LISTERS = list(LISTERS) @@ -41,12 +43,13 @@ def get_lister(lister_name, db_url=None, **conf): """ if lister_name not in LISTERS: raise ValueError( - 'Invalid lister %s: only supported listers are %s' % - (lister_name, SUPPORTED_LISTERS)) + "Invalid lister %s: only supported listers are %s" + % (lister_name, SUPPORTED_LISTERS) + ) if db_url: - conf['lister'] = {'cls': 'local', 'args': {'db': db_url}} + conf["lister"] = {"cls": "local", "args": {"db": db_url}} registry_entry = LISTERS[lister_name].load()() - lister_cls = registry_entry['lister'] + lister_cls = registry_entry["lister"] lister = lister_cls(override_config=conf) return lister diff --git a/swh/lister/bitbucket/__init__.py b/swh/lister/bitbucket/__init__.py index 7a524e2..917c7bd 100644 --- a/swh/lister/bitbucket/__init__.py +++ b/swh/lister/bitbucket/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import BitBucketModel from .lister import BitBucketLister - return {'models': [BitBucketModel], - 'lister': BitBucketLister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [BitBucketModel], + "lister": BitBucketLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index e067148..d64281c 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -19,34 +19,33 @@ logger = logging.getLogger(__name__) class BitBucketLister(IndexingHttpLister): - PATH_TEMPLATE = '/repositories?after=%s' + PATH_TEMPLATE = "/repositories?after=%s" MODEL = BitBucketModel - LISTER_NAME = 'bitbucket' - DEFAULT_URL = 'https://api.bitbucket.org/2.0' - instance = 'bitbucket' + LISTER_NAME = "bitbucket" + DEFAULT_URL = "https://api.bitbucket.org/2.0" + instance = "bitbucket" default_min_bound = datetime.fromtimestamp(0, timezone.utc) # type: Any - def __init__(self, url: str = None, - override_config=None, per_page: int = 100) -> None: + def __init__( + self, url: str = None, override_config=None, per_page: int = 100 + ) -> None: super().__init__(url=url, override_config=override_config) - per_page = self.config.get('per_page', per_page) + per_page = self.config.get("per_page", per_page) - self.PATH_TEMPLATE = '%s&pagelen=%s' % ( - self.PATH_TEMPLATE, per_page) + self.PATH_TEMPLATE = "%s&pagelen=%s" % (self.PATH_TEMPLATE, per_page) def 
get_model_from_repo(self, repo: Dict) -> Dict[str, Any]: return { - 'uid': repo['uuid'], - 'indexable': iso8601.parse_date(repo['created_on']), - 'name': repo['name'], - 'full_name': repo['full_name'], - 'html_url': repo['links']['html']['href'], - 'origin_url': repo['links']['clone'][0]['href'], - 'origin_type': repo['scm'], + "uid": repo["uuid"], + "indexable": iso8601.parse_date(repo["created_on"]), + "name": repo["name"], + "full_name": repo["full_name"], + "html_url": repo["links"]["html"]["href"], + "origin_url": repo["links"]["clone"][0]["href"], + "origin_type": repo["scm"], } - def get_next_target_from_response(self, response: Response - ) -> Optional[datetime]: + def get_next_target_from_response(self, response: Response) -> Optional[datetime]: """This will read the 'next' link from the api response if any and return it as a datetime. @@ -58,23 +57,23 @@ class BitBucketLister(IndexingHttpLister): """ body = response.json() - next_ = body.get('next') + next_ = body.get("next") if next_ is not None: next_ = parse.urlparse(next_) - return iso8601.parse_date(parse.parse_qs(next_.query)['after'][0]) + return iso8601.parse_date(parse.parse_qs(next_.query)["after"][0]) return None - def transport_response_simplified(self, response: Response - ) -> List[Dict[str, Any]]: - repos = response.json()['values'] + def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: + repos = response.json()["values"] return [self.get_model_from_repo(repo) for repo in repos] def request_uri(self, identifier: datetime) -> str: # type: ignore identifier_str = parse.quote(identifier.isoformat()) - return super().request_uri(identifier_str or '1970-01-01') + return super().request_uri(identifier_str or "1970-01-01") - def is_within_bounds(self, inner: int, lower: Optional[int] = None, - upper: Optional[int] = None) -> bool: + def is_within_bounds( + self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None + ) -> bool: # values are expected to be datetimes if lower is None and upper is None: ret = True diff --git a/swh/lister/bitbucket/models.py b/swh/lister/bitbucket/models.py index d299b5b..dca32f7 100644 --- a/swh/lister/bitbucket/models.py +++ b/swh/lister/bitbucket/models.py @@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase class BitBucketModel(IndexingModelBase): """a BitBucket repository""" - __tablename__ = 'bitbucket_repo' + + __tablename__ = "bitbucket_repo" uid = Column(String, primary_key=True) indexable = Column(DateTime(timezone=True), index=True) diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py index 3b64de0..68cae21 100644 --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -10,20 +10,20 @@ from .lister import BitBucketLister GROUP_SPLIT = 10000 -@shared_task(name=__name__ + '.IncrementalBitBucketLister') +@shared_task(name=__name__ + ".IncrementalBitBucketLister") def list_bitbucket_incremental(**lister_args): - '''Incremental update of the BitBucket forge''' + """Incremental update of the BitBucket forge""" lister = BitBucketLister(**lister_args) return lister.run(min_bound=lister.db_last_index(), max_bound=None) -@shared_task(name=__name__ + '.RangeBitBucketLister') +@shared_task(name=__name__ + ".RangeBitBucketLister") def _range_bitbucket_lister(start, end, **lister_args): lister = BitBucketLister(**lister_args) return lister.run(min_bound=start, max_bound=end) -@shared_task(name=__name__ + '.FullBitBucketRelister', bind=True) +@shared_task(name=__name__ + 
".FullBitBucketRelister", bind=True) def list_bitbucket_full(self, split=None, **lister_args): """Full update of the BitBucket forge @@ -33,21 +33,22 @@ def list_bitbucket_full(self, split=None, **lister_args): lister = BitBucketLister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) if not ranges: - self.log.info('Nothing to list') + self.log.info("Nothing to list") return random.shuffle(ranges) - promise = group(_range_bitbucket_lister.s(minv, maxv, **lister_args) - for minv, maxv in ranges)() - self.log.debug('%s OK (spawned %s subtasks)', (self.name, len(ranges))) + promise = group( + _range_bitbucket_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges + )() + self.log.debug("%s OK (spawned %s subtasks)", (self.name, len(ranges))) try: promise.save() # so that we can restore the GroupResult in tests except (NotImplementedError, AttributeError): - self.log.info('Unable to call save_group with current result backend.') + self.log.info("Unable to call save_group with current result backend.") # FIXME: what to do in terms of return here? return promise.id -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py index eda17d6..191b7c5 100644 --- a/swh/lister/bitbucket/tests/test_lister.py +++ b/swh/lister/bitbucket/tests/test_lister.py @@ -26,12 +26,12 @@ def _convert_type(req_index): class BitBucketListerTester(HttpListerTester, unittest.TestCase): Lister = BitBucketLister - test_re = re.compile(r'/repositories\?after=([^?&]+)') - lister_subdir = 'bitbucket' - good_api_response_file = 'data/https_api.bitbucket.org/response.json' - bad_api_response_file = 'data/https_api.bitbucket.org/empty_response.json' - first_index = _convert_type('2008-07-12T07:44:01.476818+00:00') - last_index = _convert_type('2008-07-19T06:16:43.044743+00:00') + test_re = re.compile(r"/repositories\?after=([^?&]+)") + lister_subdir = "bitbucket" + good_api_response_file = "data/https_api.bitbucket.org/response.json" + bad_api_response_file = "data/https_api.bitbucket.org/empty_response.json" + first_index = _convert_type("2008-07-12T07:44:01.476818+00:00") + last_index = _convert_type("2008-07-19T06:16:43.044743+00:00") entries_per_page = 10 convert_type = _convert_type @@ -57,57 +57,64 @@ class BitBucketListerTester(HttpListerTester, unittest.TestCase): self.disable_db(fl) # stores no results - fl.run(min_bound=self.first_index - timedelta(days=3), - max_bound=self.first_index) + fl.run( + min_bound=self.first_index - timedelta(days=3), max_bound=self.first_index + ) def test_is_within_bounds(self): fl = self.get_fl() - self.assertTrue(fl.is_within_bounds( - iso8601.parse_date('2008-07-15'), - self.first_index, self.last_index)) - self.assertFalse(fl.is_within_bounds( - iso8601.parse_date('2008-07-20'), - self.first_index, self.last_index)) - self.assertFalse(fl.is_within_bounds( - iso8601.parse_date('2008-07-11'), - self.first_index, self.last_index)) + self.assertTrue( + fl.is_within_bounds( + iso8601.parse_date("2008-07-15"), self.first_index, self.last_index + ) + ) + self.assertFalse( + fl.is_within_bounds( + iso8601.parse_date("2008-07-20"), self.first_index, self.last_index + ) + ) + self.assertFalse( + fl.is_within_bounds( + iso8601.parse_date("2008-07-11"), self.first_index, self.last_index + ) + ) def test_lister_bitbucket(swh_listers, requests_mock_datadir): """Simple bitbucket listing should create scheduled 
tasks (git, hg) """ - lister = swh_listers['bitbucket'] + lister = swh_listers["bitbucket"] lister.run() - r = lister.scheduler.search_tasks(task_type='load-hg') + r = lister.scheduler.search_tasks(task_type="load-hg") assert len(r) == 9 for row in r: - args = row['arguments']['args'] - kwargs = row['arguments']['kwargs'] + args = row["arguments"]["args"] + kwargs = row["arguments"]["kwargs"] assert len(args) == 0 assert len(kwargs) == 1 - url = kwargs['url'] + url = kwargs["url"] - assert url.startswith('https://bitbucket.org') + assert url.startswith("https://bitbucket.org") - assert row['policy'] == 'recurring' - assert row['priority'] is None + assert row["policy"] == "recurring" + assert row["priority"] is None - r = lister.scheduler.search_tasks(task_type='load-git') + r = lister.scheduler.search_tasks(task_type="load-git") assert len(r) == 1 for row in r: - args = row['arguments']['args'] - kwargs = row['arguments']['kwargs'] + args = row["arguments"]["args"] + kwargs = row["arguments"]["kwargs"] assert len(args) == 0 assert len(kwargs) == 1 - url = kwargs['url'] + url = kwargs["url"] - assert url.startswith('https://bitbucket.org') + assert url.startswith("https://bitbucket.org") - assert row['policy'] == 'recurring' - assert row['priority'] is None + assert row["policy"] == "recurring" + assert row["priority"] is None diff --git a/swh/lister/bitbucket/tests/test_tasks.py b/swh/lister/bitbucket/tests/test_tasks.py index bd881ab..9441f09 100644 --- a/swh/lister/bitbucket/tests/test_tasks.py +++ b/swh/lister/bitbucket/tests/test_tasks.py @@ -5,23 +5,21 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.bitbucket.tasks.ping') + res = swh_app.send_task("swh.lister.bitbucket.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.bitbucket.tasks.BitBucketLister') +@patch("swh.lister.bitbucket.tasks.BitBucketLister") def test_incremental(lister, swh_app, celery_session_worker): # setup the mocked BitbucketLister lister.return_value = lister lister.db_last_index.return_value = 42 lister.run.return_value = None - res = swh_app.send_task( - 'swh.lister.bitbucket.tasks.IncrementalBitBucketLister') + res = swh_app.send_task("swh.lister.bitbucket.tasks.IncrementalBitBucketLister") assert res res.wait() assert res.successful() @@ -31,15 +29,15 @@ def test_incremental(lister, swh_app, celery_session_worker): lister.run.assert_called_once_with(min_bound=42, max_bound=None) -@patch('swh.lister.bitbucket.tasks.BitBucketLister') +@patch("swh.lister.bitbucket.tasks.BitBucketLister") def test_range(lister, swh_app, celery_session_worker): # setup the mocked BitbucketLister lister.return_value = lister lister.run.return_value = None res = swh_app.send_task( - 'swh.lister.bitbucket.tasks.RangeBitBucketLister', - kwargs=dict(start=12, end=42)) + "swh.lister.bitbucket.tasks.RangeBitBucketLister", kwargs=dict(start=12, end=42) + ) assert res res.wait() assert res.successful() @@ -49,16 +47,14 @@ def test_range(lister, swh_app, celery_session_worker): lister.run.assert_called_once_with(min_bound=12, max_bound=42) -@patch('swh.lister.bitbucket.tasks.BitBucketLister') +@patch("swh.lister.bitbucket.tasks.BitBucketLister") def test_relister(lister, swh_app, celery_session_worker): # setup the mocked BitbucketLister lister.return_value = lister lister.run.return_value = None - lister.db_partition_indices.return_value = [ - (i, i+9) for i in range(0, 50, 10)] + 
lister.db_partition_indices.return_value = [(i, i + 9) for i in range(0, 50, 10)] - res = swh_app.send_task( - 'swh.lister.bitbucket.tasks.FullBitBucketRelister') + res = swh_app.send_task("swh.lister.bitbucket.tasks.FullBitBucketRelister") assert res res.wait() @@ -85,5 +81,6 @@ def test_relister(lister, swh_app, celery_session_worker): # lister.run should have been called once per partition interval for i in range(5): - assert (dict(min_bound=10*i, max_bound=10*i + 9),) \ - in lister.run.call_args_list + assert ( + dict(min_bound=10 * i, max_bound=10 * i + 9), + ) in lister.run.call_args_list diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py index 00d5788..f5f9cf6 100644 --- a/swh/lister/cgit/__init__.py +++ b/swh/lister/cgit/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import CGitModel from .lister import CGitLister - return {'models': [CGitModel], - 'lister': CGitLister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [CGitModel], + "lister": CGitLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index bf37ea3..5d88edc 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -50,13 +50,13 @@ class CGitLister(ListerBase): Args: 'https://git.savannah.gnu.org/git/elisp-es.git' """ + MODEL = CGitModel - DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/' - LISTER_NAME = 'cgit' + DEFAULT_URL = "https://git.savannah.gnu.org/cgit/" + LISTER_NAME = "cgit" url_prefix_present = True - def __init__(self, url=None, instance=None, - override_config=None): + def __init__(self, url=None, instance=None, override_config=None): """Lister class for CGit repositories. Args: @@ -69,7 +69,7 @@ class CGitLister(ListerBase): super().__init__(override_config=override_config) if url is None: - url = self.config.get('url', self.DEFAULT_URL) + url = self.config.get("url", self.DEFAULT_URL) self.url = url if not instance: @@ -78,23 +78,22 @@ class CGitLister(ListerBase): self.session = Session() self.session.mount(self.url, HTTPAdapter(max_retries=3)) self.session.headers = { - 'User-Agent': USER_AGENT, + "User-Agent": USER_AGENT, } def run(self) -> Dict[str, str]: - status = 'uneventful' + status = "uneventful" total = 0 for repos in grouper(self.get_repos(), 10): - models = list(filter(None, (self.build_model(repo) - for repo in repos))) + models = list(filter(None, (self.build_model(repo) for repo in repos))) injected_repos = self.inject_repo_data_into_db(models) self.schedule_missing_tasks(models, injected_repos) self.db_session.commit() total += len(injected_repos) - logger.debug('Scheduled %s tasks for %s', total, self.url) - status = 'eventful' + logger.debug("Scheduled %s tasks for %s", total, self.url) + status = "eventful" - return {'status': status} + return {"status": status} def get_repos(self) -> Generator[str, None, None]: """Generate git 'project' URLs found on the current CGit server @@ -103,16 +102,16 @@ class CGitLister(ListerBase): next_page = self.url while next_page: bs_idx = self.get_and_parse(next_page) - for tr in bs_idx.find( - 'div', {"class": "content"}).find_all( - "tr", {"class": ""}): - yield urljoin(self.url, tr.find('a')['href']) + for tr in bs_idx.find("div", {"class": "content"}).find_all( + "tr", {"class": ""} + ): + yield urljoin(self.url, tr.find("a")["href"]) try: - pager = bs_idx.find('ul', {'class': 'pager'}) - current_page = pager.find('a', {'class': 'current'}) + pager = bs_idx.find("ul", {"class": "pager"}) + current_page = 
pager.find("a", {"class": "current"}) if current_page: - next_page = current_page.parent.next_sibling.a['href'] + next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page @@ -123,28 +122,28 @@ class CGitLister(ListerBase): return the repo description (dict) suitable for insertion in the db. """ bs = self.get_and_parse(repo_url) - urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})] + urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: return None # look for the http/https url, if any, and use it as origin_url for url in urls: - if urlparse(url).scheme in ('http', 'https'): + if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] - return {'uid': repo_url, - 'name': bs.find('a', title=re.compile('.+'))['title'], - 'origin_type': 'git', - 'instance': self.instance, - 'origin_url': origin_url, - } + return { + "uid": repo_url, + "name": bs.find("a", title=re.compile(".+"))["title"], + "origin_type": "git", + "instance": self.instance, + "origin_url": origin_url, + } def get_and_parse(self, url: str) -> BeautifulSoup: "Get the given url and parse the retrieved HTML using BeautifulSoup" - return BeautifulSoup(self.session.get(url).text, - features='html.parser') + return BeautifulSoup(self.session.get(url).text, features="html.parser") diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py index be10161..61bc545 100644 --- a/swh/lister/cgit/models.py +++ b/swh/lister/cgit/models.py @@ -11,7 +11,8 @@ class CGitModel(ModelBase): """a CGit repository representation """ - __tablename__ = 'cgit_repo' + + __tablename__ = "cgit_repo" uid = Column(String, primary_key=True) instance = Column(String, index=True) diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py index 2d60e36..2e41133 100644 --- a/swh/lister/cgit/tasks.py +++ b/swh/lister/cgit/tasks.py @@ -7,12 +7,12 @@ from celery import shared_task from .lister import CGitLister -@shared_task(name=__name__ + '.CGitListerTask') +@shared_task(name=__name__ + ".CGitListerTask") def list_cgit(**lister_args): - '''Lister task for CGit instances''' + """Lister task for CGit instances""" return CGitLister(**lister_args).run() -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index ca8ddd5..bc71e15 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -7,38 +7,38 @@ from swh.lister import __version__ def test_lister_no_page(requests_mock_datadir, swh_listers): - lister = swh_listers['cgit'] + lister = swh_listers["cgit"] - assert lister.url == 'https://git.savannah.gnu.org/cgit/' + assert lister.url == "https://git.savannah.gnu.org/cgit/" repos = list(lister.get_repos()) assert len(repos) == 977 - assert repos[0] == 'https://git.savannah.gnu.org/cgit/elisp-es.git/' + assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" # note the url below is NOT a subpath of /cgit/ - assert repos[-1] == 'https://git.savannah.gnu.org/path/to/yetris.git/' # noqa + assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa # note the url below is NOT on the same server - assert repos[-2] == 'http://example.org/cgit/xstarcastle.git/' + assert repos[-2] == "http://example.org/cgit/xstarcastle.git/" def 
test_lister_model(requests_mock_datadir, swh_listers): - lister = swh_listers['cgit'] + lister = swh_listers["cgit"] repo = next(lister.get_repos()) model = lister.build_model(repo) assert model == { - 'uid': 'https://git.savannah.gnu.org/cgit/elisp-es.git/', - 'name': 'elisp-es.git', - 'origin_type': 'git', - 'instance': 'git.savannah.gnu.org', - 'origin_url': 'https://git.savannah.gnu.org/git/elisp-es.git' - } + "uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/", + "name": "elisp-es.git", + "origin_type": "git", + "instance": "git.savannah.gnu.org", + "origin_url": "https://git.savannah.gnu.org/git/elisp-es.git", + } def test_lister_with_pages(requests_mock_datadir, swh_listers): - lister = swh_listers['cgit'] - lister.url = 'https://git.tizen/cgit/' + lister = swh_listers["cgit"] + lister.url = "https://git.tizen/cgit/" repos = list(lister.get_repos()) # we should have 16 repos (listed on 3 pages) @@ -46,37 +46,37 @@ def test_lister_with_pages(requests_mock_datadir, swh_listers): def test_lister_run(requests_mock_datadir, swh_listers): - lister = swh_listers['cgit'] - lister.url = 'https://git.tizen/cgit/' + lister = swh_listers["cgit"] + lister.url = "https://git.tizen/cgit/" lister.run() - r = lister.scheduler.search_tasks(task_type='load-git') + r = lister.scheduler.search_tasks(task_type="load-git") assert len(r) == 16 for row in r: - assert row['type'] == 'load-git' + assert row["type"] == "load-git" # arguments check - args = row['arguments']['args'] + args = row["arguments"]["args"] assert len(args) == 0 # kwargs - kwargs = row['arguments']['kwargs'] + kwargs = row["arguments"]["kwargs"] assert len(kwargs) == 1 - url = kwargs['url'] - assert url.startswith('https://git.tizen') + url = kwargs["url"] + assert url.startswith("https://git.tizen") - assert row['policy'] == 'recurring' - assert row['priority'] is None + assert row["policy"] == "recurring" + assert row["priority"] is None def test_lister_requests(requests_mock_datadir, swh_listers): - lister = swh_listers['cgit'] - lister.url = 'https://git.tizen/cgit/' + lister = swh_listers["cgit"] + lister.url = "https://git.tizen/cgit/" lister.run() assert len(requests_mock_datadir.request_history) != 0 for request in requests_mock_datadir.request_history: - assert 'User-Agent' in request.headers - user_agent = request.headers['User-Agent'] - assert 'Software Heritage Lister' in user_agent + assert "User-Agent" in request.headers + user_agent = request.headers["User-Agent"] + assert "Software Heritage Lister" in user_agent assert __version__ in user_agent diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py index 38bf7b7..866bfde 100644 --- a/swh/lister/cgit/tests/test_tasks.py +++ b/swh/lister/cgit/tests/test_tasks.py @@ -2,29 +2,27 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.cgit.tasks.ping') + res = swh_app.send_task("swh.lister.cgit.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.cgit.tasks.CGitLister') +@patch("swh.lister.cgit.tasks.CGitLister") def test_lister(lister, swh_app, celery_session_worker): # setup the mocked CGitLister lister.return_value = lister lister.run.return_value = None res = swh_app.send_task( - 'swh.lister.cgit.tasks.CGitListerTask', - kwargs=dict(url='https://git.kernel.org/', instance='kernel')) + "swh.lister.cgit.tasks.CGitListerTask", + kwargs=dict(url="https://git.kernel.org/", instance="kernel"), + 
) assert res res.wait() assert res.successful() - lister.assert_called_once_with( - url='https://git.kernel.org/', - instance='kernel') + lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel") lister.db_last_index.assert_not_called() lister.run.assert_called_once_with() diff --git a/swh/lister/cli.py b/swh/lister/cli.py index 365c36a..c725213 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -23,104 +23,123 @@ logger = logging.getLogger(__name__) # value used when inserting a new task-type in the scheduler db will be the one # under the 'full' key below (because it matches xxx_full). DEFAULT_TASK_TYPE = { - 'full': { # for tasks like 'list_xxx_full()' - 'default_interval': '90 days', - 'min_interval': '90 days', - 'max_interval': '90 days', - 'backoff_factor': 1 - }, - '*': { # value if not suffix matches - 'default_interval': '1 day', - 'min_interval': '1 day', - 'max_interval': '1 day', - 'backoff_factor': 1 - }, - } + "full": { # for tasks like 'list_xxx_full()' + "default_interval": "90 days", + "min_interval": "90 days", + "max_interval": "90 days", + "backoff_factor": 1, + }, + "*": { # value if no suffix matches + "default_interval": "1 day", + "min_interval": "1 day", + "max_interval": "1 day", + "backoff_factor": 1, + }, +} -@click.group(name='lister', context_settings=CONTEXT_SETTINGS) -@click.option('--config-file', '-C', default=None, - type=click.Path(exists=True, dir_okay=False,), - help="Configuration file.") -@click.option('--db-url', '-d', default=None, - help='SQLAlchemy DB URL; see ' - '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa +@click.group(name="lister", context_settings=CONTEXT_SETTINGS) +@click.option( + "--config-file", + "-C", + default=None, + type=click.Path(exists=True, dir_okay=False,), + help="Configuration file.", +) +@click.option( + "--db-url", + "-d", + default=None, + help="SQLAlchemy DB URL; see " + "<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>", +) # noqa @click.pass_context def lister(ctx, config_file, db_url): - '''Software Heritage Lister tools.''' + """Software Heritage Lister tools.""" from swh.core import config + ctx.ensure_object(dict) if not config_file: - config_file = os.environ.get('SWH_CONFIG_FILENAME') + config_file = os.environ.get("SWH_CONFIG_FILENAME") conf = config.read(config_file) if db_url: - conf['lister'] = { - 'cls': 'local', - 'args': {'db': db_url} - } - ctx.obj['config'] = conf + conf["lister"] = {"cls": "local", "args": {"db": db_url}} + ctx.obj["config"] = conf -@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS) -@click.option('--drop-tables', '-D', is_flag=True, default=False, - help='Drop tables before creating the database schema') +@lister.command(name="db-init", context_settings=CONTEXT_SETTINGS) +@click.option( + "--drop-tables", + "-D", + is_flag=True, + default=False, + help="Drop tables before creating the database schema", +) @click.pass_context def db_init(ctx, drop_tables): """Initialize the database model for given listers. 
""" - cfg = ctx.obj['config'] - lister_cfg = cfg['lister'] - if lister_cfg['cls'] != 'local': - click.echo('A local lister configuration is required') + cfg = ctx.obj["config"] + lister_cfg = cfg["lister"] + if lister_cfg["cls"] != "local": + click.echo("A local lister configuration is required") ctx.exit(1) - db_url = lister_cfg['args']['db'] + db_url = lister_cfg["args"]["db"] db_engine = create_engine(db_url) registry = {} for lister, entrypoint in LISTERS.items(): - logger.info('Loading lister %s', lister) + logger.info("Loading lister %s", lister) registry[lister] = entrypoint.load()() - logger.info('Initializing database') + logger.info("Initializing database") initialize(db_engine, drop_tables) for lister, entrypoint in LISTERS.items(): registry_entry = registry[lister] - init_hook = registry_entry.get('init') + init_hook = registry_entry.get("init") if callable(init_hook): - logger.info('Calling init hook for %s', lister) + logger.info("Calling init hook for %s", lister) init_hook(db_engine) -@lister.command(name='run', context_settings=CONTEXT_SETTINGS, - help='Trigger a full listing run for a particular forge ' - 'instance. The output of this listing results in ' - '"oneshot" tasks in the scheduler db with a priority ' - 'defined by the user') -@click.option('--lister', '-l', help='Lister to run', - type=click.Choice(SUPPORTED_LISTERS)) -@click.option('--priority', '-p', default='high', - type=click.Choice(['high', 'medium', 'low']), - help='Task priority for the listed repositories to ingest') -@click.argument('options', nargs=-1) +@lister.command( + name="run", + context_settings=CONTEXT_SETTINGS, + help="Trigger a full listing run for a particular forge " + "instance. The output of this listing results in " + '"oneshot" tasks in the scheduler db with a priority ' + "defined by the user", +) +@click.option( + "--lister", "-l", help="Lister to run", type=click.Choice(SUPPORTED_LISTERS) +) +@click.option( + "--priority", + "-p", + default="high", + type=click.Choice(["high", "medium", "low"]), + help="Task priority for the listed repositories to ingest", +) +@click.argument("options", nargs=-1) @click.pass_context def run(ctx, lister, priority, options): from swh.scheduler.cli.utils import parse_options - config = deepcopy(ctx.obj['config']) + config = deepcopy(ctx.obj["config"]) if options: config.update(parse_options(options)[1]) - config['priority'] = priority - config['policy'] = 'oneshot' + config["priority"] = priority + config["policy"] = "oneshot" get_lister(lister, **config).run() -if __name__ == '__main__': +if __name__ == "__main__": lister() diff --git a/swh/lister/core/abstractattribute.py b/swh/lister/core/abstractattribute.py index fdb4219..01eb84a 100644 --- a/swh/lister/core/abstractattribute.py +++ b/swh/lister/core/abstractattribute.py @@ -20,8 +20,9 @@ class AbstractAttribute: AbstractAttribute('docstring for foo') """ + __isabstractmethod__ = True def __init__(self, docstring=None): if docstring is not None: - self.__doc__ = 'AbstractAttribute: ' + docstring + self.__doc__ = "AbstractAttribute: " + docstring diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py index d13933d..f7c6aa4 100644 --- a/swh/lister/core/indexing_lister.py +++ b/swh/lister/core/indexing_lister.py @@ -49,18 +49,19 @@ class IndexingLister(ListerBase): def get_next_target_from_response """ + flush_packet_db = 20 """Number of iterations in-between write flushes of lister repositories to db (see fn:`run`). 
""" - default_min_bound = '' + default_min_bound = "" """Default initialization value for the minimum boundary index to use when undefined (see fn:`run`). """ @abc.abstractmethod def get_next_target_from_response( - self, response: Response + self, response: Response ) -> Union[Optional[datetime], Optional[str], Optional[int]]: """Find the next server endpoint identifier given the entire response. @@ -78,14 +79,16 @@ class IndexingLister(ListerBase): # You probably don't need to override anything below this line. def filter_before_inject( - self, models_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + self, models_list: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: """Overrides ListerBase.filter_before_inject Bounds query results by this Lister's set max_index. """ models_list = [ - m for m in models_list - if self.is_within_bounds(m['indexable'], None, self.max_index) + m + for m in models_list + if self.is_within_bounds(m["indexable"], None, self.max_index) ] return models_list @@ -108,7 +111,7 @@ class IndexingLister(ListerBase): return retlist def db_partition_indices( - self, partition_size: int + self, partition_size: int ) -> List[Tuple[Optional[int], Optional[int]]]: """Describe an index-space compartmentalization of the db table in equal sized chunks. This is used to describe min&max bounds for @@ -135,14 +138,19 @@ class IndexingLister(ListerBase): return [] if isinstance(min_index, str): + def format_bound(bound): return bound.isoformat() + min_index = dateutil.parser.parse(min_index) max_index = dateutil.parser.parse(max_index) elif isinstance(max_index - min_index, int): + def format_bound(bound): return int(bound) + else: + def format_bound(bound): return bound @@ -156,9 +164,7 @@ class IndexingLister(ListerBase): # Trim duplicate bounds bounds.append(None) - bounds = [cur - for cur, next in zip(bounds[:-1], bounds[1:]) - if cur != next] + bounds = [cur for cur, next in zip(bounds[:-1], bounds[1:]) if cur != next] # Remove bounds for lowest and highest partition bounds[0] = bounds[-1] = None @@ -204,8 +210,9 @@ class IndexingLister(ListerBase): deleted_repos = self.winnow_models( self.db_query_range(start, end), self.MODEL.uid, keep_these ) - tasks_to_disable = [repo.task_id for repo in deleted_repos - if repo.task_id is not None] + tasks_to_disable = [ + repo.task_id for repo in deleted_repos if repo.task_id is not None + ] if tasks_to_disable: self.scheduler.disable_tasks(tasks_to_disable) for repo in deleted_repos: @@ -224,7 +231,7 @@ class IndexingLister(ListerBase): Returns: nothing """ - status = 'uneventful' + status = "uneventful" self.min_index = min_bound self.max_index = max_bound @@ -233,7 +240,7 @@ class IndexingLister(ListerBase): for i in count(1): response, injected_repos = self.ingest_data(index) if not response and not injected_repos: - logger.info('No response from api server, stopping') + logger.info("No response from api server, stopping") return next_index = self.get_next_target_from_response(response) @@ -243,23 +250,22 @@ class IndexingLister(ListerBase): # termination condition if next_index is None or next_index == index: - logger.info('stopping after index %s, no next link found', - index) + logger.info("stopping after index %s, no next link found", index) return index = next_index - logger.debug('Index: %s', index) + logger.debug("Index: %s", index) yield i for i in ingest_indexes(): if (i % self.flush_packet_db) == 0: - logger.debug('Flushing updates at index %s', i) + logger.debug("Flushing updates at index %s", i) 
self.db_session.commit() self.db_session = self.mk_session() - status = 'eventful' + status = "eventful" self.db_session.commit() self.db_session = self.mk_session() - return {'status': status} + return {"status": status} class IndexingHttpLister(ListerHttpTransport, IndexingLister): diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py index 5c02642..ff6e086 100644 --- a/swh/lister/core/lister_base.py +++ b/swh/lister/core/lister_base.py @@ -68,11 +68,12 @@ class ListerBase(abc.ABC, config.SWHConfig): """ MODEL = AbstractAttribute( - 'Subclass type (not instance) of swh.lister.core.models.ModelBase ' - 'customized for a specific service.' + "Subclass type (not instance) of swh.lister.core.models.ModelBase " + "customized for a specific service." ) # type: Union[AbstractAttribute, Type[Any]] LISTER_NAME = AbstractAttribute( - "Lister's name") # type: Union[AbstractAttribute, str] + "Lister's name" + ) # type: Union[AbstractAttribute, str] def transport_request(self, identifier): """Given a target endpoint identifier to query, try once to request it. @@ -138,8 +139,7 @@ class ListerBase(abc.ABC, config.SWHConfig): """ pass - def filter_before_inject( - self, models_list: List[Dict]) -> List[Dict]: + def filter_before_inject(self, models_list: List[Dict]) -> List[Dict]: """Filter models_list entries prior to injection in the db. This is ran directly after `transport_response_simplified`. @@ -154,8 +154,7 @@ class ListerBase(abc.ABC, config.SWHConfig): """ return models_list - def do_additional_checks( - self, models_list: List[Dict]) -> List[Dict]: + def do_additional_checks(self, models_list: List[Dict]) -> List[Dict]: """Execute some additional checks on the model list (after the filtering). @@ -173,8 +172,8 @@ class ListerBase(abc.ABC, config.SWHConfig): return models_list def is_within_bounds( - self, inner: int, - lower: Optional[int] = None, upper: Optional[int] = None) -> bool: + self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None + ) -> bool: """See if a sortable value is inside the range [lower,upper]. MAY BE OVERRIDDEN, for example if the server indexable* key is @@ -201,11 +200,15 @@ class ListerBase(abc.ABC, config.SWHConfig): self.string_pattern_check(inner, lower, upper) except Exception as e: - logger.error(str(e) + ': %s, %s, %s' % - (('inner=%s%s' % (type(inner), inner)), - ('lower=%s%s' % (type(lower), lower)), - ('upper=%s%s' % (type(upper), upper))) - ) + logger.error( + str(e) + + ": %s, %s, %s" + % ( + ("inner=%s%s" % (type(inner), inner)), + ("lower=%s%s" % (type(lower), lower)), + ("upper=%s%s" % (type(upper), upper)), + ) + ) raise return ret @@ -213,30 +216,23 @@ class ListerBase(abc.ABC, config.SWHConfig): # You probably don't need to override anything below this line. 
DEFAULT_CONFIG = { - 'scheduler': ('dict', { - 'cls': 'remote', - 'args': { - 'url': 'http://localhost:5008/' - }, - }), - 'lister': ('dict', { - 'cls': 'local', - 'args': { - 'db': 'postgresql:///lister', - }, - }), + "scheduler": ( + "dict", + {"cls": "remote", "args": {"url": "http://localhost:5008/"},}, + ), + "lister": ("dict", {"cls": "local", "args": {"db": "postgresql:///lister",},}), } @property def CONFIG_BASE_FILENAME(self): # noqa: N802 - return 'lister_%s' % self.LISTER_NAME + return "lister_%s" % self.LISTER_NAME @property def ADDITIONAL_CONFIG(self): # noqa: N802 return { - 'credentials': ('dict', {}), - 'cache_responses': ('bool', False), - 'cache_dir': ('str', '~/.cache/swh/lister/%s' % self.LISTER_NAME), + "credentials": ("dict", {}), + "cache_responses": ("bool", False), + "cache_dir": ("str", "~/.cache/swh/lister/%s" % self.LISTER_NAME), } INITIAL_BACKOFF = 10 @@ -245,21 +241,21 @@ class ListerBase(abc.ABC, config.SWHConfig): def __init__(self, override_config=None): self.backoff = self.INITIAL_BACKOFF - logger.debug('Loading config from %s' % self.CONFIG_BASE_FILENAME) + logger.debug("Loading config from %s" % self.CONFIG_BASE_FILENAME) self.config = self.parse_config_file( base_filename=self.CONFIG_BASE_FILENAME, - additional_configs=[self.ADDITIONAL_CONFIG] + additional_configs=[self.ADDITIONAL_CONFIG], ) - self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir']) - if self.config['cache_responses']: - config.prepare_folders(self.config, 'cache_dir') + self.config["cache_dir"] = os.path.expanduser(self.config["cache_dir"]) + if self.config["cache_responses"]: + config.prepare_folders(self.config, "cache_dir") if override_config: self.config.update(override_config) - logger.debug('%s CONFIG=%s' % (self, self.config)) - self.scheduler = get_scheduler(**self.config['scheduler']) - self.db_engine = create_engine(self.config['lister']['args']['db']) + logger.debug("%s CONFIG=%s" % (self, self.config)) + self.scheduler = get_scheduler(**self.config["scheduler"]) + self.db_engine = create_engine(self.config["lister"]["args"]["db"]) self.mk_session = sessionmaker(bind=self.db_engine) self.db_session = self.mk_session() @@ -285,7 +281,7 @@ class ListerBase(abc.ABC, config.SWHConfig): server response """ retries_left = self.MAX_RETRIES - do_cache = self.config['cache_responses'] + do_cache = self.config["cache_responses"] r = None while retries_left > 0: try: @@ -293,8 +289,9 @@ class ListerBase(abc.ABC, config.SWHConfig): except FetchError: # network-level connection error, try again logger.warning( - 'connection error on %s: sleep for %d seconds' % - (identifier, self.CONN_SLEEP)) + "connection error on %s: sleep for %d seconds" + % (identifier, self.CONN_SLEEP) + ) time.sleep(self.CONN_SLEEP) retries_left -= 1 continue @@ -306,8 +303,8 @@ class ListerBase(abc.ABC, config.SWHConfig): must_retry, delay = self.transport_quota_check(r) if must_retry: logger.warning( - 'rate limited on %s: sleep for %f seconds' % - (identifier, delay)) + "rate limited on %s: sleep for %f seconds" % (identifier, delay) + ) time.sleep(delay) else: # request ok break @@ -315,8 +312,7 @@ class ListerBase(abc.ABC, config.SWHConfig): retries_left -= 1 if not retries_left: - logger.warning( - 'giving up on %s: max retries exceeded' % identifier) + logger.warning("giving up on %s: max retries exceeded" % identifier) return r @@ -332,8 +328,7 @@ class ListerBase(abc.ABC, config.SWHConfig): """ if isinstance(key, str): key = self.MODEL.__dict__[key] - return self.db_session.query(self.MODEL) 
\ - .filter(key == value).first() + return self.db_session.query(self.MODEL).filter(key == value).first() def winnow_models(self, mlist, key, to_remove): """Given a list of models, remove any with matching @@ -358,8 +353,7 @@ class ListerBase(abc.ABC, config.SWHConfig): def db_num_entries(self): """Return the known number of entries in the lister db""" - return self.db_session.query(func.count('*')).select_from(self.MODEL) \ - .scalar() + return self.db_session.query(func.count("*")).select_from(self.MODEL).scalar() def db_inject_repo(self, model_dict): """Add/update a new repo to the db and mark it last_seen now. @@ -372,7 +366,7 @@ class ListerBase(abc.ABC, config.SWHConfig): object associated with the injection """ - sql_repo = self.db_query_equal('uid', model_dict['uid']) + sql_repo = self.db_query_equal("uid", model_dict["uid"]) if not sql_repo: sql_repo = self.MODEL(**model_dict) @@ -384,8 +378,7 @@ class ListerBase(abc.ABC, config.SWHConfig): return sql_repo - def task_dict(self, origin_type: str, - origin_url: str, **kwargs) -> Dict[str, Any]: + def task_dict(self, origin_type: str, origin_url: str, **kwargs) -> Dict[str, Any]: """Return special dict format for the tasks list Args: @@ -394,11 +387,11 @@ class ListerBase(abc.ABC, config.SWHConfig): Returns: the same information in a different form """ - logger.debug('origin-url: %s, type: %s', origin_url, origin_type) - _type = 'load-%s' % origin_type - _policy = kwargs.get('policy', 'recurring') - priority = kwargs.get('priority') - kw = {'priority': priority} if priority else {} + logger.debug("origin-url: %s, type: %s", origin_url, origin_type) + _type = "load-%s" % origin_type + _policy = kwargs.get("policy", "recurring") + priority = kwargs.get("priority") + kw = {"priority": priority} if priority else {} return utils.create_task_dict(_type, _policy, url=origin_url, **kw) def string_pattern_check(self, a, b, c=None): @@ -420,14 +413,15 @@ class ListerBase(abc.ABC, config.SWHConfig): pattern. """ if isinstance(a, str): - a_pattern = re.sub('[a-zA-Z0-9]', - '[a-zA-Z0-9]', - re.escape(a)) - if (isinstance(b, str) and (re.match(a_pattern, b) is None) - or isinstance(c, str) and - (re.match(a_pattern, c) is None)): + a_pattern = re.sub("[a-zA-Z0-9]", "[a-zA-Z0-9]", re.escape(a)) + if ( + isinstance(b, str) + and (re.match(a_pattern, b) is None) + or isinstance(c, str) + and (re.match(a_pattern, c) is None) + ): logger.debug(a_pattern) - raise TypeError('incomparable string patterns detected') + raise TypeError("incomparable string patterns detected") def inject_repo_data_into_db(self, models_list: List[Dict]) -> Dict: """Inject data into the db. @@ -441,11 +435,12 @@ class ListerBase(abc.ABC, config.SWHConfig): """ injected_repos = {} for m in models_list: - injected_repos[m['uid']] = self.db_inject_repo(m) + injected_repos[m["uid"]] = self.db_inject_repo(m) return injected_repos def schedule_missing_tasks( - self, models_list: List[Dict], injected_repos: Dict) -> None: + self, models_list: List[Dict], injected_repos: Dict + ) -> None: """Schedule any newly created db entries that do not have been scheduled yet. 
@@ -463,20 +458,17 @@ class ListerBase(abc.ABC, config.SWHConfig): tasks = {} def _task_key(m): - return '%s-%s' % ( - m['type'], - json.dumps(m['arguments'], sort_keys=True) - ) + return "%s-%s" % (m["type"], json.dumps(m["arguments"], sort_keys=True)) for m in models_list: - ir = injected_repos[m['uid']] + ir = injected_repos[m["uid"]] if not ir.task_id: # Patching the model instance to add the policy/priority task # scheduling - if 'policy' in self.config: - m['policy'] = self.config['policy'] - if 'priority' in self.config: - m['priority'] = self.config['priority'] + if "policy" in self.config: + m["policy"] = self.config["policy"] + if "priority" in self.config: + m["priority"] = self.config["priority"] task_dict = self.task_dict(**m) tasks[_task_key(task_dict)] = (ir, m, task_dict) @@ -485,7 +477,7 @@ class ListerBase(abc.ABC, config.SWHConfig): new_tasks = self.scheduler.create_tasks(list(grouped_tasks)) for task in new_tasks: ir, m, _ = tasks[_task_key(task)] - ir.task_id = task['id'] + ir.task_id = task["id"] def ingest_data(self, identifier: int, checks: bool = False): """The core data fetch sequence. Request server endpoint. Simplify and @@ -523,13 +515,7 @@ class ListerBase(abc.ABC, config.SWHConfig): """ datepath = utcnow().isoformat() - fname = os.path.join( - self.config['cache_dir'], - datepath + '.gz', - ) + fname = os.path.join(self.config["cache_dir"], datepath + ".gz",) - with gzip.open(fname, 'w') as f: - f.write(bytes( - self.transport_response_to_string(response), - 'UTF-8' - )) + with gzip.open(fname, "w") as f: + f.write(bytes(self.transport_response_to_string(response), "UTF-8")) diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index f4d6920..a1027ac 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -29,14 +29,14 @@ class ListerHttpTransport(abc.ABC): To be used in conjunction with ListerBase or a subclass of it. """ + DEFAULT_URL = None # type: Optional[str] - PATH_TEMPLATE = \ - AbstractAttribute( - 'string containing a python string format pattern that produces' - ' the API endpoint path for listing stored repositories when given' - ' an index, e.g., "/repositories?after=%s". To be implemented in' - ' the API-specific class inheriting this.' - ) # type: Union[AbstractAttribute, Optional[str]] + PATH_TEMPLATE = AbstractAttribute( + "string containing a python string format pattern that produces" + " the API endpoint path for listing stored repositories when given" + ' an index, e.g., "/repositories?after=%s". To be implemented in' + " the API-specific class inheriting this." + ) # type: Union[AbstractAttribute, Optional[str]] EXPECTED_STATUS_CODES = (200, 429, 403, 404) @@ -45,9 +45,7 @@ class ListerHttpTransport(abc.ABC): MAY BE OVERRIDDEN if request headers are needed. """ - return { - 'User-Agent': USER_AGENT_TEMPLATE % self.lister_version - } + return {"User-Agent": USER_AGENT_TEMPLATE % self.lister_version} def request_instance_credentials(self) -> List[Dict[str, Any]]: """Returns dictionary of any credentials configuration needed by the @@ -82,7 +80,7 @@ class ListerHttpTransport(abc.ABC): list of credential dicts for the current lister. 
""" - all_creds = self.config.get('credentials') # type: ignore + all_creds = self.config.get("credentials") # type: ignore if not all_creds: return [] lister_creds = all_creds.get(self.LISTER_NAME, {}) # type: ignore @@ -110,14 +108,16 @@ class ListerHttpTransport(abc.ABC): """ params = {} - params['headers'] = self.request_headers() or {} + params["headers"] = self.request_headers() or {} creds = self.request_instance_credentials() if not creds: return params auth = random.choice(creds) if creds else None if auth: - params['auth'] = (auth['username'], # type: ignore - auth['password']) + params["auth"] = ( + auth["username"], # type: ignore + auth["password"], + ) return params def transport_quota_check(self, response): @@ -130,7 +130,7 @@ class ListerHttpTransport(abc.ABC): """ if response.status_code == 429: # HTTP too many requests - retry_after = response.headers.get('Retry-After', self.back_off()) + retry_after = response.headers.get("Retry-After", self.back_off()) try: # might be seconds return True, float(retry_after) @@ -145,17 +145,16 @@ class ListerHttpTransport(abc.ABC): def __init__(self, url=None): if not url: - url = self.config.get('url') + url = self.config.get("url") if not url: url = self.DEFAULT_URL if not url: - raise NameError('HTTP Lister Transport requires an url.') + raise NameError("HTTP Lister Transport requires an url.") self.url = url # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ - def _transport_action( - self, identifier: str, method: str = 'get') -> Response: + def _transport_action(self, identifier: str, method: str = "get") -> Response: """Permit to ask information to the api prior to actually executing query. @@ -163,16 +162,16 @@ class ListerHttpTransport(abc.ABC): path = self.request_uri(identifier) params = self.request_params(identifier) - logger.debug('path: %s', path) - logger.debug('params: %s', params) - logger.debug('method: %s', method) + logger.debug("path: %s", path) + logger.debug("params: %s", params) + logger.debug("method: %s", method) try: - if method == 'head': + if method == "head": response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: - logger.warning('Failed to fetch %s: %s', path, e) + logger.warning("Failed to fetch %s: %s", path, e) raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: @@ -183,7 +182,7 @@ class ListerHttpTransport(abc.ABC): """Retrieve head information on api. """ - return self._transport_action(identifier, method='head') + return self._transport_action(identifier, method="head") def transport_request(self, identifier: str) -> Response: """Implements ListerBase.transport_request for HTTP using Requests. @@ -198,10 +197,10 @@ class ListerHttpTransport(abc.ABC): Requests responses. """ s = pformat(response.request.path_url) - s += '\n#\n' + pformat(response.request.headers) - s += '\n#\n' + pformat(response.status_code) - s += '\n#\n' + pformat(response.headers) - s += '\n#\n' + s += "\n#\n" + pformat(response.request.headers) + s += "\n#\n" + pformat(response.status_code) + s += "\n#\n" + pformat(response.headers) + s += "\n#\n" try: # json? s += pformat(response.json()) except Exception: # not json @@ -219,9 +218,10 @@ class ListerOnePageApiTransport(ListerHttpTransport): To be used in conjunction with ListerBase or a subclass of it. 
""" + PAGE = AbstractAttribute( - "URL of the API's unique page to retrieve and parse " - "for information") # type: Union[AbstractAttribute, str] + "URL of the API's unique page to retrieve and parse " "for information" + ) # type: Union[AbstractAttribute, str] PATH_TEMPLATE = None # we do not use it def __init__(self, url=None): diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py index 27eb080..7e87d78 100644 --- a/swh/lister/core/models.py +++ b/swh/lister/core/models.py @@ -25,12 +25,12 @@ class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta): class ModelBase(SQLBase, metaclass=ABCSQLMeta): """a common repository""" + __abstract__ = True - __tablename__ = \ - AbstractAttribute # type: Union[Type[AbstractAttribute], str] + __tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str] uid = AbstractAttribute( - 'Column(, primary_key=True)' + "Column(, primary_key=True)" ) # type: Union[AbstractAttribute, Column] name = Column(String, index=True) @@ -44,19 +44,18 @@ class ModelBase(SQLBase, metaclass=ABCSQLMeta): task_id = Column(Integer) def __init__(self, **kw): - kw['last_seen'] = datetime.now() + kw["last_seen"] = datetime.now() super().__init__(**kw) class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta): __abstract__ = True - __tablename__ = \ - AbstractAttribute # type: Union[Type[AbstractAttribute], str] + __tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str] # The value used for sorting, segmenting, or api query paging, # because uids aren't always sequential. indexable = AbstractAttribute( - 'Column(, index=True)' + "Column(, index=True)" ) # type: Union[AbstractAttribute, Column] @@ -72,8 +71,8 @@ def initialize(db_engine, drop_tables=False, **kwargs): (re)creating them. """ if drop_tables: - logger.info('Dropping tables') + logger.info("Dropping tables") SQLBase.metadata.drop_all(db_engine, checkfirst=True) - logger.info('Creating tables') + logger.info("Creating tables") SQLBase.metadata.create_all(db_engine, checkfirst=True) diff --git a/swh/lister/core/page_by_page_lister.py b/swh/lister/core/page_by_page_lister.py index 8bcce45..1f38a2a 100644 --- a/swh/lister/core/page_by_page_lister.py +++ b/swh/lister/core/page_by_page_lister.py @@ -37,6 +37,7 @@ class PageByPageLister(ListerBase): def get_next_target_from_response """ + @abc.abstractmethod def get_next_target_from_response(self, response): """Find the next server endpoint page given the entire response. 
@@ -87,7 +88,7 @@ class PageByPageLister(ListerBase): """ for m in models_list: - sql_repo = self.db_query_equal('uid', m['uid']) + sql_repo = self.db_query_equal("uid", m["uid"]) if sql_repo: return False return models_list @@ -110,7 +111,7 @@ class PageByPageLister(ListerBase): nothing """ - status = 'uneventful' + status = "uneventful" page = min_bound or 0 loop_count = 0 @@ -118,32 +119,30 @@ class PageByPageLister(ListerBase): self.max_page = max_bound while self.is_within_bounds(page, self.min_page, self.max_page): - logging.info('listing repos starting at %s' % page) + logging.info("listing repos starting at %s" % page) - response, injected_repos = self.ingest_data(page, - checks=check_existence) + response, injected_repos = self.ingest_data(page, checks=check_existence) if not response and not injected_repos: - logging.info('No response from api server, stopping') + logging.info("No response from api server, stopping") break elif not injected_repos: - logging.info('Repositories already seen, stopping') + logging.info("Repositories already seen, stopping") break - status = 'eventful' + status = "eventful" next_page = self.get_next_target_from_response(response) # termination condition if (next_page is None) or (next_page == page): - logging.info('stopping after page %s, no next link found' % - page) + logging.info("stopping after page %s, no next link found" % page) break else: page = next_page loop_count += 1 if loop_count == 20: - logging.info('flushing updates') + logging.info("flushing updates") loop_count = 0 self.db_session.commit() self.db_session = self.mk_session() @@ -151,7 +150,7 @@ class PageByPageLister(ListerBase): self.db_session.commit() self.db_session = self.mk_session() - return {'status': status} + return {"status": status} class PageByPageHttpLister(ListerHttpTransport, PageByPageLister): @@ -159,6 +158,7 @@ class PageByPageHttpLister(ListerHttpTransport, PageByPageLister): combining PageByPageLister and ListerHttpTransport. """ + def __init__(self, url=None, override_config=None): PageByPageLister.__init__(self, override_config=override_config) ListerHttpTransport.__init__(self, url=url) diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py index fa09ed8..9612986 100644 --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -24,6 +24,7 @@ class SimpleLister(ListerBase): information and stores those in db """ + flush_packet_db = 2 """Number of iterations in-between write flushes of lister repositories to db (see fn:`ingest_data`). 
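Note: `SimpleLister.ingest_data` (next hunk) batches models through `utils.grouper` and commits every `flush_packet_db` batches, so a long listing never holds one giant transaction open. For readers unfamiliar with the helper, an equivalent chunker can be sketched as follows (an illustrative stand-in; the project imports its own `grouper`, not this one):

    from itertools import islice

    def grouper(iterable, n):
        # yield successive lists of at most n items, mirroring the
        # utils.grouper(models_list, n=100) call in the hunk below
        it = iter(iterable)
        while True:
            chunk = list(islice(it, n))
            if not chunk:
                return
            yield chunk
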
@@ -57,14 +58,14 @@ class SimpleLister(ListerBase): all_injected = [] for i, models in enumerate(utils.grouper(models_list, n=100), start=1): models = list(models) - logging.debug('models: %s' % len(models)) + logging.debug("models: %s" % len(models)) # inject into local db injected = self.inject_repo_data_into_db(models) # queue workers self.schedule_missing_tasks(models, injected) all_injected.append(injected) if (i % self.flush_packet_db) == 0: - logger.debug('Flushing updates at index %s', i) + logger.debug("Flushing updates at index %s", i) self.db_session.commit() self.db_session = self.mk_session() @@ -88,9 +89,9 @@ class SimpleLister(ListerBase): dump_not_used_identifier = 0 response, injected_repos = self.ingest_data(dump_not_used_identifier) if not response and not injected_repos: - logging.info('No response from api server, stopping') - status = 'uneventful' + logging.info("No response from api server, stopping") + status = "uneventful" else: - status = 'eventful' + status = "eventful" - return {'status': status} + return {"status": status} diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py index b7093d8..9d3491b 100644 --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -19,13 +19,14 @@ logger = logging.getLogger(__name__) @pytest.fixture def swh_listers(request, postgresql_proc, postgresql, swh_scheduler): - db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format( - host=postgresql_proc.host, - port=postgresql_proc.port, - user='postgres', - dbname='tests') + db_url = "postgresql://{user}@{host}:{port}/{dbname}".format( + host=postgresql_proc.host, + port=postgresql_proc.port, + user="postgres", + dbname="tests", + ) - logger.debug('lister db_url: %s', db_url) + logger.debug("lister db_url: %s", db_url) listers = {} @@ -37,11 +38,13 @@ def swh_listers(request, postgresql_proc, postgresql, swh_scheduler): initialize(create_engine(db_url), drop_tables=True) # Add the load-archive-files expected by some listers (gnu, cran, ...) 
- swh_scheduler.create_task_type({ - 'type': 'load-archive-files', - 'description': 'Load archive files.', - 'backend_name': 'swh.loader.package.tasks.LoadArchive', - 'default_interval': '1 day', - }) + swh_scheduler.create_task_type( + { + "type": "load-archive-files", + "description": "Load archive files.", + "backend_name": "swh.loader.package.tasks.LoadArchive", + "default_interval": "1 day", + } + ) return listers diff --git a/swh/lister/core/tests/test_abstractattribute.py b/swh/lister/core/tests/test_abstractattribute.py index 8190d01..113ee0a 100644 --- a/swh/lister/core/tests/test_abstractattribute.py +++ b/swh/lister/core/tests/test_abstractattribute.py @@ -13,8 +13,8 @@ from swh.lister.core.abstractattribute import AbstractAttribute class BaseClass(abc.ABC): v1 = AbstractAttribute # type: Any v2 = AbstractAttribute() # type: Any - v3 = AbstractAttribute('changed docstring') # type: Any - v4 = 'qux' + v3 = AbstractAttribute("changed docstring") # type: Any + v4 = "qux" class BadSubclass1(BaseClass): @@ -22,19 +22,19 @@ class BadSubclass1(BaseClass): class BadSubclass2(BaseClass): - v1 = 'foo' - v2 = 'bar' + v1 = "foo" + v2 = "bar" class BadSubclass3(BaseClass): - v2 = 'bar' - v3 = 'baz' + v2 = "bar" + v3 = "baz" class GoodSubclass(BaseClass): - v1 = 'foo' - v2 = 'bar' - v3 = 'baz' + v1 = "foo" + v2 = "bar" + v3 = "baz" class TestAbstractAttributes(unittest.TestCase): @@ -54,13 +54,12 @@ class TestAbstractAttributes(unittest.TestCase): self.assertIsInstance(GoodSubclass(), GoodSubclass) gsc = GoodSubclass() - self.assertEqual(gsc.v1, 'foo') - self.assertEqual(gsc.v2, 'bar') - self.assertEqual(gsc.v3, 'baz') - self.assertEqual(gsc.v4, 'qux') + self.assertEqual(gsc.v1, "foo") + self.assertEqual(gsc.v2, "bar") + self.assertEqual(gsc.v3, "baz") + self.assertEqual(gsc.v4, "qux") def test_aa_docstrings(self): self.assertEqual(BaseClass.v1.__doc__, AbstractAttribute.__doc__) self.assertEqual(BaseClass.v2.__doc__, AbstractAttribute.__doc__) - self.assertEqual(BaseClass.v3.__doc__, - 'AbstractAttribute: changed docstring') + self.assertEqual(BaseClass.v3.__doc__, "AbstractAttribute: changed docstring") diff --git a/swh/lister/core/tests/test_indexing_lister.py b/swh/lister/core/tests/test_indexing_lister.py index 7e20bf1..3d29ab7 100644 --- a/swh/lister/core/tests/test_indexing_lister.py +++ b/swh/lister/core/tests/test_indexing_lister.py @@ -9,7 +9,7 @@ from swh.lister.core.indexing_lister import IndexingLister class MockedIndexingListerDbPartitionIndices(IndexingLister): # Abstract Attribute boilerplate - LISTER_NAME = 'DbPartitionIndices' + LISTER_NAME = "DbPartitionIndices" MODEL = type(None) # ABC boilerplate @@ -33,9 +33,7 @@ class MockedIndexingListerDbPartitionIndices(IndexingLister): def test_db_partition_indices(): m = MockedIndexingListerDbPartitionIndices( - num_entries=1000, - first_index=1, - last_index=10001, + num_entries=1000, first_index=1, last_index=10001, ) assert m @@ -49,9 +47,7 @@ def test_db_partition_indices(): def test_db_partition_indices_zero_first(): m = MockedIndexingListerDbPartitionIndices( - num_entries=1000, - first_index=0, - last_index=10000, + num_entries=1000, first_index=0, last_index=10000, ) assert m @@ -65,9 +61,7 @@ def test_db_partition_indices_zero_first(): def test_db_partition_indices_small_index_range(): m = MockedIndexingListerDbPartitionIndices( - num_entries=5000, - first_index=0, - last_index=5, + num_entries=5000, first_index=0, last_index=5, ) assert m @@ -78,8 +72,8 @@ def test_db_partition_indices_small_index_range(): def 
test_db_partition_indices_date_indices(): # 24 hour delta - first = datetime.datetime.fromisoformat('2019-11-01T00:00:00+00:00') - last = datetime.datetime.fromisoformat('2019-11-02T00:00:00+00:00') + first = datetime.datetime.fromisoformat("2019-11-01T00:00:00+00:00") + last = datetime.datetime.fromisoformat("2019-11-02T00:00:00+00:00") m = MockedIndexingListerDbPartitionIndices( # one entry per second @@ -102,9 +96,7 @@ def test_db_partition_indices_date_indices(): def test_db_partition_indices_float_index_range(): m = MockedIndexingListerDbPartitionIndices( - num_entries=10000, - first_index=0.0, - last_index=1.0, + num_entries=10000, first_index=0.0, last_index=1.0, ) assert m @@ -120,9 +112,7 @@ def test_db_partition_indices_float_index_range(): def test_db_partition_indices_uneven_int_index_range(): m = MockedIndexingListerDbPartitionIndices( - num_entries=5641, - first_index=0, - last_index=10000, + num_entries=5641, first_index=0, last_index=10000, ) assert m diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py index 908fd9c..a835b1e 100644 --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -22,8 +22,9 @@ def noop(*args, **kwargs): def test_version_generation(): - assert swh.lister.__version__ != 'devel', \ - "Make sure swh.lister is installed (e.g. pip install -e .)" + assert ( + swh.lister.__version__ != "devel" + ), "Make sure swh.lister is installed (e.g. pip install -e .)" class HttpListerTesterBase(abc.ABC): @@ -35,13 +36,17 @@ class HttpListerTesterBase(abc.ABC): to customize for a specific listing service. """ + Lister = AbstractAttribute( - 'Lister class to test') # type: Union[AbstractAttribute, Type[Any]] + "Lister class to test" + ) # type: Union[AbstractAttribute, Type[Any]] lister_subdir = AbstractAttribute( - 'bitbucket, github, etc.') # type: Union[AbstractAttribute, str] + "bitbucket, github, etc." 
+ ) # type: Union[AbstractAttribute, str] good_api_response_file = AbstractAttribute( - 'Example good response body') # type: Union[AbstractAttribute, str] - LISTER_NAME = 'fake-lister' + "Example good response body" + ) # type: Union[AbstractAttribute, str] + LISTER_NAME = "fake-lister" # May need to override this if the headers are used for something def response_headers(self, request): @@ -53,7 +58,7 @@ class HttpListerTesterBase(abc.ABC): def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 429 - context.headers['Retry-After'] = '1' + context.headers["Retry-After"] = "1" return '{"error":"dummy"}' def __init__(self, *args, **kwargs): @@ -89,8 +94,9 @@ class HttpListerTesterBase(abc.ABC): """ if override_config or self.fl is None: - self.fl = self.Lister(url='https://fakeurl', - override_config=override_config) + self.fl = self.Lister( + url="https://fakeurl", override_config=override_config + ) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() @@ -105,23 +111,25 @@ class HttpListerTesterBase(abc.ABC): task_id = 0 current_nb_tasks = len(self.scheduler_tasks) if current_nb_tasks > 0: - task_id = self.scheduler_tasks[-1]['id'] + 1 + task_id = self.scheduler_tasks[-1]["id"] + 1 for task in tasks: scheduler_task = dict(task) - scheduler_task.update({ - 'status': 'next_run_not_scheduled', - 'retries_left': 0, - 'priority': None, - 'id': task_id, - 'current_interval': datetime.timedelta(days=64) - }) + scheduler_task.update( + { + "status": "next_run_not_scheduled", + "retries_left": 0, + "priority": None, + "id": task_id, + "current_interval": datetime.timedelta(days=64), + } + ) self.scheduler_tasks.append(scheduler_task) task_id = task_id + 1 return self.scheduler_tasks[current_nb_tasks:] def _disable_tasks(task_ids): for task_id in task_ids: - self.scheduler_tasks[task_id]['status'] = 'disabled' + self.scheduler_tasks[task_id]["status"] = "disabled" fl.scheduler.create_tasks = Mock(wraps=_create_tasks) fl.scheduler.disable_tasks = Mock(wraps=_disable_tasks) @@ -167,26 +175,29 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): to customize for a specific listing service. """ + last_index = AbstractAttribute( - 'Last index ' - 'in good_api_response') # type: Union[AbstractAttribute, int] + "Last index " "in good_api_response" + ) # type: Union[AbstractAttribute, int] first_index = AbstractAttribute( - 'First index in ' - ' good_api_response') # type: Union[AbstractAttribute, Optional[int]] + "First index in " " good_api_response" + ) # type: Union[AbstractAttribute, Optional[int]] bad_api_response_file = AbstractAttribute( - 'Example bad response body') # type: Union[AbstractAttribute, str] + "Example bad response body" + ) # type: Union[AbstractAttribute, str] entries_per_page = AbstractAttribute( - 'Number of results in ' - 'good response') # type: Union[AbstractAttribute, int] + "Number of results in " "good response" + ) # type: Union[AbstractAttribute, int] test_re = AbstractAttribute( - 'Compiled regex matching the server url. Must capture the ' - 'index value.') # type: Union[AbstractAttribute, Pattern] + "Compiled regex matching the server url. Must capture the " "index value." + ) # type: Union[AbstractAttribute, Pattern] convert_type = str # type: Callable[..., Any] """static method used to convert the "request_index" to its right type (for indexing listers for example, this is in accordance with the model's "indexable" column). 
""" + def mock_response(self, request, context): self.fl.reset_backoff() self.rate_limit = 1 @@ -200,9 +211,11 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): else: response_file = self.bad_api_response_file - with open('swh/lister/%s/tests/%s' % (self.lister_subdir, - response_file), - 'r', encoding='utf-8') as r: + with open( + "swh/lister/%s/tests/%s" % (self.lister_subdir, response_file), + "r", + encoding="utf-8", + ) as r: return r.read() def request_index(self, request): @@ -214,12 +227,9 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): http_mocker.get(self.test_re, text=self.mock_response) db = init_db() - fl = self.get_fl(override_config={ - 'lister': { - 'cls': 'local', - 'args': {'db': db.url()} - } - }) + fl = self.get_fl( + override_config={"lister": {"cls": "local", "args": {"db": db.url()}}} + ) fl.db = db self.init_db(db, fl.MODEL) @@ -233,8 +243,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): fl.run() self.assertEqual(fl.db_last_index(), self.last_index) - ingested_repos = list(fl.db_query_range(self.first_index, - self.last_index)) + ingested_repos = list(fl.db_query_range(self.first_index, self.last_index)) self.assertEqual(len(ingested_repos), self.entries_per_page) @requests_mock.Mocker() @@ -307,13 +316,12 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): """ http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() - li = fl.transport_response_simplified( - self.get_api_response(self.first_index)) + li = fl.transport_response_simplified(self.get_api_response(self.first_index)) di = li[0] self.assertIsInstance(di, dict) - pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] + pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")] for k in pubs: - if k not in ['last_seen', 'task_id', 'id']: + if k not in ["last_seen", "task_id", "id"]: self.assertIn(k, di) @requests_mock.Mocker() @@ -322,7 +330,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): """ http_mocker.get(self.test_re, text=self.mock_limit_twice_response) - with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: + with patch.object(time, "sleep", wraps=time.sleep) as sleepmock: self.get_api_response(self.first_index) self.assertEqual(sleepmock.call_count, 2) @@ -332,13 +340,14 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): fl.run() self.assertNotEqual(len(http_mocker.request_history), 0) for request in http_mocker.request_history: - assert 'User-Agent' in request.headers - user_agent = request.headers['User-Agent'] - assert 'Software Heritage Lister' in user_agent + assert "User-Agent" in request.headers + user_agent = request.headers["User-Agent"] + assert "Software Heritage Lister" in user_agent assert swh.lister.__version__ in user_agent - def scheduled_tasks_test(self, next_api_response_file, next_last_index, - http_mocker): + def scheduled_tasks_test( + self, next_api_response_file, next_last_index, http_mocker + ): """Check that no loading tasks get disabled when processing a new page of repositories returned by a forge API """ @@ -361,7 +370,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC): # check tasks are not disabled for task in self.scheduler_tasks: - self.assertTrue(task['status'] != 'disabled') + self.assertTrue(task["status"] != "disabled") class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): @@ -372,20 +381,20 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): to customize for a specific listing service. 
""" + entries = AbstractAttribute( - 'Number of results ' - 'in good response') # type: Union[AbstractAttribute, int] + "Number of results " "in good response" + ) # type: Union[AbstractAttribute, int] PAGE = AbstractAttribute( - "URL of the server api's unique page to retrieve and " - "parse for information") # type: Union[AbstractAttribute, str] + "URL of the server api's unique page to retrieve and " "parse for information" + ) # type: Union[AbstractAttribute, str] def get_fl(self, override_config=None): """Retrieve an instance of fake lister (fl). """ if override_config or self.fl is None: - self.fl = self.Lister( - override_config=override_config) + self.fl = self.Lister(override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() @@ -399,9 +408,11 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): context.headers.update(custom_headers) response_file = self.good_api_response_file - with open('swh/lister/%s/tests/%s' % (self.lister_subdir, - response_file), - 'r', encoding='utf-8') as r: + with open( + "swh/lister/%s/tests/%s" % (self.lister_subdir, response_file), + "r", + encoding="utf-8", + ) as r: return r.read() @requests_mock.Mocker() @@ -410,7 +421,7 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): """ http_mocker.get(self.PAGE, text=self.mock_limit_twice_response) - with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: + with patch.object(time, "sleep", wraps=time.sleep) as sleepmock: self.get_api_response(0) self.assertEqual(sleepmock.call_count, 2) @@ -426,9 +437,9 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): li = fl.transport_response_simplified(li) di = li[0] self.assertIsInstance(di, dict) - pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] + pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")] for k in pubs: - if k not in ['last_seen', 'task_id', 'id']: + if k not in ["last_seen", "task_id", "id"]: self.assertIn(k, di) @requests_mock.Mocker() @@ -437,8 +448,6 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): """ http_mocker.get(self.PAGE, text=self.mock_response) - li = self.get_fl().list_packages( - self.get_api_response(0) - ) + li = self.get_fl().list_packages(self.get_api_response(0)) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries) diff --git a/swh/lister/core/tests/test_model.py b/swh/lister/core/tests/test_model.py index 9f07223..f85bbdf 100644 --- a/swh/lister/core/tests/test_model.py +++ b/swh/lister/core/tests/test_model.py @@ -16,7 +16,7 @@ class BadSubclass1(ModelBase): class BadSubclass2(ModelBase): __abstract__ = True - __tablename__ = 'foo' + __tablename__ = "foo" class BadSubclass3(BadSubclass2): @@ -36,7 +36,7 @@ class IndexingBadSubclass(IndexingModelBase): class IndexingBadSubclass2(IndexingModelBase): __abstract__ = True - __tablename__ = 'foo' + __tablename__ = "foo" class IndexingBadSubclass3(IndexingBadSubclass2): @@ -47,7 +47,7 @@ class IndexingBadSubclass3(IndexingBadSubclass2): class IndexingGoodSubclass(IndexingModelBase): uid = Column(Integer, primary_key=True) indexable = Column(Integer, index=True) - __tablename__ = 'bar' + __tablename__ = "bar" class TestModel(unittest.TestCase): @@ -65,10 +65,10 @@ class TestModel(unittest.TestCase): BadSubclass3() self.assertIsInstance(GoodSubclass(), GoodSubclass) - gsc = GoodSubclass(uid='uid') + gsc = GoodSubclass(uid="uid") - self.assertEqual(gsc.__tablename__, 'foo') - self.assertEqual(gsc.uid, 'uid') + self.assertEqual(gsc.__tablename__, "foo") + 
self.assertEqual(gsc.uid, "uid") def test_indexing_model_instancing(self): with self.assertRaises(TypeError): @@ -84,8 +84,8 @@ class TestModel(unittest.TestCase): IndexingBadSubclass3() self.assertIsInstance(IndexingGoodSubclass(), IndexingGoodSubclass) - gsc = IndexingGoodSubclass(uid='uid', indexable='indexable') + gsc = IndexingGoodSubclass(uid="uid", indexable="indexable") - self.assertEqual(gsc.__tablename__, 'bar') - self.assertEqual(gsc.uid, 'uid') - self.assertEqual(gsc.indexable, 'indexable') + self.assertEqual(gsc.__tablename__, "bar") + self.assertEqual(gsc.uid, "uid") + self.assertEqual(gsc.indexable, "indexable") diff --git a/swh/lister/cran/__init__.py b/swh/lister/cran/__init__.py index 6abfa5b..3fa6586 100644 --- a/swh/lister/cran/__init__.py +++ b/swh/lister/cran/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import CRANModel from .lister import CRANLister - return {'models': [CRANModel], - 'lister': CRANLister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [CRANModel], + "lister": CRANLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index a818dd0..6f9e738 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -19,16 +19,23 @@ from swh.scheduler.utils import create_task_dict logger = logging.getLogger(__name__) -CRAN_MIRROR = 'https://cran.r-project.org' +CRAN_MIRROR = "https://cran.r-project.org" class CRANLister(SimpleLister): MODEL = CRANModel - LISTER_NAME = 'cran' - instance = 'cran' + LISTER_NAME = "cran" + instance = "cran" - def task_dict(self, origin_type, origin_url, version=None, html_url=None, - policy=None, **kwargs): + def task_dict( + self, + origin_type, + origin_url, + version=None, + html_url=None, + policy=None, + **kwargs, + ): """Return task format dict. This creates tasks with args and kwargs set, for example:: @@ -43,15 +50,15 @@ class CRANLister(SimpleLister): """ if not policy: - policy = 'oneshot' + policy = "oneshot" artifact_url = html_url - assert origin_type == 'tar' + assert origin_type == "tar" return create_task_dict( - 'load-cran', policy, - url=origin_url, artifacts=[{ - 'url': artifact_url, - 'version': version - }], retries_left=3 + "load-cran", + policy, + url=origin_url, + artifacts=[{"url": artifact_url, "version": version}], + retries_left=3, ) def safely_issue_request(self, identifier): @@ -91,23 +98,22 @@ class CRANLister(SimpleLister): """ return read_cran_data() - def get_model_from_repo( - self, repo: Mapping[str, str]) -> Mapping[str, str]: + def get_model_from_repo(self, repo: Mapping[str, str]) -> Mapping[str, str]: """Transform from repository representation to model """ - logger.debug('repo: %s', repo) + logger.debug("repo: %s", repo) origin_url, artifact_url = compute_origin_urls(repo) - package = repo['Package'] - version = repo['Version'] + package = repo["Package"] + version = repo["Version"] return { - 'uid': f'{package}-{version}', - 'name': package, - 'full_name': repo['Title'], - 'version': version, - 'html_url': artifact_url, - 'origin_url': origin_url, - 'origin_type': 'tar', + "uid": f"{package}-{version}", + "name": package, + "full_name": repo["Title"], + "version": version, + "html_url": artifact_url, + "origin_url": origin_url, + "origin_type": "tar", } @@ -115,11 +121,10 @@ def read_cran_data() -> List[Mapping[str, str]]: """Execute r script to read cran listing. 
""" - filepath = pkg_resources.resource_filename('swh.lister.cran', - 'list_all_packages.R') - logger.debug('script list-all-packages.R path: %s', filepath) + filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") + logger.debug("script list-all-packages.R path: %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) - return json.loads(response.stdout.decode('utf-8')) + return json.loads(response.stdout.decode("utf-8")) def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]: @@ -132,8 +137,8 @@ def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]: the tuple project url, artifact url """ - package = repo['Package'] - version = repo['Version'] - origin_url = f'{CRAN_MIRROR}/package={package}' - artifact_url = f'{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz' + package = repo["Package"] + version = repo["Version"] + origin_url = f"{CRAN_MIRROR}/package={package}" + artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url diff --git a/swh/lister/cran/models.py b/swh/lister/cran/models.py index 3fe94d9..5c8dd5c 100644 --- a/swh/lister/cran/models.py +++ b/swh/lister/cran/models.py @@ -11,7 +11,8 @@ class CRANModel(ModelBase): """a CRAN repository representation """ - __tablename__ = 'cran_repo' + + __tablename__ = "cran_repo" uid = Column(String, primary_key=True) version = Column(String) diff --git a/swh/lister/cran/tasks.py b/swh/lister/cran/tasks.py index 74eef74..b541541 100644 --- a/swh/lister/cran/tasks.py +++ b/swh/lister/cran/tasks.py @@ -7,12 +7,12 @@ from celery import shared_task from swh.lister.cran.lister import CRANLister -@shared_task(name=__name__ + '.CRANListerTask') +@shared_task(name=__name__ + ".CRANListerTask") def list_cran(**lister_args): - '''Lister task for the CRAN registry''' + """Lister task for the CRAN registry""" return CRANLister(**lister_args).run() -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/cran/tests/conftest.py b/swh/lister/cran/tests/conftest.py index cce18ab..30d88c3 100644 --- a/swh/lister/cran/tests/conftest.py +++ b/swh/lister/cran/tests/conftest.py @@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa @pytest.fixture def lister_cran(swh_listers): - lister = swh_listers['cran'] + lister = swh_listers["cran"] # Add the load-deb-package in the scheduler backend - lister.scheduler.create_task_type({ - 'type': 'load-cran', - 'description': 'Load a CRAN package', - 'backend_name': 'swh.loader.package.cran.tasks.LoaderCRAN', - 'default_interval': '1 day', - }) + lister.scheduler.create_task_type( + { + "type": "load-cran", + "description": "Load a CRAN package", + "backend_name": "swh.loader.package.cran.tasks.LoaderCRAN", + "default_interval": "1 day", + } + ) return lister diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index 3b6847c..d7ce6a4 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -13,28 +13,25 @@ from swh.lister.cran.lister import compute_origin_urls, CRAN_MIRROR def test_cran_compute_origin_urls(): - pack = 'something' - vers = '0.0.1' - origin_url, artifact_url = compute_origin_urls({ - 'Package': pack, - 'Version': vers, - }) + pack = "something" + vers = "0.0.1" + origin_url, artifact_url = compute_origin_urls({"Package": pack, "Version": vers,}) - assert origin_url == f'{CRAN_MIRROR}/package={pack}' 
- assert artifact_url == f'{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz' + assert origin_url == f"{CRAN_MIRROR}/package={pack}" + assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz" def test_cran_compute_origin_urls_failure(): - for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]: + for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]: with pytest.raises(KeyError): compute_origin_urls(incomplete_repo) -@patch('swh.lister.cran.lister.read_cran_data') +@patch("swh.lister.cran.lister.read_cran_data") def test_cran_lister_cran(mock_cran, datadir, lister_cran): lister = lister_cran - with open(path.join(datadir, 'list-r-packages.json')) as f: + with open(path.join(datadir, "list-r-packages.json")) as f: data = json.loads(f.read()) mock_cran.return_value = data @@ -42,31 +39,33 @@ def test_cran_lister_cran(mock_cran, datadir, lister_cran): lister.run() - r = lister.scheduler.search_tasks(task_type='load-cran') + r = lister.scheduler.search_tasks(task_type="load-cran") assert len(r) == 6 for row in r: - assert row['type'] == 'load-cran' + assert row["type"] == "load-cran" # arguments check - args = row['arguments']['args'] + args = row["arguments"]["args"] assert len(args) == 0 # kwargs - kwargs = row['arguments']['kwargs'] + kwargs = row["arguments"]["kwargs"] assert len(kwargs) == 2 - assert set(kwargs.keys()) == {'url', 'artifacts'} + assert set(kwargs.keys()) == {"url", "artifacts"} - artifacts = kwargs['artifacts'] + artifacts = kwargs["artifacts"] assert len(artifacts) == 1 - assert set(artifacts[0].keys()) == {'url', 'version'} + assert set(artifacts[0].keys()) == {"url", "version"} - assert row['policy'] == 'oneshot' - assert row['retries_left'] == 3 + assert row["policy"] == "oneshot" + assert row["retries_left"] == 3 - origin_url = kwargs['url'] - record = lister.db_session \ - .query(lister.MODEL) \ - .filter(origin_url == origin_url).first() + origin_url = kwargs["url"] + record = ( + lister.db_session.query(lister.MODEL) + .filter(origin_url == origin_url) + .first() + ) assert record - assert record.uid == f'{record.name}-{record.version}' + assert record.uid == f"{record.name}-{record.version}" diff --git a/swh/lister/cran/tests/test_tasks.py b/swh/lister/cran/tests/test_tasks.py index 1a0b95a..9ff3d9f 100644 --- a/swh/lister/cran/tests/test_tasks.py +++ b/swh/lister/cran/tests/test_tasks.py @@ -2,22 +2,20 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.cran.tasks.ping') + res = swh_app.send_task("swh.lister.cran.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.cran.tasks.CRANLister') +@patch("swh.lister.cran.tasks.CRANLister") def test_lister(lister, swh_app, celery_session_worker): # setup the mocked CRANLister lister.return_value = lister lister.run.return_value = None - res = swh_app.send_task( - 'swh.lister.cran.tasks.CRANListerTask') + res = swh_app.send_task("swh.lister.cran.tasks.CRANListerTask") assert res res.wait() assert res.successful() diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py index e07a179..9e201b0 100644 --- a/swh/lister/debian/__init__.py +++ b/swh/lister/debian/__init__.py @@ -11,11 +11,13 @@ from typing import Any, List, Mapping logger = logging.getLogger(__name__) -def debian_init(db_engine, - override_conf: Mapping[str, Any] = {}, - distribution_name: str = 'Debian', - suites: List[str] = ['stretch', 
'buster', 'bullseye'], - components: List[str] = ['main', 'contrib', 'non-free']): +def debian_init( + db_engine, + override_conf: Mapping[str, Any] = {}, + distribution_name: str = "Debian", + suites: List[str] = ["stretch", "buster", "bullseye"], + components: List[str] = ["main", "contrib", "non-free"], +): """Initialize the debian data model. Args: @@ -28,30 +30,32 @@ def debian_init(db_engine, """ from swh.lister.debian.models import Distribution, Area from sqlalchemy.orm import sessionmaker + db_session = sessionmaker(bind=db_engine)() - distrib = db_session.query(Distribution) \ - .filter(Distribution.name == distribution_name) \ + distrib = ( + db_session.query(Distribution) + .filter(Distribution.name == distribution_name) .one_or_none() + ) if distrib is None: distrib = Distribution( - name=distribution_name, type='deb', - mirror_uri='http://deb.debian.org/debian/' + name=distribution_name, + type="deb", + mirror_uri="http://deb.debian.org/debian/", ) db_session.add(distrib) # Check the existing - existing_area = db_session.query(Area) \ - .filter(Area.distribution == distrib) \ - .all() + existing_area = db_session.query(Area).filter(Area.distribution == distrib).all() existing_area = set([a.name for a in existing_area]) - logger.debug('Area already known: %s', ', '.join(existing_area)) + logger.debug("Area already known: %s", ", ".join(existing_area)) # Create only the new ones for suite in suites: for component in components: - area_name = f'{suite}/{component}' + area_name = f"{suite}/{component}" if area_name in existing_area: logger.debug("Area '%s' already set, skipping", area_name) continue @@ -64,7 +68,10 @@ def debian_init(db_engine, def register() -> Mapping[str, Any]: from .lister import DebianLister - return {'models': [DebianLister.MODEL], - 'lister': DebianLister, - 'task_modules': ['%s.tasks' % __name__], - 'init': debian_init} + + return { + "models": [DebianLister.MODEL], + "lister": DebianLister, + "task_modules": ["%s.tasks" % __name__], + "init": debian_init, + } diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index b5c4c50..7355fc7 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -17,7 +17,10 @@ from typing import Mapping, Optional, Dict, Any from requests import Response from swh.lister.debian.models import ( - AreaSnapshot, Distribution, DistributionSnapshot, Package, + AreaSnapshot, + Distribution, + DistributionSnapshot, + Package, TempPackage, ) @@ -25,9 +28,9 @@ from swh.lister.core.lister_base import ListerBase, FetchError from swh.lister.core.lister_transports import ListerHttpTransport decompressors = { - 'gz': lambda f: gzip.GzipFile(fileobj=f), - 'bz2': bz2.BZ2File, - 'xz': lzma.LZMAFile, + "gz": lambda f: gzip.GzipFile(fileobj=f), + "bz2": bz2.BZ2File, + "xz": lzma.LZMAFile, } @@ -37,12 +40,15 @@ logger = logging.getLogger(__name__) class DebianLister(ListerHttpTransport, ListerBase): MODEL = Package PATH_TEMPLATE = None - LISTER_NAME = 'debian' - instance = 'debian' + LISTER_NAME = "debian" + instance = "debian" - def __init__(self, distribution: str = 'Debian', - date: Optional[datetime.datetime] = None, - override_config: Mapping = {}): + def __init__( + self, + distribution: str = "Debian", + date: Optional[datetime.datetime] = None, + override_config: Mapping = {}, + ): """Initialize the debian lister for a given distribution at a given date. 
@@ -55,9 +61,10 @@ class DebianLister(ListerHttpTransport, ListerBase): """ ListerHttpTransport.__init__(self, url="notused") ListerBase.__init__(self, override_config=override_config) - self.distribution = override_config.get('distribution', distribution) - self.date = override_config.get('date', date) or datetime.datetime.now( - tz=datetime.timezone.utc) + self.distribution = override_config.get("distribution", distribution) + self.date = override_config.get("date", date) or datetime.datetime.now( + tz=datetime.timezone.utc + ) def transport_request(self, identifier) -> Response: """Subvert ListerHttpTransport.transport_request, to try several @@ -83,9 +90,7 @@ class DebianLister(ListerHttpTransport, ListerBase): if response.status_code == 200: break else: - raise FetchError( - "Could not retrieve index for %s" % self.area - ) + raise FetchError("Could not retrieve index for %s" % self.area) self.decompressor = decompressors.get(compression) return response @@ -99,7 +104,7 @@ class DebianLister(ListerHttpTransport, ListerBase): # Enable streaming to allow wrapping the response in the decompressor # in transport_response_simplified. params = super().request_params(identifier) - params['stream'] = True + params["stream"] = True return params def transport_response_simplified(self, response): @@ -118,22 +123,22 @@ class DebianLister(ListerHttpTransport, ListerBase): files = defaultdict(dict) for field in src_pkg._multivalued_fields: - if field.startswith('checksums-'): - sum_name = field[len('checksums-'):] + if field.startswith("checksums-"): + sum_name = field[len("checksums-") :] else: - sum_name = 'md5sum' + sum_name = "md5sum" if field in src_pkg: for entry in src_pkg[field]: - name = entry['name'] - files[name]['name'] = entry['name'] - files[name]['size'] = int(entry['size'], 10) + name = entry["name"] + files[name]["name"] = entry["name"] + files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] yield { - 'name': src_pkg['Package'], - 'version': src_pkg['Version'], - 'directory': src_pkg['Directory'], - 'files': files, + "name": src_pkg["Package"], + "version": src_pkg["Version"], + "directory": src_pkg["Directory"], + "files": files, } def inject_repo_data_into_db(self, models_list): @@ -149,13 +154,11 @@ class DebianLister(ListerHttpTransport, ListerBase): area_id = self.area.id for model in models_list: - name = model['name'] - version = model['version'] - temp_packages.append({ - 'area_id': area_id, - 'name': name, - 'version': version, - }) + name = model["name"] + version = model["version"] + temp_packages.append( + {"area_id": area_id, "name": name, "version": version,} + ) by_name_version[name, version] = model # Add all the listed packages to a temporary table @@ -172,15 +175,18 @@ class DebianLister(ListerHttpTransport, ListerBase): ) # Filter out the packages that already exist in the main Package table - new_packages = self.db_session\ - .query(TempPackage)\ - .options(load_only('name', 'version'))\ - .filter(~exists_tmp_pkg(self.db_session, Package))\ - .all() + new_packages = ( + self.db_session.query(TempPackage) + .options(load_only("name", "version")) + .filter(~exists_tmp_pkg(self.db_session, Package)) + .all() + ) - self.old_area_packages = self.db_session.query(Package).filter( - exists_tmp_pkg(self.db_session, TempPackage) - ).all() + self.old_area_packages = ( + self.db_session.query(Package) + .filter(exists_tmp_pkg(self.db_session, TempPackage)) + .all() + ) self.db_session.execute(DropTable(TempPackage.__table__)) @@ -188,8 
+194,7 @@ class DebianLister(ListerHttpTransport, ListerBase): for package in new_packages: model = by_name_version[package.name, package.version] - added_packages.append(Package(area=self.area, - **model)) + added_packages.append(Package(area=self.area, **model)) self.db_session.add_all(added_packages) return added_packages @@ -210,26 +215,26 @@ class DebianLister(ListerHttpTransport, ListerBase): """Run the lister for a given (distribution, area) tuple. """ - distribution = self.db_session\ - .query(Distribution)\ - .options(joinedload(Distribution.areas))\ - .filter(Distribution.name == self.distribution)\ - .one_or_none() + distribution = ( + self.db_session.query(Distribution) + .options(joinedload(Distribution.areas)) + .filter(Distribution.name == self.distribution) + .one_or_none() + ) if not distribution: - logger.error("Distribution %s is not registered" % - self.distribution) - return {'status': 'failed'} + logger.error("Distribution %s is not registered" % self.distribution) + return {"status": "failed"} - if not distribution.type == 'deb': - logger.error("Distribution %s is not a Debian derivative" % - distribution) - return {'status': 'failed'} + if not distribution.type == "deb": + logger.error("Distribution %s is not a Debian derivative" % distribution) + return {"status": "failed"} date = self.date - logger.debug('Creating snapshot for distribution %s on date %s' % - (distribution, date)) + logger.debug( + "Creating snapshot for distribution %s on date %s" % (distribution, date) + ) snapshot = DistributionSnapshot(date=date, distribution=distribution) @@ -241,7 +246,7 @@ class DebianLister(ListerHttpTransport, ListerBase): self.area = area - logger.debug('Processing area %s' % area) + logger.debug("Processing area %s" % area) _, new_area_packages = self.ingest_data(None) area_snapshot = AreaSnapshot(snapshot=snapshot, area=area) @@ -253,4 +258,4 @@ class DebianLister(ListerHttpTransport, ListerBase): self.db_session.commit() - return {'status': 'eventful'} + return {"status": "eventful"} diff --git a/swh/lister/debian/models.py b/swh/lister/debian/models.py index 1a7058f..9335b82 100644 --- a/swh/lister/debian/models.py +++ b/swh/lister/debian/models.py @@ -34,78 +34,66 @@ from swh.lister.core.models import SQLBase class Distribution(SQLBase): """A distribution (e.g. 
Debian, Ubuntu, Fedora, ...)""" - __tablename__ = 'distribution' + + __tablename__ = "distribution" id = Column(Integer, primary_key=True) name = Column(String, unique=True, nullable=False) - type = Column(Enum('deb', 'rpm', name='distribution_types'), - nullable=False) + type = Column(Enum("deb", "rpm", name="distribution_types"), nullable=False) mirror_uri = Column(String, nullable=False) - areas = relationship('Area', back_populates='distribution') + areas = relationship("Area", back_populates="distribution") def origin_for_package(self, package_name: str) -> str: """Return the origin url for the given package """ - return '%s://%s/packages/%s' % (self.type, self.name, package_name) + return "%s://%s/packages/%s" % (self.type, self.name, package_name) def __repr__(self): - return 'Distribution(%s (%s) on %s)' % ( - self.name, - self.type, - self.mirror_uri, - ) + return "Distribution(%s (%s) on %s)" % (self.name, self.type, self.mirror_uri,) class Area(SQLBase): - __tablename__ = 'area' - __table_args__ = ( - UniqueConstraint('distribution_id', 'name'), - ) + __tablename__ = "area" + __table_args__ = (UniqueConstraint("distribution_id", "name"),) id = Column(Integer, primary_key=True) - distribution_id = Column(Integer, ForeignKey('distribution.id'), - nullable=False) + distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False) name = Column(String, nullable=False) active = Column(Boolean, nullable=False, default=True) - distribution = relationship('Distribution', back_populates='areas') + distribution = relationship("Distribution", back_populates="areas") def index_uris(self): """Get possible URIs for this component's package index""" - if self.distribution.type == 'deb': - compression_exts = ('xz', 'bz2', 'gz', None) - base_uri = '%s/dists/%s/source/Sources' % ( + if self.distribution.type == "deb": + compression_exts = ("xz", "bz2", "gz", None) + base_uri = "%s/dists/%s/source/Sources" % ( self.distribution.mirror_uri, self.name, ) for ext in compression_exts: if ext: - yield (base_uri + '.' + ext, ext) + yield (base_uri + "." 
+ ext, ext) else: yield (base_uri, None) else: raise NotImplementedError( - 'Do not know how to build index URI for Distribution type %s' % - self.distribution.type + "Do not know how to build index URI for Distribution type %s" + % self.distribution.type ) def __repr__(self): - return 'Area(%s of %s)' % ( - self.name, - self.distribution.name, - ) + return "Area(%s of %s)" % (self.name, self.distribution.name,) class Package(SQLBase): - __tablename__ = 'package' - __table_args__ = ( - UniqueConstraint('area_id', 'name', 'version'), - ) + __tablename__ = "package" + __table_args__ = (UniqueConstraint("area_id", "name", "version"),) id = Column(Integer, primary_key=True) - area_id = Column(Integer, ForeignKey('area.id'), nullable=False) + area_id = Column(Integer, ForeignKey("area.id"), nullable=False) name = Column(String, nullable=False) version = Column(String, nullable=False) directory = Column(String, nullable=False) @@ -116,7 +104,7 @@ class Package(SQLBase): revision_id = Column(LargeBinary(20)) - area = relationship('Area') + area = relationship("Area") @property def distribution(self): @@ -125,42 +113,38 @@ class Package(SQLBase): def fetch_uri(self, filename): """Get the URI to fetch the `filename` file associated with the package""" - if self.distribution.type == 'deb': - return '%s/%s/%s' % ( + if self.distribution.type == "deb": + return "%s/%s/%s" % ( self.distribution.mirror_uri, self.directory, filename, ) else: raise NotImplementedError( - 'Do not know how to build fetch URI for Distribution type %s' % - self.distribution.type + "Do not know how to build fetch URI for Distribution type %s" + % self.distribution.type ) def loader_dict(self): ret = { - 'id': self.id, - 'name': self.name, - 'version': self.version, + "id": self.id, + "name": self.name, + "version": self.version, } if self.revision_id: - ret['revision_id'] = binascii.hexlify(self.revision_id).decode() + ret["revision_id"] = binascii.hexlify(self.revision_id).decode() else: - files = { - name: checksums.copy() - for name, checksums in self.files.items() - } + files = {name: checksums.copy() for name, checksums in self.files.items()} for name in files: - files[name]['uri'] = self.fetch_uri(name) + files[name]["uri"] = self.fetch_uri(name) - ret.update({ - 'revision_id': None, - 'files': files, - }) + ret.update( + {"revision_id": None, "files": files,} + ) return ret def __repr__(self): - return 'Package(%s_%s of %s %s)' % ( + return "Package(%s_%s of %s %s)" % ( self.name, self.version, self.distribution.name, @@ -169,37 +153,36 @@ class Package(SQLBase): class DistributionSnapshot(SQLBase): - __tablename__ = 'distribution_snapshot' + __tablename__ = "distribution_snapshot" id = Column(Integer, primary_key=True) date = Column(DateTime, nullable=False, index=True) - distribution_id = Column(Integer, - ForeignKey('distribution.id'), - nullable=False) + distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False) - distribution = relationship('Distribution') - areas = relationship('AreaSnapshot', back_populates='snapshot') + distribution = relationship("Distribution") + areas = relationship("AreaSnapshot", back_populates="snapshot") - def task_for_package(self, package_name: str, - package_versions: Mapping) -> Mapping[str, Any]: + def task_for_package( + self, package_name: str, package_versions: Mapping + ) -> Mapping[str, Any]: """Return the task dictionary for the given list of package versions """ origin_url = self.distribution.origin_for_package(package_name) return { - 'policy': 
'oneshot', - 'type': 'load-%s-package' % self.distribution.type, - 'next_run': datetime.datetime.now(tz=datetime.timezone.utc), - 'arguments': { - 'args': [], - 'kwargs': { - 'url': origin_url, - 'date': self.date.isoformat(), - 'packages': package_versions, + "policy": "oneshot", + "type": "load-%s-package" % self.distribution.type, + "next_run": datetime.datetime.now(tz=datetime.timezone.utc), + "arguments": { + "args": [], + "kwargs": { + "url": origin_url, + "date": self.date.isoformat(), + "packages": package_versions, }, }, - 'retries_left': 3, + "retries_left": 3, } def get_packages(self): @@ -207,41 +190,38 @@ class DistributionSnapshot(SQLBase): for area_snapshot in self.areas: area_name = area_snapshot.area.name for package in area_snapshot.packages: - ref_name = '%s/%s' % (area_name, package.version) + ref_name = "%s/%s" % (area_name, package.version) packages[package.name][ref_name] = package.loader_dict() return packages area_snapshot_package_assoc = Table( - 'area_snapshot_package', SQLBase.metadata, - Column('area_snapshot_id', Integer, ForeignKey('area_snapshot.id'), - nullable=False), - Column('package_id', Integer, ForeignKey('package.id'), - nullable=False), + "area_snapshot_package", + SQLBase.metadata, + Column("area_snapshot_id", Integer, ForeignKey("area_snapshot.id"), nullable=False), + Column("package_id", Integer, ForeignKey("package.id"), nullable=False), ) class AreaSnapshot(SQLBase): - __tablename__ = 'area_snapshot' + __tablename__ = "area_snapshot" id = Column(Integer, primary_key=True) - snapshot_id = Column(Integer, - ForeignKey('distribution_snapshot.id'), - nullable=False) - area_id = Column(Integer, - ForeignKey('area.id'), - nullable=False) + snapshot_id = Column( + Integer, ForeignKey("distribution_snapshot.id"), nullable=False + ) + area_id = Column(Integer, ForeignKey("area.id"), nullable=False) - snapshot = relationship('DistributionSnapshot', back_populates='areas') - area = relationship('Area') - packages = relationship('Package', secondary=area_snapshot_package_assoc) + snapshot = relationship("DistributionSnapshot", back_populates="areas") + area = relationship("Area") + packages = relationship("Package", secondary=area_snapshot_package_assoc) class TempPackage(SQLBase): - __tablename__ = 'temp_package' + __tablename__ = "temp_package" __table_args__ = { - 'prefixes': ['TEMPORARY'], + "prefixes": ["TEMPORARY"], } id = Column(Integer, primary_key=True) diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py index 04d1297..3099e61 100644 --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -7,12 +7,12 @@ from celery import shared_task from .lister import DebianLister -@shared_task(name=__name__ + '.DebianListerTask') +@shared_task(name=__name__ + ".DebianListerTask") def list_debian_distribution(distribution, **lister_args): - '''List a Debian distribution''' + """List a Debian distribution""" return DebianLister(distribution=distribution, **lister_args).run() -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/debian/tests/conftest.py b/swh/lister/debian/tests/conftest.py index 8bbc443..4b2ab4c 100644 --- a/swh/lister/debian/tests/conftest.py +++ b/swh/lister/debian/tests/conftest.py @@ -16,20 +16,20 @@ from swh.lister.debian import debian_init @pytest.fixture def lister_debian(swh_listers): - lister = swh_listers['debian'] + lister = swh_listers["debian"] # Initialize the debian data model - debian_init( - lister.db_engine, 
suites=['stretch'], components=['main', 'contrib'] - ) + debian_init(lister.db_engine, suites=["stretch"], components=["main", "contrib"]) # Add the load-deb-package in the scheduler backend - lister.scheduler.create_task_type({ - 'type': 'load-deb-package', - 'description': 'Load a Debian package', - 'backend_name': 'swh.loader.debian.tasks.LoaderDebianPackage', - 'default_interval': '1 day', - }) + lister.scheduler.create_task_type( + { + "type": "load-deb-package", + "description": "Load a Debian package", + "backend_name": "swh.loader.debian.tasks.LoaderDebianPackage", + "default_interval": "1 day", + } + ) return lister @@ -40,12 +40,10 @@ def sqlalchemy_engine(postgresql_proc): pg_port = postgresql_proc.port pg_user = postgresql_proc.user - pg_db = 'sqlalchemy-tests' + pg_db = "sqlalchemy-tests" - url = f'postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_db}' - with DatabaseJanitor( - pg_user, pg_host, pg_port, pg_db, postgresql_proc.version - ): + url = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_db}" + with DatabaseJanitor(pg_user, pg_host, pg_port, pg_db, postgresql_proc.version): engine = create_engine(url) yield engine engine.dispose() diff --git a/swh/lister/debian/tests/test_init.py b/swh/lister/debian/tests/test_init.py index 928cfa6..860c4ec 100644 --- a/swh/lister/debian/tests/test_init.py +++ b/swh/lister/debian/tests/test_init.py @@ -17,29 +17,37 @@ def engine(session): def test_debian_init_step(engine, session): - distribution_name = 'KaliLinux' + distribution_name = "KaliLinux" - distrib = session.query(Distribution) \ - .filter(Distribution.name == distribution_name) \ + distrib = ( + session.query(Distribution) + .filter(Distribution.name == distribution_name) .one_or_none() + ) assert distrib is None all_area = session.query(Area).all() assert all_area == [] - suites = ['wheezy', 'jessie'] - components = ['main', 'contrib'] + suites = ["wheezy", "jessie"] + components = ["main", "contrib"] - debian_init(engine, distribution_name=distribution_name, - suites=suites, components=components) - distrib = session.query(Distribution) \ - .filter(Distribution.name == distribution_name) \ + debian_init( + engine, + distribution_name=distribution_name, + suites=suites, + components=components, + ) + distrib = ( + session.query(Distribution) + .filter(Distribution.name == distribution_name) .one_or_none() + ) assert distrib is not None assert distrib.name == distribution_name - assert distrib.type == 'deb' - assert distrib.mirror_uri == 'http://deb.debian.org/debian/' + assert distrib.type == "deb" + assert distrib.mirror_uri == "http://deb.debian.org/debian/" all_area = session.query(Area).all() assert len(all_area) == 2 * 2, "2 suites * 2 components per suite" @@ -47,7 +55,7 @@ def test_debian_init_step(engine, session): expected_area_names = [] for suite in suites: for component in components: - expected_area_names.append(f'{suite}/{component}') + expected_area_names.append(f"{suite}/{component}") for area in all_area: area.id = None @@ -56,12 +64,16 @@ def test_debian_init_step(engine, session): # check idempotency (on exact same call) - debian_init(engine, distribution_name=distribution_name, - suites=suites, components=components) + debian_init( + engine, + distribution_name=distribution_name, + suites=suites, + components=components, + ) - distribs = session.query(Distribution) \ - .filter(Distribution.name == distribution_name) \ - .all() + distribs = ( + session.query(Distribution).filter(Distribution.name == distribution_name).all() + ) assert len(distribs) == 1 
distrib = distribs[0] @@ -70,8 +82,12 @@ def test_debian_init_step(engine, session): assert len(all_area) == 2 * 2, "2 suites * 2 components per suite" # Add a new suite - debian_init(engine, distribution_name=distribution_name, - suites=['lenny'], components=components) + debian_init( + engine, + distribution_name=distribution_name, + suites=["lenny"], + components=components, + ) all_area = [a.name for a in session.query(Area).all()] assert len(all_area) == (2 + 1) * 2, "3 suites * 2 components per suite" diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py index 773289e..8694d8d 100644 --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -16,21 +16,21 @@ def test_lister_debian(lister_debian, datadir, requests_mock_datadir): # Run the lister lister_debian.run() - r = lister_debian.scheduler.search_tasks(task_type='load-deb-package') + r = lister_debian.scheduler.search_tasks(task_type="load-deb-package") assert len(r) == 151 for row in r: - assert row['type'] == 'load-deb-package' + assert row["type"] == "load-deb-package" # arguments check - args = row['arguments']['args'] + args = row["arguments"]["args"] assert len(args) == 0 # kwargs - kwargs = row['arguments']['kwargs'] - assert set(kwargs.keys()) == {'url', 'date', 'packages'} + kwargs = row["arguments"]["kwargs"] + assert set(kwargs.keys()) == {"url", "date", "packages"} - logger.debug('kwargs: %s', kwargs) - assert isinstance(kwargs['url'], str) + logger.debug("kwargs: %s", kwargs) + assert isinstance(kwargs["url"], str) - assert row['policy'] == 'oneshot' - assert row['priority'] is None + assert row["policy"] == "oneshot" + assert row["priority"] is None diff --git a/swh/lister/debian/tests/test_models.py b/swh/lister/debian/tests/test_models.py index 701d573..43d1555 100644 --- a/swh/lister/debian/tests/test_models.py +++ b/swh/lister/debian/tests/test_models.py @@ -10,13 +10,9 @@ from swh.lister.debian.models import Distribution, Area def test_area_index_uris_deb(session): d = Distribution( - name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian' - ) - a = Area( - distribution=d, - name='unstable/main', - active=True, + name="Debian", type="deb", mirror_uri="http://deb.debian.org/debian" ) + a = Area(distribution=d, name="unstable/main", active=True,) session.add_all([d, a]) session.commit() @@ -26,14 +22,9 @@ def test_area_index_uris_deb(session): def test_area_index_uris_rpm(session): d = Distribution( - name='CentOS', type='rpm', - mirror_uri='http://centos.mirrors.proxad.net/' - ) - a = Area( - distribution=d, - name='8', - active=True, + name="CentOS", type="rpm", mirror_uri="http://centos.mirrors.proxad.net/" ) + a = Area(distribution=d, name="8", active=True,) session.add_all([d, a]) session.commit() diff --git a/swh/lister/debian/tests/test_tasks.py b/swh/lister/debian/tests/test_tasks.py index 7a6e97f..4fb08bd 100644 --- a/swh/lister/debian/tests/test_tasks.py +++ b/swh/lister/debian/tests/test_tasks.py @@ -7,25 +7,23 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.debian.tasks.ping') + res = swh_app.send_task("swh.lister.debian.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.debian.tasks.DebianLister') +@patch("swh.lister.debian.tasks.DebianLister") def test_lister(lister, swh_app, celery_session_worker): # setup the mocked DebianLister lister.return_value = lister 
lister.run.return_value = None - res = swh_app.send_task( - 'swh.lister.debian.tasks.DebianListerTask', ('stretch',)) + res = swh_app.send_task("swh.lister.debian.tasks.DebianListerTask", ("stretch",)) assert res res.wait() assert res.successful() - lister.assert_called_once_with(distribution='stretch') + lister.assert_called_once_with(distribution="stretch") lister.run.assert_called_once_with() diff --git a/swh/lister/debian/utils.py b/swh/lister/debian/utils.py index 19a3e97..f6c4ca8 100644 --- a/swh/lister/debian/utils.py +++ b/swh/lister/debian/utils.py @@ -11,19 +11,18 @@ from swh.lister.debian.lister import DebianLister @click.group() -@click.option('--verbose/--no-verbose', default=False) +@click.option("--verbose/--no-verbose", default=False) @click.pass_context def cli(ctx, verbose): - ctx.obj['lister'] = DebianLister() + ctx.obj["lister"] = DebianLister() if verbose: loglevel = logging.DEBUG - logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) + logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO) else: loglevel = logging.INFO logging.basicConfig( - format='%(asctime)s %(process)d %(levelname)s %(message)s', - level=loglevel, + format="%(asctime)s %(process)d %(levelname)s %(message)s", level=loglevel, ) @@ -31,23 +30,24 @@ def cli(ctx, verbose): @click.pass_context def create_schema(ctx): """Create the schema from the models""" - SQLBase.metadata.create_all(ctx.obj['lister'].db_engine) + SQLBase.metadata.create_all(ctx.obj["lister"].db_engine) @cli.command() -@click.option('--name', help='The name of the distribution') -@click.option('--type', help='The type of distribution') -@click.option('--mirror-uri', help='The URL to the mirror of the distribution') -@click.option('--area', help='The areas for the distribution', - multiple=True) +@click.option("--name", help="The name of the distribution") +@click.option("--type", help="The type of distribution") +@click.option("--mirror-uri", help="The URL to the mirror of the distribution") +@click.option("--area", help="The areas for the distribution", multiple=True) @click.pass_context def create_distribution(ctx, name, type, mirror_uri, area): to_add = [] - db_session = ctx.obj['lister'].db_session - d = db_session.query(Distribution)\ - .filter(Distribution.name == name)\ - .filter(Distribution.type == type)\ - .one_or_none() + db_session = ctx.obj["lister"].db_session + d = ( + db_session.query(Distribution) + .filter(Distribution.name == name) + .filter(Distribution.type == type) + .one_or_none() + ) if not d: d = Distribution(name=name, type=type, mirror_uri=mirror_uri) @@ -56,10 +56,12 @@ def create_distribution(ctx, name, type, mirror_uri, area): for area_name in area: a = None if d.id: - a = db_session.query(Area)\ - .filter(Area.distribution == d)\ - .filter(Area.name == area_name)\ - .one_or_none() + a = ( + db_session.query(Area) + .filter(Area.distribution == d) + .filter(Area.name == area_name) + .one_or_none() + ) if not a: a = Area(name=area_name, distribution=d) @@ -70,12 +72,12 @@ def create_distribution(ctx, name, type, mirror_uri, area): @cli.command() -@click.option('--name', help='The name of the distribution') +@click.option("--name", help="The name of the distribution") @click.pass_context def list_distribution(ctx, name): """List the distribution""" - ctx.obj['lister'].run(name) + ctx.obj["lister"].run(name) -if __name__ == '__main__': +if __name__ == "__main__": cli(obj={}) diff --git a/swh/lister/github/__init__.py b/swh/lister/github/__init__.py index 13f4688..1704bc8 100644 --- 
a/swh/lister/github/__init__.py +++ b/swh/lister/github/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import GitHubModel from .lister import GitHubLister - return {'models': [GitHubModel], - 'lister': GitHubLister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [GitHubModel], + "lister": GitHubLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 066b884..0f3b71d 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -14,60 +14,57 @@ from requests import Response class GitHubLister(IndexingHttpLister): - PATH_TEMPLATE = '/repositories?since=%d' + PATH_TEMPLATE = "/repositories?since=%d" MODEL = GitHubModel - DEFAULT_URL = 'https://api.github.com' - API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)') - LISTER_NAME = 'github' - instance = 'github' # There is only 1 instance of such lister + DEFAULT_URL = "https://api.github.com" + API_URL_INDEX_RE = re.compile(r"^.*/repositories\?since=(\d+)") + LISTER_NAME = "github" + instance = "github" # There is only 1 instance of such lister default_min_bound = 0 # type: Any def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: return { - 'uid': repo['id'], - 'indexable': repo['id'], - 'name': repo['name'], - 'full_name': repo['full_name'], - 'html_url': repo['html_url'], - 'origin_url': repo['html_url'], - 'origin_type': 'git', - 'fork': repo['fork'], + "uid": repo["id"], + "indexable": repo["id"], + "name": repo["name"], + "full_name": repo["full_name"], + "html_url": repo["html_url"], + "origin_url": repo["html_url"], + "origin_type": "git", + "fork": repo["fork"], } def transport_quota_check(self, response: Response) -> Tuple[bool, int]: - x_rate_limit_remaining = response.headers.get('X-RateLimit-Remaining') + x_rate_limit_remaining = response.headers.get("X-RateLimit-Remaining") if not x_rate_limit_remaining: return False, 0 reqs_remaining = int(x_rate_limit_remaining) if response.status_code == 403 and reqs_remaining == 0: - delay = int(response.headers['Retry-After']) + delay = int(response.headers["Retry-After"]) return True, delay return False, 0 - def get_next_target_from_response(self, - response: Response) -> Optional[int]: - if 'next' in response.links: - next_url = response.links['next']['url'] - return int( - self.API_URL_INDEX_RE.match(next_url).group(1)) # type: ignore + def get_next_target_from_response(self, response: Response) -> Optional[int]: + if "next" in response.links: + next_url = response.links["next"]["url"] + return int(self.API_URL_INDEX_RE.match(next_url).group(1)) # type: ignore return None - def transport_response_simplified(self, response: Response - ) -> List[Dict[str, Any]]: + def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: repos = response.json() - return [self.get_model_from_repo(repo) - for repo in repos if repo and 'id' in repo] + return [ + self.get_model_from_repo(repo) for repo in repos if repo and "id" in repo + ] def request_headers(self) -> Dict[str, Any]: """(Override) Set requests headers to send when querying the GitHub API """ headers = super().request_headers() - headers['Accept'] = 'application/vnd.github.v3+json' + headers["Accept"] = "application/vnd.github.v3+json" return headers - def disable_deleted_repo_tasks(self, index: int, - next_index: int, keep_these: int): + def disable_deleted_repo_tasks(self, index: int, next_index: int, keep_these: int): """ (Overrides) Fix provided index value to avoid 
erroneously disabling some scheduler tasks """ @@ -75,5 +72,4 @@ class GitHubLister(IndexingHttpLister): # parameter, so increment the index to avoid disabling the latest # created task when processing a new repositories page returned by # the Github API - return super().disable_deleted_repo_tasks(index + 1, next_index, - keep_these) + return super().disable_deleted_repo_tasks(index + 1, next_index, keep_these) diff --git a/swh/lister/github/models.py b/swh/lister/github/models.py index 47df1a3..58de011 100644 --- a/swh/lister/github/models.py +++ b/swh/lister/github/models.py @@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase class GitHubModel(IndexingModelBase): """a GitHub repository""" - __tablename__ = 'github_repo' + + __tablename__ = "github_repo" uid = Column(Integer, primary_key=True) indexable = Column(Integer, index=True) diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py index 1b9f37e..e9d82a4 100644 --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -11,20 +11,20 @@ from swh.lister.github.lister import GitHubLister GROUP_SPLIT = 10000 -@shared_task(name=__name__ + '.IncrementalGitHubLister') +@shared_task(name=__name__ + ".IncrementalGitHubLister") def list_github_incremental(**lister_args): - 'Incremental update of GitHub' + "Incremental update of GitHub" lister = GitHubLister(**lister_args) return lister.run(min_bound=lister.db_last_index(), max_bound=None) -@shared_task(name=__name__ + '.RangeGitHubLister') +@shared_task(name=__name__ + ".RangeGitHubLister") def _range_github_lister(start, end, **lister_args): lister = GitHubLister(**lister_args) return lister.run(min_bound=start, max_bound=end) -@shared_task(name=__name__ + '.FullGitHubRelister', bind=True) +@shared_task(name=__name__ + ".FullGitHubRelister", bind=True) def list_github_full(self, split=None, **lister_args): """Full update of GitHub @@ -34,20 +34,21 @@ def list_github_full(self, split=None, **lister_args): lister = GitHubLister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) if not ranges: - self.log.info('Nothing to list') + self.log.info("Nothing to list") return random.shuffle(ranges) - promise = group(_range_github_lister.s(minv, maxv, **lister_args) - for minv, maxv in ranges)() - self.log.debug('%s OK (spawned %s subtasks)' % (self.name, len(ranges))) + promise = group( + _range_github_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges + )() + self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(ranges))) try: promise.save() # so that we can restore the GroupResult in tests except (NotImplementedError, AttributeError): - self.log.info('Unable to call save_group with current result backend.') + self.log.info("Unable to call save_group with current result backend.") # FIXME: what to do in terms of return here? 
     return promise.id
 
 
-@shared_task(name=__name__ + '.ping')
+@shared_task(name=__name__ + ".ping")
 def _ping():
-    return 'OK'
+    return "OK"
diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py
index c0b2711..f33d721 100644
--- a/swh/lister/github/tests/test_lister.py
+++ b/swh/lister/github/tests/test_lister.py
@@ -14,65 +14,70 @@ from swh.lister.github.lister import GitHubLister
 
 class GitHubListerTester(HttpListerTester, unittest.TestCase):
     Lister = GitHubLister
-    test_re = re.compile(r'/repositories\?since=([^?&]+)')
-    lister_subdir = 'github'
-    good_api_response_file = 'data/https_api.github.com/first_response.json'
-    bad_api_response_file = 'data/https_api.github.com/empty_response.json'
+    test_re = re.compile(r"/repositories\?since=([^?&]+)")
+    lister_subdir = "github"
+    good_api_response_file = "data/https_api.github.com/first_response.json"
+    bad_api_response_file = "data/https_api.github.com/empty_response.json"
     first_index = 0
     last_index = 369
     entries_per_page = 100
     convert_type = int
 
     def response_headers(self, request):
-        headers = {'X-RateLimit-Remaining': '1'}
+        headers = {"X-RateLimit-Remaining": "1"}
         if self.request_index(request) == self.first_index:
-            headers.update({
-                'Link': '<https://api.github.com/repositories?since=%s>;'
-                ' rel="next",'
-                '<https://api.github.com/repositories{?since}>;'
-                ' rel="first"' % self.last_index
-            })
+            headers.update(
+                {
+                    "Link": "<https://api.github.com/repositories?since=%s>;"
+                    ' rel="next",'
+                    "<https://api.github.com/repositories{?since}>;"
+                    ' rel="first"' % self.last_index
+                }
+            )
         else:
-            headers.update({
-                'Link': '<https://api.github.com/repositories{?since}>;'
-                ' rel="first"'
-            })
+            headers.update(
+                {
+                    "Link": "<https://api.github.com/repositories{?since}>;"
+                    ' rel="first"'
+                }
+            )
         return headers
 
     def mock_rate_quota(self, n, request, context):
         self.rate_limit += 1
         context.status_code = 403
-        context.headers['X-RateLimit-Remaining'] = '0'
-        context.headers['Retry-After'] = '1'  # 1 second
+        context.headers["X-RateLimit-Remaining"] = "0"
+        context.headers["Retry-After"] = "1"  # 1 second
         return '{"error":"dummy"}'
 
     @requests_mock.Mocker()
     def test_scheduled_tasks(self, http_mocker):
         self.scheduled_tasks_test(
-            'data/https_api.github.com/next_response.json', 876, http_mocker)
+            "data/https_api.github.com/next_response.json", 876, http_mocker
+        )
 
 
 def test_lister_github(swh_listers, requests_mock_datadir):
     """Simple github listing should create scheduled tasks
 
     """
-    lister = swh_listers['github']
+    lister = swh_listers["github"]
 
     lister.run()
 
-    r = lister.scheduler.search_tasks(task_type='load-git')
+    r = lister.scheduler.search_tasks(task_type="load-git")
     assert len(r) == 100
 
     for row in r:
-        assert row['type'] == 'load-git'
+        assert row["type"] == "load-git"
         # arguments check
-        args = row['arguments']['args']
+        args = row["arguments"]["args"]
         assert len(args) == 0
 
         # kwargs
-        kwargs = row['arguments']['kwargs']
-        url = kwargs['url']
-        assert url.startswith('https://github.com')
+        kwargs = row["arguments"]["kwargs"]
+        url = kwargs["url"]
+        assert url.startswith("https://github.com")
 
-        assert row['policy'] == 'recurring'
-        assert row['priority'] is None
+        assert row["policy"] == "recurring"
+        assert row["priority"] is None
diff --git a/swh/lister/github/tests/test_tasks.py b/swh/lister/github/tests/test_tasks.py
index c652404..721d88d 100644
--- a/swh/lister/github/tests/test_tasks.py
+++ b/swh/lister/github/tests/test_tasks.py
@@ -5,23 +5,21 @@ from unittest.mock import patch
 
 
 def test_ping(swh_app, celery_session_worker):
-    res = swh_app.send_task(
-        'swh.lister.github.tasks.ping')
+    res = swh_app.send_task("swh.lister.github.tasks.ping")
     assert res
     res.wait()
     assert res.successful()
-    assert res.result == 'OK'
+    assert res.result == "OK"
 
 
-@patch('swh.lister.github.tasks.GitHubLister') +@patch("swh.lister.github.tasks.GitHubLister") def test_incremental(lister, swh_app, celery_session_worker): # setup the mocked GitHubLister lister.return_value = lister lister.db_last_index.return_value = 42 lister.run.return_value = None - res = swh_app.send_task( - 'swh.lister.github.tasks.IncrementalGitHubLister') + res = swh_app.send_task("swh.lister.github.tasks.IncrementalGitHubLister") assert res res.wait() assert res.successful() @@ -31,15 +29,15 @@ def test_incremental(lister, swh_app, celery_session_worker): lister.run.assert_called_once_with(min_bound=42, max_bound=None) -@patch('swh.lister.github.tasks.GitHubLister') +@patch("swh.lister.github.tasks.GitHubLister") def test_range(lister, swh_app, celery_session_worker): # setup the mocked GitHubLister lister.return_value = lister lister.run.return_value = None res = swh_app.send_task( - 'swh.lister.github.tasks.RangeGitHubLister', - kwargs=dict(start=12, end=42)) + "swh.lister.github.tasks.RangeGitHubLister", kwargs=dict(start=12, end=42) + ) assert res res.wait() assert res.successful() @@ -49,16 +47,14 @@ def test_range(lister, swh_app, celery_session_worker): lister.run.assert_called_once_with(min_bound=12, max_bound=42) -@patch('swh.lister.github.tasks.GitHubLister') +@patch("swh.lister.github.tasks.GitHubLister") def test_relister(lister, swh_app, celery_session_worker): # setup the mocked GitHubLister lister.return_value = lister lister.run.return_value = None - lister.db_partition_indices.return_value = [ - (i, i+9) for i in range(0, 50, 10)] + lister.db_partition_indices.return_value = [(i, i + 9) for i in range(0, 50, 10)] - res = swh_app.send_task( - 'swh.lister.github.tasks.FullGitHubRelister') + res = swh_app.send_task("swh.lister.github.tasks.FullGitHubRelister") assert res res.wait() @@ -86,5 +82,6 @@ def test_relister(lister, swh_app, celery_session_worker): # lister.run should have been called once per partition interval for i in range(5): # XXX inconsistent behavior: max_bound is INCLUDED here - assert (dict(min_bound=10*i, max_bound=10*i + 9),) \ - in lister.run.call_args_list + assert ( + dict(min_bound=10 * i, max_bound=10 * i + 9), + ) in lister.run.call_args_list diff --git a/swh/lister/gitlab/__init__.py b/swh/lister/gitlab/__init__.py index ca2b89b..5ddf416 100644 --- a/swh/lister/gitlab/__init__.py +++ b/swh/lister/gitlab/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import GitLabModel from .lister import GitLabLister - return {'models': [GitLabModel], - 'lister': GitLabLister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [GitLabModel], + "lister": GitLabLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index d3e45bf..ca70bf4 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -15,77 +15,83 @@ from requests import Response class GitLabLister(PageByPageHttpLister): # Template path expecting an integer that represents the page id - PATH_TEMPLATE = '/projects?page=%d&order_by=id' - DEFAULT_URL = 'https://gitlab.com/api/v4/' + PATH_TEMPLATE = "/projects?page=%d&order_by=id" + DEFAULT_URL = "https://gitlab.com/api/v4/" MODEL = GitLabModel - LISTER_NAME = 'gitlab' + LISTER_NAME = "gitlab" - def __init__(self, url=None, instance=None, - override_config=None, sort='asc', per_page=20): + def __init__( + self, url=None, instance=None, override_config=None, sort="asc", per_page=20 + ): super().__init__(url=url, 
override_config=override_config) if instance is None: instance = parse_url(self.url).host self.instance = instance - self.PATH_TEMPLATE = '%s&sort=%s&per_page=%s' % ( - self.PATH_TEMPLATE, sort, per_page) + self.PATH_TEMPLATE = "%s&sort=%s&per_page=%s" % ( + self.PATH_TEMPLATE, + sort, + per_page, + ) def uid(self, repo: Dict[str, Any]) -> str: - return '%s/%s' % (self.instance, repo['path_with_namespace']) + return "%s/%s" % (self.instance, repo["path_with_namespace"]) def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: return { - 'instance': self.instance, - 'uid': self.uid(repo), - 'name': repo['name'], - 'full_name': repo['path_with_namespace'], - 'html_url': repo['web_url'], - 'origin_url': repo['http_url_to_repo'], - 'origin_type': 'git', + "instance": self.instance, + "uid": self.uid(repo), + "name": repo["name"], + "full_name": repo["path_with_namespace"], + "html_url": repo["web_url"], + "origin_url": repo["http_url_to_repo"], + "origin_type": "git", } - def transport_quota_check(self, response: Response - ) -> Tuple[bool, Union[int, float]]: + def transport_quota_check( + self, response: Response + ) -> Tuple[bool, Union[int, float]]: """Deal with rate limit if any. """ # not all gitlab instance have rate limit - if 'RateLimit-Remaining' in response.headers: - reqs_remaining = int(response.headers['RateLimit-Remaining']) + if "RateLimit-Remaining" in response.headers: + reqs_remaining = int(response.headers["RateLimit-Remaining"]) if response.status_code == 403 and reqs_remaining == 0: - reset_at = int(response.headers['RateLimit-Reset']) + reset_at = int(response.headers["RateLimit-Reset"]) delay = min(reset_at - time.time(), 3600) return True, delay return False, 0 - def _get_int(self, headers: MutableMapping[str, Any], - key: str) -> Optional[int]: + def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]: _val = headers.get(key) if _val: return int(_val) return None - def get_next_target_from_response( - self, response: Response) -> Optional[int]: + def get_next_target_from_response(self, response: Response) -> Optional[int]: """Determine the next page identifier. """ - return self._get_int(response.headers, 'x-next-page') + return self._get_int(response.headers, "x-next-page") - def get_pages_information(self) -> Tuple[Optional[int], - Optional[int], Optional[int]]: + def get_pages_information( + self, + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: """Determine pages information. 
""" response = self.transport_head(identifier=1) # type: ignore if not response.ok: raise ValueError( - 'Problem during information fetch: %s' % response.status_code) + "Problem during information fetch: %s" % response.status_code + ) h = response.headers - return (self._get_int(h, 'x-total'), - self._get_int(h, 'x-total-pages'), - self._get_int(h, 'x-per-page')) + return ( + self._get_int(h, "x-total"), + self._get_int(h, "x-total-pages"), + self._get_int(h, "x-per-page"), + ) - def transport_response_simplified(self, response: Response - ) -> List[Dict[str, Any]]: + def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: repos = response.json() return [self.get_model_from_repo(repo) for repo in repos] diff --git a/swh/lister/gitlab/models.py b/swh/lister/gitlab/models.py index 1302e67..d1907f2 100644 --- a/swh/lister/gitlab/models.py +++ b/swh/lister/gitlab/models.py @@ -11,7 +11,8 @@ class GitLabModel(ModelBase): """a Gitlab repository from a gitlab instance """ - __tablename__ = 'gitlab_repo' + + __tablename__ = "gitlab_repo" uid = Column(String, primary_key=True) instance = Column(String, index=True) diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py index e6a1755..85866bd 100644 --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -13,40 +13,41 @@ from .lister import GitLabLister NBPAGES = 10 -@shared_task(name=__name__ + '.IncrementalGitLabLister') +@shared_task(name=__name__ + ".IncrementalGitLabLister") def list_gitlab_incremental(**lister_args): """Incremental update of a GitLab instance""" - lister_args['sort'] = 'desc' + lister_args["sort"] = "desc" lister = GitLabLister(**lister_args) total_pages = lister.get_pages_information()[1] # stopping as soon as existing origins for that instance are detected return lister.run(min_bound=1, max_bound=total_pages, check_existence=True) -@shared_task(name=__name__ + '.RangeGitLabLister') +@shared_task(name=__name__ + ".RangeGitLabLister") def _range_gitlab_lister(start, end, **lister_args): lister = GitLabLister(**lister_args) return lister.run(min_bound=start, max_bound=end) -@shared_task(name=__name__ + '.FullGitLabRelister', bind=True) +@shared_task(name=__name__ + ".FullGitLabRelister", bind=True) def list_gitlab_full(self, **lister_args): """Full update of a GitLab instance""" lister = GitLabLister(**lister_args) _, total_pages, _ = lister.get_pages_information() ranges = list(utils.split_range(total_pages, NBPAGES)) random.shuffle(ranges) - promise = group(_range_gitlab_lister.s(minv, maxv, **lister_args) - for minv, maxv in ranges)() - self.log.debug('%s OK (spawned %s subtasks)' % (self.name, len(ranges))) + promise = group( + _range_gitlab_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges + )() + self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(ranges))) try: promise.save() except (NotImplementedError, AttributeError): - self.log.info('Unable to call save_group with current result backend.') + self.log.info("Unable to call save_group with current result backend.") # FIXME: what to do in terms of return here? 
return promise.id -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py index 0d02423..041e969 100644 --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -17,50 +17,50 @@ logger = logging.getLogger(__name__) class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): Lister = GitLabLister - test_re = re.compile(r'^.*/projects.*page=(\d+).*') - lister_subdir = 'gitlab' - good_api_response_file = 'data/gitlab.com/api_response.json' - bad_api_response_file = 'data/gitlab.com/api_empty_response.json' + test_re = re.compile(r"^.*/projects.*page=(\d+).*") + lister_subdir = "gitlab" + good_api_response_file = "data/gitlab.com/api_response.json" + bad_api_response_file = "data/gitlab.com/api_empty_response.json" first_index = 1 entries_per_page = 10 convert_type = int def response_headers(self, request): - headers = {'RateLimit-Remaining': '1'} + headers = {"RateLimit-Remaining": "1"} if self.request_index(request) == self.first_index: - headers.update({ - 'x-next-page': '3', - }) + headers.update( + {"x-next-page": "3",} + ) return headers def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 - context.headers['RateLimit-Remaining'] = '0' + context.headers["RateLimit-Remaining"] = "0" one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) - context.headers['RateLimit-Reset'] = str(one_second) + context.headers["RateLimit-Reset"] = str(one_second) return '{"error":"dummy"}' def test_lister_gitlab(swh_listers, requests_mock_datadir): - lister = swh_listers['gitlab'] + lister = swh_listers["gitlab"] lister.run() - r = lister.scheduler.search_tasks(task_type='load-git') + r = lister.scheduler.search_tasks(task_type="load-git") assert len(r) == 10 for row in r: - assert row['type'] == 'load-git' + assert row["type"] == "load-git" # arguments check - args = row['arguments']['args'] + args = row["arguments"]["args"] assert len(args) == 0 # kwargs - kwargs = row['arguments']['kwargs'] - url = kwargs['url'] - assert url.startswith('https://gitlab.com') + kwargs = row["arguments"]["kwargs"] + url = kwargs["url"] + assert url.startswith("https://gitlab.com") - assert row['policy'] == 'recurring' - assert row['priority'] is None + assert row["policy"] == "recurring" + assert row["priority"] is None diff --git a/swh/lister/gitlab/tests/test_tasks.py b/swh/lister/gitlab/tests/test_tasks.py index 56332a1..1959989 100644 --- a/swh/lister/gitlab/tests/test_tasks.py +++ b/swh/lister/gitlab/tests/test_tasks.py @@ -5,43 +5,40 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.gitlab.tasks.ping') + res = swh_app.send_task("swh.lister.gitlab.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.gitlab.tasks.GitLabLister') +@patch("swh.lister.gitlab.tasks.GitLabLister") def test_incremental(lister, swh_app, celery_session_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None lister.get_pages_information.return_value = (None, 10, None) - res = swh_app.send_task( - 'swh.lister.gitlab.tasks.IncrementalGitLabLister') + res = swh_app.send_task("swh.lister.gitlab.tasks.IncrementalGitLabLister") assert res res.wait() assert res.successful() - 
lister.assert_called_once_with(sort='desc') + lister.assert_called_once_with(sort="desc") lister.db_last_index.assert_not_called() lister.get_pages_information.assert_called_once_with() - lister.run.assert_called_once_with( - min_bound=1, max_bound=10, check_existence=True) + lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True) -@patch('swh.lister.gitlab.tasks.GitLabLister') +@patch("swh.lister.gitlab.tasks.GitLabLister") def test_range(lister, swh_app, celery_session_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None res = swh_app.send_task( - 'swh.lister.gitlab.tasks.RangeGitLabLister', - kwargs=dict(start=12, end=42)) + "swh.lister.gitlab.tasks.RangeGitLabLister", kwargs=dict(start=12, end=42) + ) assert res res.wait() assert res.successful() @@ -51,17 +48,17 @@ def test_range(lister, swh_app, celery_session_worker): lister.run.assert_called_once_with(min_bound=12, max_bound=42) -@patch('swh.lister.gitlab.tasks.GitLabLister') +@patch("swh.lister.gitlab.tasks.GitLabLister") def test_relister(lister, swh_app, celery_session_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None lister.get_pages_information.return_value = (None, 85, None) lister.db_partition_indices.return_value = [ - (i, i+9) for i in range(0, 80, 10)] + [(80, 85)] + (i, i + 9) for i in range(0, 80, 10) + ] + [(80, 85)] - res = swh_app.send_task( - 'swh.lister.gitlab.tasks.FullGitLabRelister') + res = swh_app.send_task("swh.lister.gitlab.tasks.FullGitLabRelister") assert res res.wait() @@ -90,24 +87,26 @@ def test_relister(lister, swh_app, celery_session_worker): # lister.run should have been called once per partition interval for i in range(8): # XXX inconsistent behavior: max_bound is EXCLUDED here - assert (dict(min_bound=10*i, max_bound=10*i + 10),) \ - in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) \ - in lister.run.call_args_list + assert ( + dict(min_bound=10 * i, max_bound=10 * i + 10), + ) in lister.run.call_args_list + assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list -@patch('swh.lister.gitlab.tasks.GitLabLister') +@patch("swh.lister.gitlab.tasks.GitLabLister") def test_relister_instance(lister, swh_app, celery_session_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None lister.get_pages_information.return_value = (None, 85, None) lister.db_partition_indices.return_value = [ - (i, i+9) for i in range(0, 80, 10)] + [(80, 85)] + (i, i + 9) for i in range(0, 80, 10) + ] + [(80, 85)] res = swh_app.send_task( - 'swh.lister.gitlab.tasks.FullGitLabRelister', - kwargs=dict(url='https://0xacab.org/api/v4')) + "swh.lister.gitlab.tasks.FullGitLabRelister", + kwargs=dict(url="https://0xacab.org/api/v4"), + ) assert res res.wait() @@ -123,7 +122,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker): break sleep(1) - lister.assert_called_with(url='https://0xacab.org/api/v4') + lister.assert_called_with(url="https://0xacab.org/api/v4") # one by the FullGitlabRelister task # + 9 for the RangeGitlabLister subtasks @@ -136,7 +135,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker): # lister.run should have been called once per partition interval for i in range(8): # XXX inconsistent behavior: max_bound is EXCLUDED here - assert (dict(min_bound=10*i, max_bound=10*i + 10),) \ - in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) \ - in 
lister.run.call_args_list + assert ( + dict(min_bound=10 * i, max_bound=10 * i + 10), + ) in lister.run.call_args_list + assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py index 7787464..8ff20bb 100644 --- a/swh/lister/gnu/__init__.py +++ b/swh/lister/gnu/__init__.py @@ -7,7 +7,8 @@ def register(): from .models import GNUModel from .lister import GNULister - return {'models': [GNUModel], - 'lister': GNULister, - 'task_modules': ['%s.tasks' % __name__], - } + return { + "models": [GNUModel], + "lister": GNULister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 1f41b0a..3390078 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -18,12 +18,12 @@ logger = logging.getLogger(__name__) class GNULister(SimpleLister): MODEL = GNUModel - LISTER_NAME = 'gnu' - instance = 'gnu' + LISTER_NAME = "gnu" + instance = "gnu" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz') + self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz") def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict @@ -51,10 +51,10 @@ class GNULister(SimpleLister): """ artifacts = self.gnu_tree.artifacts[origin_url] - assert origin_type == 'tar' + assert origin_type == "tar" return utils.create_task_dict( - 'load-archive-files', - kwargs.get('policy', 'oneshot'), + "load-archive-files", + kwargs.get("policy", "oneshot"), url=origin_url, artifacts=artifacts, retries_left=3, @@ -103,11 +103,11 @@ class GNULister(SimpleLister): """ return { - 'uid': repo['url'], - 'name': repo['name'], - 'full_name': repo['name'], - 'html_url': repo['url'], - 'origin_url': repo['url'], - 'time_last_updated': repo['time_modified'], - 'origin_type': 'tar', + "uid": repo["url"], + "name": repo["name"], + "full_name": repo["name"], + "html_url": repo["url"], + "origin_url": repo["url"], + "time_last_updated": repo["time_modified"], + "origin_type": "tar", } diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py index 38c47ae..db024f7 100644 --- a/swh/lister/gnu/models.py +++ b/swh/lister/gnu/models.py @@ -11,7 +11,8 @@ class GNUModel(ModelBase): """a GNU repository representation """ - __tablename__ = 'gnu_repo' + + __tablename__ = "gnu_repo" uid = Column(String, primary_key=True) time_last_updated = Column(DateTime) diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py index edcde7e..3134582 100644 --- a/swh/lister/gnu/tasks.py +++ b/swh/lister/gnu/tasks.py @@ -7,12 +7,12 @@ from celery import shared_task from .lister import GNULister -@shared_task(name=__name__ + '.GNUListerTask') +@shared_task(name=__name__ + ".GNUListerTask") def list_gnu_full(**lister_args): """List lister for the GNU source code archive""" return GNULister(**lister_args).run() -@shared_task(name=__name__ + '.ping') +@shared_task(name=__name__ + ".ping") def _ping(): - return 'OK' + return "OK" diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py index a1c9c09..92a9bc4 100644 --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -10,43 +10,41 @@ logger = logging.getLogger(__name__) def test_gnu_lister(swh_listers, requests_mock_datadir): - lister = swh_listers['gnu'] + lister = swh_listers["gnu"] lister.run() - r = lister.scheduler.search_tasks(task_type='load-archive-files') + r = 
lister.scheduler.search_tasks(task_type="load-archive-files") assert len(r) == 383 for row in r: - assert row['type'] == 'load-archive-files' + assert row["type"] == "load-archive-files" # arguments check - args = row['arguments']['args'] + args = row["arguments"]["args"] assert len(args) == 0 # kwargs - kwargs = row['arguments']['kwargs'] - assert set(kwargs.keys()) == {'url', 'artifacts'} + kwargs = row["arguments"]["kwargs"] + assert set(kwargs.keys()) == {"url", "artifacts"} - url = kwargs['url'] - assert url.startswith('https://ftp.gnu.org') + url = kwargs["url"] + assert url.startswith("https://ftp.gnu.org") - url_suffix = url.split('https://ftp.gnu.org')[1] - assert 'gnu' in url_suffix or 'old-gnu' in url_suffix + url_suffix = url.split("https://ftp.gnu.org")[1] + assert "gnu" in url_suffix or "old-gnu" in url_suffix - artifacts = kwargs['artifacts'] + artifacts = kwargs["artifacts"] # check the artifact's structure artifact = artifacts[0] - assert set(artifact.keys()) == { - 'url', 'length', 'time', 'filename', 'version' - } + assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"} for artifact in artifacts: logger.debug(artifact) # 'time' is an isoformat string now - for key in ['url', 'time', 'filename', 'version']: + for key in ["url", "time", "filename", "version"]: assert isinstance(artifact[key], str) - assert isinstance(artifact['length'], int) + assert isinstance(artifact["length"], int) - assert row['policy'] == 'oneshot' - assert row['priority'] is None - assert row['retries_left'] == 3 + assert row["policy"] == "oneshot" + assert row["priority"] is None + assert row["retries_left"] == 3 diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py index 4c82f77..d496798 100644 --- a/swh/lister/gnu/tests/test_tasks.py +++ b/swh/lister/gnu/tests/test_tasks.py @@ -2,22 +2,20 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): - res = swh_app.send_task( - 'swh.lister.gnu.tasks.ping') + res = swh_app.send_task("swh.lister.gnu.tasks.ping") assert res res.wait() assert res.successful() - assert res.result == 'OK' + assert res.result == "OK" -@patch('swh.lister.gnu.tasks.GNULister') +@patch("swh.lister.gnu.tasks.GNULister") def test_lister(lister, swh_app, celery_session_worker): # setup the mocked GNULister lister.return_value = lister lister.run.return_value = None - res = swh_app.send_task( - 'swh.lister.gnu.tasks.GNUListerTask') + res = swh_app.send_task("swh.lister.gnu.tasks.GNUListerTask") assert res res.wait() assert res.successful() diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py index ea25515..f09fe9e 100644 --- a/swh/lister/gnu/tests/test_tree.py +++ b/swh/lister/gnu/tests/test_tree.py @@ -9,26 +9,30 @@ import pytest from os import path from swh.lister.gnu.tree import ( - GNUTree, find_artifacts, check_filename_is_archive, load_raw_data, - get_version, format_date + GNUTree, + find_artifacts, + check_filename_is_archive, + load_raw_data, + get_version, + format_date, ) def test_load_raw_data_from_query(requests_mock_datadir): - actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz') + actual_json = load_raw_data("https://ftp.gnu.org/tree.json.gz") assert actual_json is not None assert isinstance(actual_json, list) assert len(actual_json) == 2 def test_load_raw_data_from_query_failure(requests_mock_datadir): - inexistant_url = 'https://ftp2.gnu.org/tree.unknown.gz' - with pytest.raises(ValueError, match='Error during query'): + inexistant_url = 
"https://ftp2.gnu.org/tree.unknown.gz" + with pytest.raises(ValueError, match="Error during query"): load_raw_data(inexistant_url) def test_load_raw_data_from_file(datadir): - filepath = path.join(datadir, 'https_ftp.gnu.org', 'tree.json.gz') + filepath = path.join(datadir, "https_ftp.gnu.org", "tree.json.gz") actual_json = load_raw_data(filepath) assert actual_json is not None assert isinstance(actual_json, list) @@ -36,115 +40,115 @@ def test_load_raw_data_from_file(datadir): def test_load_raw_data_from_file_failure(datadir): - unknown_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz') + unknown_path = path.join(datadir, "ftp.gnu.org2", "tree.json.gz") with pytest.raises(FileNotFoundError): load_raw_data(unknown_path) def test_tree_json(requests_mock_datadir): - tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz') + tree_json = GNUTree("https://ftp.gnu.org/tree.json.gz") - assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == { - 'name': '8sync', - 'time_modified': '2017-03-18T06:10:08+00:00', - 'url': 'https://ftp.gnu.org/gnu/8sync/' + assert tree_json.projects["https://ftp.gnu.org/gnu/8sync/"] == { + "name": "8sync", + "time_modified": "2017-03-18T06:10:08+00:00", + "url": "https://ftp.gnu.org/gnu/8sync/", } - assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == { - 'name': '3dldf', - 'time_modified': '2013-12-13T19:00:36+00:00', - 'url': 'https://ftp.gnu.org/gnu/3dldf/' + assert tree_json.projects["https://ftp.gnu.org/gnu/3dldf/"] == { + "name": "3dldf", + "time_modified": "2013-12-13T19:00:36+00:00", + "url": "https://ftp.gnu.org/gnu/3dldf/", } - assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == { - 'name': 'a2ps', - 'time_modified': '2007-12-29T03:55:05+00:00', - 'url': 'https://ftp.gnu.org/gnu/a2ps/' + assert tree_json.projects["https://ftp.gnu.org/gnu/a2ps/"] == { + "name": "a2ps", + "time_modified": "2007-12-29T03:55:05+00:00", + "url": "https://ftp.gnu.org/gnu/a2ps/", } - assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == { - 'name': 'xshogi', - 'time_modified': '2003-08-02T11:15:22+00:00', - 'url': 'https://ftp.gnu.org/old-gnu/xshogi/' + assert tree_json.projects["https://ftp.gnu.org/old-gnu/xshogi/"] == { + "name": "xshogi", + "time_modified": "2003-08-02T11:15:22+00:00", + "url": "https://ftp.gnu.org/old-gnu/xshogi/", } - assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [ + assert tree_json.artifacts["https://ftp.gnu.org/old-gnu/zlibc/"] == [ { - 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa - 'length': 90106, - 'time': '1997-03-10T08:00:00+00:00', - 'filename': 'zlibc-0.9b.tar.gz', - 'version': '0.9b', + "url": "https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz", # noqa + "length": 90106, + "time": "1997-03-10T08:00:00+00:00", + "filename": "zlibc-0.9b.tar.gz", + "version": "0.9b", }, { - 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa - 'length': 89625, - 'time': '1997-04-07T07:00:00+00:00', - 'filename': 'zlibc-0.9e.tar.gz', - 'version': '0.9e', - } + "url": "https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz", # noqa + "length": 89625, + "time": "1997-04-07T07:00:00+00:00", + "filename": "zlibc-0.9e.tar.gz", + "version": "0.9e", + }, ] def test_tree_json_failures(requests_mock_datadir): - url = 'https://unknown/tree.json.gz' + url = "https://unknown/tree.json.gz" tree_json = GNUTree(url) - with pytest.raises(ValueError, match='Error during query to %s' % url): - tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/'] + with pytest.raises(ValueError, 
match="Error during query to %s" % url): + tree_json.artifacts["https://ftp.gnu.org/gnu/3dldf/"] - with pytest.raises(ValueError, match='Error during query to %s' % url): - tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] + with pytest.raises(ValueError, match="Error during query to %s" % url): + tree_json.projects["https://ftp.gnu.org/old-gnu/xshogi/"] def test_find_artifacts_small_sample(datadir): expected_artifacts = [ { - 'url': '/root/artanis/artanis-0.2.1.tar.bz2', - 'time': '2017-05-19T14:59:39+00:00', - 'length': 424081, - 'version': '0.2.1', - 'filename': 'artanis-0.2.1.tar.bz2', + "url": "/root/artanis/artanis-0.2.1.tar.bz2", + "time": "2017-05-19T14:59:39+00:00", + "length": 424081, + "version": "0.2.1", + "filename": "artanis-0.2.1.tar.bz2", }, { - 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa - 'time': '1998-06-21T09:55:00+00:00', - 'length': 1514448, - 'version': '4_0_0-src', - 'filename': 'winboard-4_0_0-src.zip', + "url": "/root/xboard/winboard/winboard-4_0_0-src.zip", # noqa + "time": "1998-06-21T09:55:00+00:00", + "length": 1514448, + "version": "4_0_0-src", + "filename": "winboard-4_0_0-src.zip", }, { - 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa - 'time': '1997-07-25T07:00:00+00:00', - 'length': 450164, - 'version': '3.6.2', - 'filename': 'xboard-3.6.2.tar.gz', + "url": "/root/xboard/xboard-3.6.2.tar.gz", # noqa + "time": "1997-07-25T07:00:00+00:00", + "length": 450164, + "version": "3.6.2", + "filename": "xboard-3.6.2.tar.gz", }, { - 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa - 'time': '1998-06-21T09:55:00+00:00', - 'length': 514951, - 'version': '4.0.0', - 'filename': 'xboard-4.0.0.tar.gz', + "url": "/root/xboard/xboard-4.0.0.tar.gz", # noqa + "time": "1998-06-21T09:55:00+00:00", + "length": 514951, + "version": "4.0.0", + "filename": "xboard-4.0.0.tar.gz", }, ] - file_structure = json.load(open(path.join(datadir, 'tree.min.json'))) - actual_artifacts = find_artifacts(file_structure, '/root/') + file_structure = json.load(open(path.join(datadir, "tree.min.json"))) + actual_artifacts = find_artifacts(file_structure, "/root/") assert actual_artifacts == expected_artifacts def test_find_artifacts(datadir): - file_structure = json.load(open(path.join(datadir, 'tree.json'))) - actual_artifacts = find_artifacts(file_structure, '/root/') + file_structure = json.load(open(path.join(datadir, "tree.json"))) + actual_artifacts = find_artifacts(file_structure, "/root/") assert len(actual_artifacts) == 42 + 3 # tar + zip def test_check_filename_is_archive(): - for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']: + for ext in ["abc.xy.zip", "cvb.zip", "abc.tar.bz2", "something.tar"]: assert check_filename_is_archive(ext) is True - for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']: + for ext in ["abc.tar.gz.sig", "abc", "something.zip2", "foo.tar."]: assert check_filename_is_archive(ext) is False @@ -155,54 +159,62 @@ def test_get_version(): """ for url, expected_branchname in [ - ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'), - ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'), - ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'), - ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'), - ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'), - ('https://ftp.org/gnu/aris-w32.zip', 'w32'), - ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'), - ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'), - ('https://ftp.org/gnu/crypto-build-demo.tar.gz', - 'crypto-build-demo'), - 
('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz', - 'clue+clio+xit.clisp'), - ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz', - 'clue+clio.for-pcl'), - ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz', - 'hppa2.0-hp-hpux10.20'), - ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'), - ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'), - ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'), - ('clisp-powerpc-unknown-linuxlibc6.tar.gz', - 'powerpc-unknown-linuxlibc6'), - - ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'), - ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'), - ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'), - ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz', - 'sparc-sun-sunos4.1.3_U1'), - ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz', - '2.25.1-powerpc-apple-MacOSX'), - ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz', - '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'), - ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz', - '2.27-i686-unknown-Linux-2.2.19'), - ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz', - '2.28-i386-i386-freebsd-4.3-RELEASE'), - ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz', - '2.28-i686-unknown-cygwin_me-4.90-1.3.10'), - ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz', - '2.29-i386-i386-freebsd-4.6-STABLE'), - ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz', - '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'), - ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip', - '2.5.3-ansi-japi-xdr.20030701_mingw32'), - ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'), - ('sather-logo_images.tar.gz', 'sather-logo_images'), - ('sather-specification-000328.html.tar.gz', '000328.html'), - ('something-10.1.0.7z', '10.1.0'), - + ("https://gnu.org/sthg/info-2.1.0.tar.gz", "2.1.0"), + ("https://gnu.org/sthg/info-2.1.2.zip", "2.1.2"), + ("https://sthg.org/gnu/sthg.tar.gz", "sthg"), + ("https://sthg.org/gnu/DLDF-1.1.4.tar.gz", "1.1.4"), + ("https://sthg.org/gnu/anubis-latest.tar.bz2", "latest"), + ("https://ftp.org/gnu/aris-w32.zip", "w32"), + ("https://ftp.org/gnu/aris-w32-2.2.zip", "w32-2.2"), + ("https://ftp.org/gnu/autogen.info.tar.gz", "autogen.info"), + ("https://ftp.org/gnu/crypto-build-demo.tar.gz", "crypto-build-demo"), + ("https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz", "clue+clio+xit.clisp"), + ("https://ftp.org/gnu/clue+clio.for-pcl.tar.gz", "clue+clio.for-pcl"), + ( + "https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz", + "hppa2.0-hp-hpux10.20", + ), + ("clisp-i386-solaris2.6.tar.gz", "i386-solaris2.6"), + ("clisp-mips-sgi-irix6.5.tar.gz", "mips-sgi-irix6.5"), + ("clisp-powerpc-apple-macos.tar.gz", "powerpc-apple-macos"), + ("clisp-powerpc-unknown-linuxlibc6.tar.gz", "powerpc-unknown-linuxlibc6"), + ("clisp-rs6000-ibm-aix3.2.5.tar.gz", "rs6000-ibm-aix3.2.5"), + ("clisp-sparc-redhat51-linux.tar.gz", "sparc-redhat51-linux"), + ("clisp-sparc-sun-solaris2.4.tar.gz", "sparc-sun-solaris2.4"), + ("clisp-sparc-sun-sunos4.1.3_U1.tar.gz", "sparc-sun-sunos4.1.3_U1"), + ("clisp-2.25.1-powerpc-apple-MacOSX.tar.gz", "2.25.1-powerpc-apple-MacOSX"), + ( + "clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz", + "2.27-PowerMacintosh-powerpc-Darwin-1.3.7", + ), + ( + "clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz", + "2.27-i686-unknown-Linux-2.2.19", + ), + ( + "clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz", + "2.28-i386-i386-freebsd-4.3-RELEASE", + ), + ( + "clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz", + "2.28-i686-unknown-cygwin_me-4.90-1.3.10", + ), + ( + 
"clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz", + "2.29-i386-i386-freebsd-4.6-STABLE", + ), + ( + "clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz", + "2.29-i686-unknown-cygwin_nt-5.0-1.3.12", + ), + ( + "gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip", + "2.5.3-ansi-japi-xdr.20030701_mingw32", + ), + ("gettext-runtime-0.13.1.bin.woe32.zip", "0.13.1.bin.woe32"), + ("sather-logo_images.tar.gz", "sather-logo_images"), + ("sather-specification-000328.html.tar.gz", "000328.html"), + ("something-10.1.0.7z", "10.1.0"), ]: actual_branchname = get_version(url) @@ -211,16 +223,16 @@ def test_get_version(): def test_format_date(): for timestamp, expected_isoformat_date in [ - (1489817408, '2017-03-18T06:10:08+00:00'), - (1386961236, '2013-12-13T19:00:36+00:00'), - ('1198900505', '2007-12-29T03:55:05+00:00'), - (1059822922, '2003-08-02T11:15:22+00:00'), - ('1489817408', '2017-03-18T06:10:08+00:00'), + (1489817408, "2017-03-18T06:10:08+00:00"), + (1386961236, "2013-12-13T19:00:36+00:00"), + ("1198900505", "2007-12-29T03:55:05+00:00"), + (1059822922, "2003-08-02T11:15:22+00:00"), + ("1489817408", "2017-03-18T06:10:08+00:00"), ]: actual_date = format_date(timestamp) assert actual_date == expected_isoformat_date with pytest.raises(ValueError): - format_date('') + format_date("") with pytest.raises(TypeError): format_date(None) diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py index 8ef6bd6..f01c0a7 100644 --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -24,12 +24,13 @@ class GNUTree: """Gnu Tree's representation """ + def __init__(self, url: str): self.url = url # filepath or uri u = urlparse(url) - self.base_url = '%s://%s' % (u.scheme, u.netloc) + self.base_url = "%s://%s" % (u.scheme, u.netloc) # Interesting top level directories - self.top_level_directories = ['gnu', 'old-gnu'] + self.top_level_directories = ["gnu", "old-gnu"] # internal state self._artifacts = {} # type: Mapping[str, Any] self._projects = {} # type: Mapping[str, Any] @@ -59,21 +60,23 @@ class GNUTree: artifacts = {} raw_data = load_raw_data(self.url)[0] - for directory in raw_data['contents']: - if directory['name'] not in self.top_level_directories: + for directory in raw_data["contents"]: + if directory["name"] not in self.top_level_directories: continue - infos = directory['contents'] + infos = directory["contents"] for info in infos: - if info['type'] == 'directory': - package_url = '%s/%s/%s/' % ( - self.base_url, directory['name'], info['name']) - package_artifacts = find_artifacts( - info['contents'], package_url) + if info["type"] == "directory": + package_url = "%s/%s/%s/" % ( + self.base_url, + directory["name"], + info["name"], + ) + package_artifacts = find_artifacts(info["contents"], package_url) if package_artifacts != []: repo_details = { - 'name': info['name'], - 'url': package_url, - 'time_modified': format_date(info['time']) + "name": info["name"], + "url": package_url, + "time_modified": format_date(info["time"]), } artifacts[package_url] = package_artifacts projects[package_url] = repo_details @@ -81,8 +84,9 @@ class GNUTree: return projects, artifacts -def find_artifacts(filesystem: List[Mapping[str, Any]], - url: str) -> List[Mapping[str, Any]]: +def find_artifacts( + filesystem: List[Mapping[str, Any]], url: str +) -> List[Mapping[str, Any]]: """Recursively list artifacts present in the folder and subfolders for a particular package url. 
@@ -127,23 +131,25 @@ def find_artifacts(filesystem: List[Mapping[str, Any]],
     """
     artifacts = []  # type: List[Mapping[str, Any]]
     for info_file in filesystem:
-        filetype = info_file['type']
-        filename = info_file['name']
-        if filetype == 'file':
+        filetype = info_file["type"]
+        filename = info_file["name"]
+        if filetype == "file":
             if check_filename_is_archive(filename):
                 uri = url + filename
-                artifacts.append({
-                    'url': uri,
-                    'filename': filename,
-                    'time': format_date(info_file['time']),
-                    'length': int(info_file['size']),
-                    'version': get_version(filename),
-                })
+                artifacts.append(
+                    {
+                        "url": uri,
+                        "filename": filename,
+                        "time": format_date(info_file["time"]),
+                        "length": int(info_file["size"]),
+                        "version": get_version(filename),
+                    }
+                )
         # It will recursively check for artifacts in all sub-folders
-        elif filetype == 'directory':
+        elif filetype == "directory":
             tarballs_in_dir = find_artifacts(
-                info_file['contents'],
-                url + filename + '/')
+                info_file["contents"], url + filename + "/"
+            )
             artifacts.extend(tarballs_in_dir)
 
     return artifacts
@@ -176,40 +182,67 @@ def check_filename_is_archive(filename: str) -> bool:
     """
     file_suffixes = Path(filename).suffixes
-    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
+    if len(file_suffixes) == 1 and file_suffixes[-1] in (".zip", ".tar"):
         return True
     elif len(file_suffixes) > 1:
-        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
+        if file_suffixes[-1] == ".zip" or file_suffixes[-2] == ".tar":
             return True
     return False
 
 
 # to recognize existing naming pattern
 EXTENSIONS = [
-    'zip',
-    'tar',
-    'gz', 'tgz',
-    'bz2', 'bzip2',
-    'lzma', 'lz',
-    'xz',
-    'Z', '7z',
+    "zip",
+    "tar",
+    "gz",
+    "tgz",
+    "bz2",
+    "bzip2",
+    "lzma",
+    "lz",
+    "xz",
+    "Z",
+    "7z",
 ]
 
 VERSION_KEYWORDS = [
-    'cygwin_me',
-    'w32', 'win32', 'nt', 'cygwin', 'mingw',
-    'latest', 'alpha', 'beta',
-    'release', 'stable',
-    'hppa',
-    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
-    'aix', 'ibm', 'rs6000',
-    'i386', 'i686',
-    'linux', 'redhat', 'linuxlibc',
-    'mips',
-    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
-    'unknown',
-    'netbsd', 'freebsd',
-    'sgi', 'irix',
+    "cygwin_me",
+    "w32",
+    "win32",
+    "nt",
+    "cygwin",
+    "mingw",
+    "latest",
+    "alpha",
+    "beta",
+    "release",
+    "stable",
+    "hppa",
+    "solaris",
+    "sunos",
+    "sun4u",
+    "sparc",
+    "sun",
+    "aix",
+    "ibm",
+    "rs6000",
+    "i386",
+    "i686",
+    "linux",
+    "redhat",
+    "linuxlibc",
+    "mips",
+    "powerpc",
+    "macos",
+    "apple",
+    "darwin",
+    "macosx",
+    "powermacintosh",
+    "unknown",
+    "netbsd",
+    "freebsd",
+    "sgi",
+    "irix",
 ]
 
 # Match a filename into components.
@@ -225,7 +258,7 @@ VERSION_KEYWORDS = [
 # greedily with +, software_name and release_number are matched lazily
 # with +? and *?).
 
-PATTERN = r'''
+PATTERN = r"""
 ^
 (?:
     # We have a software name and a release number, separated with a
@@ -239,9 +272,9 @@
 )
 (?P<extension>(?:\.(?:{extensions}))+)
 $
-'''.format(
-    extensions='|'.join(EXTENSIONS),
-    vkeywords='|'.join('%s[-]?' % k for k in VERSION_KEYWORDS),
+""".format(
+    extensions="|".join(EXTENSIONS),
+    vkeywords="|".join("%s[-]?"
         % k for k in VERSION_KEYWORDS),
 )


@@ -267,16 +300,15 @@ def get_version(uri: str) -> str:
     """
     filename = path.split(uri)[-1]
-    m = re.match(PATTERN, filename,
-                 flags=re.VERBOSE | re.IGNORECASE)
+    m = re.match(PATTERN, filename, flags=re.VERBOSE | re.IGNORECASE)
     if m:
         d = m.groupdict()
-        if d['software_name1'] and d['release_number']:
-            return d['release_number']
-        if d['software_name2']:
-            return d['software_name2']
+        if d["software_name1"] and d["release_number"]:
+            return d["release_number"]
+        if d["software_name2"]:
+            return d["software_name2"]

-    return ''
+    return ""


 def load_raw_data(url: str) -> Sequence[Mapping]:
@@ -289,15 +321,15 @@ def load_raw_data(url: str) -> Sequence[Mapping]:
         The raw json list

     """
-    if url.startswith('http://') or url.startswith('https://'):
+    if url.startswith("http://") or url.startswith("https://"):
         response = requests.get(url, allow_redirects=True)
         if not response.ok:
-            raise ValueError('Error during query to %s' % url)
+            raise ValueError("Error during query to %s" % url)
         raw = gzip.decompress(response.content)
     else:
-        with gzip.open(url, 'r') as f:
+        with gzip.open(url, "r") as f:
             raw = f.read()
-    raw_data = json.loads(raw.decode('utf-8'))
+    raw_data = json.loads(raw.decode("utf-8"))
     return raw_data
diff --git a/swh/lister/npm/__init__.py b/swh/lister/npm/__init__.py
index 77c3d38..0d10210 100644
--- a/swh/lister/npm/__init__.py
+++ b/swh/lister/npm/__init__.py
@@ -7,14 +7,15 @@ def register():
     from .models import NpmVisitModel, NpmModel
     from .lister import NpmLister

-    return {'models': [NpmVisitModel, NpmModel],
-            'lister': NpmLister,
-            'task_modules': ['%s.tasks' % __name__],
-            'task_types': {
-                'list-npm-full': {
-                    'default_interval': '7 days',
-                    'min_interval': '7 days',
-                    'max_interval': '7 days',
-                },
-            },
-            }
+    return {
+        "models": [NpmVisitModel, NpmModel],
+        "lister": NpmLister,
+        "task_modules": ["%s.tasks" % __name__],
+        "task_types": {
+            "list-npm-full": {
+                "default_interval": "7 days",
+                "min_interval": "7 days",
+                "max_interval": "7 days",
+            },
+        },
+    }
diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py
index 5214032..15c2556 100644
--- a/swh/lister/npm/lister.py
+++ b/swh/lister/npm/lister.py
@@ -14,15 +14,17 @@ class NpmListerBase(IndexingHttpLister):
     """List packages available in the npm registry in a paginated way

     """
-    MODEL = NpmModel
-    LISTER_NAME = 'npm'
-    instance = 'npm'
-    def __init__(self, url='https://replicate.npmjs.com',
-                 per_page=1000, override_config=None):
+    MODEL = NpmModel
+    LISTER_NAME = "npm"
+    instance = "npm"
+
+    def __init__(
+        self, url="https://replicate.npmjs.com", per_page=1000, override_config=None
+    ):
         super().__init__(url=url, override_config=override_config)
         self.per_page = per_page + 1
-        self.PATH_TEMPLATE += '&limit=%s' % self.per_page
+        self.PATH_TEMPLATE += "&limit=%s" % self.per_page

     @property
     def ADDITIONAL_CONFIG(self) -> Dict[str, Any]:
@@ -30,22 +32,22 @@ class NpmListerBase(IndexingHttpLister):
         """
         default_config = super().ADDITIONAL_CONFIG
-        default_config['loading_task_policy'] = ('str', 'recurring')
+        default_config["loading_task_policy"] = ("str", "recurring")
         return default_config

     def get_model_from_repo(self, repo_name: str) -> Dict[str, str]:
         """(Override) Transform from npm package name to model

         """
-        package_url = 'https://www.npmjs.com/package/%s' % repo_name
+        package_url = "https://www.npmjs.com/package/%s" % repo_name
         return {
-            'uid': repo_name,
-            'indexable': repo_name,
-            'name': repo_name,
-            'full_name': repo_name,
-            'html_url': package_url,
-            'origin_url': package_url,
-            'origin_type': 'npm',
+            "uid": repo_name,
+            "indexable": repo_name,
+            "name": repo_name,
+            "full_name": repo_name,
+            "html_url": package_url,
+            "origin_url": package_url,
+            "origin_type": "npm",
         }

     def task_dict(self, origin_type: str, origin_url: str, **kwargs):
@@ -56,10 +58,9 @@ class NpmListerBase(IndexingHttpLister):
         needed for the ingestion task creation.

         """
-        task_type = 'load-%s' % origin_type
-        task_policy = self.config['loading_task_policy']
-        return create_task_dict(task_type, task_policy,
-                                url=origin_url)
+        task_type = "load-%s" % origin_type
+        task_policy = self.config["loading_task_policy"]
+        return create_task_dict(task_type, task_policy, url=origin_url)

     def request_headers(self) -> Dict[str, Any]:
         """(Override) Set requests headers to send when querying the npm
@@ -67,7 +68,7 @@ class NpmListerBase(IndexingHttpLister):
         """
         headers = super().request_headers()
-        headers['Accept'] = 'application/json'
+        headers["Accept"] = "application/json"
         return headers

     def string_pattern_check(self, inner: int, lower: int, upper: int = None):
@@ -83,25 +84,24 @@ class NpmLister(NpmListerBase):
     """List all packages available in the npm registry in a paginated way

     """
+
     PATH_TEMPLATE = '/_all_docs?startkey="%s"'

-    def get_next_target_from_response(
-            self, response: Response) -> Optional[str]:
+    def get_next_target_from_response(self, response: Response) -> Optional[str]:
         """(Override) Get next npm package name to continue the listing

         """
-        repos = response.json()['rows']
-        return repos[-1]['id'] if len(repos) == self.per_page else None
+        repos = response.json()["rows"]
+        return repos[-1]["id"] if len(repos) == self.per_page else None

-    def transport_response_simplified(
-            self, response: Response) -> List[Dict[str, str]]:
+    def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
         """(Override) Transform npm registry response to list for model
         manipulation

         """
-        repos = response.json()['rows']
+        repos = response.json()["rows"]
         if len(repos) == self.per_page:
             repos = repos[:-1]
-        return [self.get_model_from_repo(repo['id']) for repo in repos]
+        return [self.get_model_from_repo(repo["id"]) for repo in repos]


 class NpmIncrementalLister(NpmListerBase):
@@ -109,30 +109,29 @@ class NpmIncrementalLister(NpmListerBase):
     update_seq value of the underlying CouchDB database, in a paginated way.

     """
-    PATH_TEMPLATE = '/_changes?since=%s'
+
+    PATH_TEMPLATE = "/_changes?since=%s"

     @property
     def CONFIG_BASE_FILENAME(self):  # noqa: N802
-        return 'lister_npm_incremental'
+        return "lister_npm_incremental"

-    def get_next_target_from_response(
-            self, response: Response) -> Optional[str]:
+    def get_next_target_from_response(self, response: Response) -> Optional[str]:
         """(Override) Get next npm package name to continue the listing.

         """
-        repos = response.json()['results']
-        return repos[-1]['seq'] if len(repos) == self.per_page else None
+        repos = response.json()["results"]
+        return repos[-1]["seq"] if len(repos) == self.per_page else None

-    def transport_response_simplified(
-            self, response: Response) -> List[Dict[str, str]]:
+    def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
         """(Override) Transform npm registry response to list for model
         manipulation.

         """
-        repos = response.json()['results']
+        repos = response.json()["results"]
         if len(repos) == self.per_page:
             repos = repos[:-1]
-        return [self.get_model_from_repo(repo['id']) for repo in repos]
+        return [self.get_model_from_repo(repo["id"]) for repo in repos]

     def filter_before_inject(self, models_list: List[Dict[str, Any]]):
         """(Override) Filter out documents in the CouchDB database
@@ -141,9 +140,9 @@ class NpmIncrementalLister(NpmListerBase):
         """
         models_filtered = []
         for model in models_list:
-            package_name = model['name']
+            package_name = model["name"]
             # document related to CouchDB internals
-            if package_name.startswith('_design/'):
+            if package_name.startswith("_design/"):
                 continue
             models_filtered.append(model)
         return models_filtered
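The pagination trick in NpmLister above is easy to miss: the constructor bumps per_page to per_page + 1 and bakes that into PATH_TEMPLATE, so every CouchDB request fetches one surplus row. When a full page comes back, the surplus row's id seeds the next request's startkey and is dropped from the simplified results, so no package is listed twice. A minimal sketch of the idiom, assuming a hypothetical fetch_page(startkey, limit) transport helper (not part of this codebase):

    def iter_all_docs(fetch_page, per_page=1000):
        limit = per_page + 1  # request one extra row per page
        startkey = ""
        while True:
            rows = fetch_page(startkey=startkey, limit=limit)
            if len(rows) == limit:
                # Full page: the surplus row becomes the next startkey
                # and is skipped here so it is not emitted twice.
                startkey = rows[-1]["id"]
                yield from rows[:-1]
            else:
                # Short page: the listing is exhausted.
                yield from rows
                return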
diff --git a/swh/lister/npm/models.py b/swh/lister/npm/models.py
index 5eb8d0d..08f7d6e 100644
--- a/swh/lister/npm/models.py
+++ b/swh/lister/npm/models.py
@@ -11,9 +11,10 @@ class NpmVisitModel(SQLBase, metaclass=ABCSQLMeta):
     """Table to store the npm registry state at the time of a
     content listing by Software Heritage
     """
-    __tablename__ = 'npm_visit'
-    uid = Column(Integer, Sequence('npm_visit_id_seq'), primary_key=True)
+    __tablename__ = "npm_visit"
+
+    uid = Column(Integer, Sequence("npm_visit_id_seq"), primary_key=True)
     visit_date = Column(DateTime, nullable=False)
     doc_count = Column(BigInteger)
     doc_del_count = Column(BigInteger)
@@ -29,7 +30,8 @@ class NpmModel(IndexingModelBase):
     """A npm package representation

     """
-    __tablename__ = 'npm_repo'
+
+    __tablename__ = "npm_repo"

     uid = Column(String, primary_key=True)
     indexable = Column(String, index=True)
diff --git a/swh/lister/npm/tasks.py b/swh/lister/npm/tasks.py
index 1e4a51c..b0d06d4 100644
--- a/swh/lister/npm/tasks.py
+++ b/swh/lister/npm/tasks.py
@@ -13,15 +13,22 @@ from swh.lister.npm.models import NpmVisitModel

 @contextmanager
 def save_registry_state(lister):
-    params = {'headers': lister.request_headers()}
+    params = {"headers": lister.request_headers()}
     registry_state = lister.session.get(lister.url, **params)
     registry_state = registry_state.json()
-    keys = ('doc_count', 'doc_del_count', 'update_seq', 'purge_seq',
-            'disk_size', 'data_size', 'committed_update_seq',
-            'compacted_seq')
+    keys = (
+        "doc_count",
+        "doc_del_count",
+        "update_seq",
+        "purge_seq",
+        "disk_size",
+        "data_size",
+        "committed_update_seq",
+        "compacted_seq",
+    )

     state = {key: registry_state[key] for key in keys}
-    state['visit_date'] = datetime.now()
+    state["visit_date"] = datetime.now()
     yield
     npm_visit = NpmVisitModel(**state)
     lister.db_session.add(npm_visit)
@@ -34,29 +41,31 @@ def get_last_update_seq(lister):
     query = lister.db_session.query(NpmVisitModel.update_seq)
     row = query.order_by(NpmVisitModel.uid.desc()).first()
     if not row:
-        raise ValueError('No npm registry listing previously performed ! '
-                         'This is required prior to the execution of an '
-                         'incremental listing.')
+        raise ValueError(
+            "No npm registry listing previously performed ! "
+            "This is required prior to the execution of an "
+            "incremental listing."
+        )
     return row[0]


-@shared_task(name=__name__ + '.NpmListerTask')
+@shared_task(name=__name__ + ".NpmListerTask")
 def list_npm_full(**lister_args):
-    'Full lister for the npm (javascript) registry'
+    "Full lister for the npm (javascript) registry"
     lister = NpmLister(**lister_args)
     with save_registry_state(lister):
         return lister.run()


-@shared_task(name=__name__ + '.NpmIncrementalListerTask')
+@shared_task(name=__name__ + ".NpmIncrementalListerTask")
 def list_npm_incremental(**lister_args):
-    'Incremental lister for the npm (javascript) registry'
+    "Incremental lister for the npm (javascript) registry"
     lister = NpmIncrementalLister(**lister_args)
     update_seq_start = get_last_update_seq(lister)
     with save_registry_state(lister):
         return lister.run(min_bound=update_seq_start)


-@shared_task(name=__name__ + '.ping')
+@shared_task(name=__name__ + ".ping")
 def _ping():
-    return 'OK'
+    return "OK"
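One subtlety in save_registry_state above: everything after the yield only runs when the wrapped listing finishes without raising, so a failed run stores no NpmVisitModel row. The shape of the pattern, reduced to a sketch with hypothetical fetch_state/persist callables:

    from contextlib import contextmanager

    @contextmanager
    def snapshot_around(fetch_state, persist):
        state = fetch_state()  # capture the registry state before the run
        yield                  # the lister.run() call executes here
        persist(state)         # skipped entirely if the body raised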
diff --git a/swh/lister/npm/tests/conftest.py b/swh/lister/npm/tests/conftest.py
index a7f2433..bfa555f 100644
--- a/swh/lister/npm/tests/conftest.py
+++ b/swh/lister/npm/tests/conftest.py
@@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import *  # noqa

 @pytest.fixture
 def lister_npm(swh_listers):
-    lister = swh_listers['npm']
+    lister = swh_listers["npm"]

     # Add the load-npm task type in the scheduler backend
-    lister.scheduler.create_task_type({
-        'type': 'load-npm',
-        'description': 'Load npm package',
-        'backend_name': 'swh.loader.package.tasks.LoadNpm',
-        'default_interval': '1 day',
-    })
+    lister.scheduler.create_task_type(
+        {
+            "type": "load-npm",
+            "description": "Load npm package",
+            "backend_name": "swh.loader.package.tasks.LoadNpm",
+            "default_interval": "1 day",
+        }
+    )

     return lister
diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py
index 2a7ed8d..5a28a6d 100644
--- a/swh/lister/npm/tests/test_lister.py
+++ b/swh/lister/npm/tests/test_lister.py
@@ -21,10 +21,10 @@ logger = logging.getLogger(__name__)
 class NpmListerTester(HttpListerTesterBase, unittest.TestCase):
     Lister = NpmLister
     test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*')
-    lister_subdir = 'npm'
-    good_api_response_file = 'data/replicate.npmjs.com/api_response.json'
-    bad_api_response_file = 'data/api_empty_response.json'
-    first_index = 'jquery'
+    lister_subdir = "npm"
+    good_api_response_file = "data/replicate.npmjs.com/api_response.json"
+    bad_api_response_file = "data/api_empty_response.json"
+    first_index = "jquery"
     entries_per_page = 100

     @requests_mock.Mocker()
@@ -37,11 +37,11 @@ class NpmListerTester(HttpListerTesterBase, unittest.TestCase):

 class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase):
     Lister = NpmIncrementalLister
-    test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*')
-    lister_subdir = 'npm'
-    good_api_response_file = 'data/api_inc_response.json'
-    bad_api_response_file = 'data/api_inc_empty_response.json'
-    first_index = '6920642'
+    test_re = re.compile(r"^.*/_changes\?since=([0-9]+).*")
+    lister_subdir = "npm"
+    good_api_response_file = "data/api_inc_response.json"
+    bad_api_response_file = "data/api_inc_empty_response.json"
+    first_index = "6920642"
     entries_per_page = 100

     @requests_mock.Mocker()
@@ -58,27 +58,27 @@ def check_tasks(tasks: List[Any]):

     """
     for row in tasks:
-        logger.debug('row: %s', row)
-        assert row['type'] == 'load-npm'
+        logger.debug("row: %s", row)
+        assert row["type"] == "load-npm"
         # arguments check
-        args = row['arguments']['args']
+        args = row["arguments"]["args"]
         assert len(args) == 0

         # kwargs
-        kwargs = row['arguments']['kwargs']
+        kwargs = row["arguments"]["kwargs"]
         assert len(kwargs) == 1
-        package_url = kwargs['url']
-        package_name = package_url.split('/')[-1]
-        assert package_url == f'https://www.npmjs.com/package/{package_name}'
+        package_url = kwargs["url"]
+        package_name = package_url.split("/")[-1]
+        assert package_url == f"https://www.npmjs.com/package/{package_name}"

-        assert row['policy'] == 'recurring'
-        assert row['priority'] is None
+        assert row["policy"] == "recurring"
+        assert row["priority"] is None


 def test_lister_npm_basic_listing(lister_npm, requests_mock_datadir):
     lister_npm.run()

-    tasks = lister_npm.scheduler.search_tasks(task_type='load-npm')
+    tasks = lister_npm.scheduler.search_tasks(task_type="load-npm")
     assert len(tasks) == 100

     check_tasks(tasks)
@@ -89,10 +89,11 @@ def test_lister_npm_listing_pagination(lister_npm, requests_mock_datadir):
     # Patch per page pagination
     lister.per_page = 10 + 1
     lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace(
-        '&limit=1001', '&limit=%s' % lister.per_page)
+        "&limit=1001", "&limit=%s" % lister.per_page
+    )
     lister.run()

-    tasks = lister.scheduler.search_tasks(task_type='load-npm')
+    tasks = lister.scheduler.search_tasks(task_type="load-npm")
     assert len(tasks) == 2 * 10  # only 2 files with 10 results each

     check_tasks(tasks)
diff --git a/swh/lister/npm/tests/test_tasks.py b/swh/lister/npm/tests/test_tasks.py
index 491374f..382e557 100644
--- a/swh/lister/npm/tests/test_tasks.py
+++ b/swh/lister/npm/tests/test_tasks.py
@@ -8,23 +8,22 @@ def mock_save(lister):


 def test_ping(swh_app, celery_session_worker):
-    res = swh_app.send_task(
-        'swh.lister.npm.tasks.ping')
+    res = swh_app.send_task("swh.lister.npm.tasks.ping")
     assert res
     res.wait()
     assert res.successful()
-    assert res.result == 'OK'
+    assert res.result == "OK"


-@patch('swh.lister.npm.tasks.save_registry_state')
-@patch('swh.lister.npm.tasks.NpmLister')
+@patch("swh.lister.npm.tasks.save_registry_state")
+@patch("swh.lister.npm.tasks.NpmLister")
 def test_lister(lister, save, swh_app, celery_session_worker):
     # setup the mocked NpmLister
     lister.return_value = lister
     lister.run.return_value = None
     save.side_effect = mock_save

-    res = swh_app.send_task('swh.lister.npm.tasks.NpmListerTask')
+    res = swh_app.send_task("swh.lister.npm.tasks.NpmListerTask")
     assert res
     res.wait()
     assert res.successful()
@@ -33,9 +32,9 @@ def test_lister(lister, save, swh_app, celery_session_worker):
     lister.run.assert_called_once_with()


-@patch('swh.lister.npm.tasks.save_registry_state')
-@patch('swh.lister.npm.tasks.get_last_update_seq')
-@patch('swh.lister.npm.tasks.NpmIncrementalLister')
+@patch("swh.lister.npm.tasks.save_registry_state")
+@patch("swh.lister.npm.tasks.get_last_update_seq")
+@patch("swh.lister.npm.tasks.NpmIncrementalLister")
 def test_incremental(lister, seq, save, swh_app, celery_session_worker):
     # setup the mocked NpmLister
     lister.return_value = lister
@@ -43,8 +42,7 @@ def test_incremental(lister, seq, save, swh_app, celery_session_worker):
     seq.return_value = 42
     save.side_effect = mock_save

-    res = swh_app.send_task(
-        'swh.lister.npm.tasks.NpmIncrementalListerTask')
+    res = swh_app.send_task("swh.lister.npm.tasks.NpmIncrementalListerTask")
     assert res
     res.wait()
     assert res.successful()
diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py
index 4060cf2..a97ede1 100644
--- a/swh/lister/packagist/__init__.py
+++ b/swh/lister/packagist/__init__.py
@@ -7,7 +7,8 @@ def register():
     from .models import PackagistModel
     from .lister import PackagistLister

-    return {'models': [PackagistModel],
-            'lister': PackagistLister,
-            'task_modules': ['%s.tasks' % __name__],
-            }
+    return {
+        "models": [PackagistModel],
+        "lister": PackagistLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py
index 98e72f3..e7b9709 100644
--- a/swh/lister/packagist/lister.py
+++ b/swh/lister/packagist/lister.py
@@ -23,7 +23,7 @@ def compute_package_url(repo_name: str) -> str:
     """Compute packagist package url from repo name.

     """
-    return 'https://repo.packagist.org/p/%s.json' % repo_name
+    return "https://repo.packagist.org/p/%s.json" % repo_name


 class PackagistLister(ListerOnePageApiTransport, SimpleLister):
@@ -52,17 +52,19 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):
         'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json'

     """
+
     MODEL = PackagistModel
-    LISTER_NAME = 'packagist'
-    PAGE = 'https://packagist.org/packages/list.json'
-    instance = 'packagist'
+    LISTER_NAME = "packagist"
+    PAGE = "https://packagist.org/packages/list.json"
+    instance = "packagist"

     def __init__(self, override_config=None):
-        ListerOnePageApiTransport .__init__(self)
+        ListerOnePageApiTransport.__init__(self)
         SimpleLister.__init__(self, override_config=override_config)

-    def task_dict(self, origin_type: str, origin_url: str,
-                  **kwargs: Mapping[str, str]) -> Dict[str, Any]:
+    def task_dict(
+        self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str]
+    ) -> Dict[str, Any]:
         """Return task format dict

         This is overridden from the lister_base as more information is
@@ -70,18 +72,20 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):

         """
         return utils.create_task_dict(
-            'load-%s' % origin_type,
-            kwargs.get('policy', 'recurring'),
-            kwargs.get('name'), origin_url,
-            retries_left=3)
+            "load-%s" % origin_type,
+            kwargs.get("policy", "recurring"),
+            kwargs.get("name"),
+            origin_url,
+            retries_left=3,
+        )

     def list_packages(self, response: Any) -> List[str]:
         """List the actual packagist origins from the response.

         """
         response = json.loads(response.text)
-        packages = [name for name in response['packageNames']]
-        logger.debug('Number of packages: %s', len(packages))
+        packages = [name for name in response["packageNames"]]
+        logger.debug("Number of packages: %s", len(packages))
         random.shuffle(packages)
         return packages

@@ -91,10 +95,10 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):

         """
         url = compute_package_url(repo_name)
         return {
-            'uid': repo_name,
-            'name': repo_name,
-            'full_name': repo_name,
-            'html_url': url,
-            'origin_url': url,
-            'origin_type': 'packagist',
+            "uid": repo_name,
+            "name": repo_name,
+            "full_name": repo_name,
+            "html_url": url,
+            "origin_url": url,
+            "origin_type": "packagist",
         }
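For reference, the mapping compute_package_url implements, shown doctest-style with the package name already used in the class docstring (the same behaviour is pinned by test_compute_package_url further down):

    >>> compute_package_url("hypejunction/hypegamemechanics")
    'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json'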
diff --git a/swh/lister/packagist/models.py b/swh/lister/packagist/models.py
index 36a6333..268f884 100644
--- a/swh/lister/packagist/models.py
+++ b/swh/lister/packagist/models.py
@@ -11,6 +11,7 @@ class PackagistModel(ModelBase):
     """a Packagist repository representation

     """
-    __tablename__ = 'packagist_repo'
+
+    __tablename__ = "packagist_repo"

     uid = Column(String, primary_key=True)
diff --git a/swh/lister/packagist/tasks.py b/swh/lister/packagist/tasks.py
index 146ebe2..6f6087b 100644
--- a/swh/lister/packagist/tasks.py
+++ b/swh/lister/packagist/tasks.py
@@ -7,12 +7,12 @@ from celery import shared_task
 from .lister import PackagistLister


-@shared_task(name=__name__ + '.PackagistListerTask')
+@shared_task(name=__name__ + ".PackagistListerTask")
 def list_packagist(**lister_args):
-    'List the packagist (php) registry'
+    "List the packagist (php) registry"
     PackagistLister(**lister_args).run()


-@shared_task(name=__name__ + '.ping')
+@shared_task(name=__name__ + ".ping")
 def _ping():
-    return 'OK'
+    return "OK"
diff --git a/swh/lister/packagist/tests/conftest.py b/swh/lister/packagist/tests/conftest.py
index fe31517..1eafc36 100644
--- a/swh/lister/packagist/tests/conftest.py
+++ b/swh/lister/packagist/tests/conftest.py
@@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import *  # noqa

 @pytest.fixture
 def lister_packagist(swh_listers):
-    lister = swh_listers['packagist']
+    lister = swh_listers["packagist"]

     # Amend the scheduler with the as yet unknown load-packagist task type
-    lister.scheduler.create_task_type({
-        'type': 'load-packagist',
-        'description': 'Load packagist origin',
-        'backend_name': 'swh.loader.package.tasks.LoaderPackagist',
-        'default_interval': '1 day',
-    })
+    lister.scheduler.create_task_type(
+        {
+            "type": "load-packagist",
+            "description": "Load packagist origin",
+            "backend_name": "swh.loader.package.tasks.LoaderPackagist",
+            "default_interval": "1 day",
+        }
+    )

     return lister
diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py
index 869e6c2..3bfff49 100644
--- a/swh/lister/packagist/tests/test_lister.py
+++ b/swh/lister/packagist/tests/test_lister.py
@@ -12,27 +12,29 @@ from swh.lister.packagist.lister import PackagistLister, compute_package_url
 from swh.lister.core.tests.test_lister import HttpSimpleListerTester

-expected_packages = ['0.0.0/composer-include-files', '0.0.0/laravel-env-shim',
-                     '0.0.1/try-make-package', '0099ff/dialogflowphp',
-                     '00f100/array_dot']
+expected_packages = [
+    "0.0.0/composer-include-files",
+    "0.0.0/laravel-env-shim",
+    "0.0.1/try-make-package",
+    "0099ff/dialogflowphp",
+    "00f100/array_dot",
+]

 expected_model = {
-    'uid': '0099ff/dialogflowphp',
-    'name': '0099ff/dialogflowphp',
-    'full_name': '0099ff/dialogflowphp',
-    'html_url':
-        'https://repo.packagist.org/p/0099ff/dialogflowphp.json',
-    'origin_url':
-        'https://repo.packagist.org/p/0099ff/dialogflowphp.json',
-    'origin_type': 'packagist',
-    }
+    "uid": "0099ff/dialogflowphp",
+    "name": "0099ff/dialogflowphp",
+    "full_name": "0099ff/dialogflowphp",
+    "html_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
+    "origin_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
+    "origin_type": "packagist",
+}


 class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase):
     Lister = PackagistLister
-    PAGE = 'https://packagist.org/packages/list.json'
-    lister_subdir = 'packagist'
-    good_api_response_file = 'data/https_packagist.org/packages_list.json'
+    PAGE = "https://packagist.org/packages/list.json"
+    lister_subdir = "packagist"
+    good_api_response_file = "data/https_packagist.org/packages_list.json"
     entries = 5

     @requests_mock.Mocker()
@@ -52,40 +54,41 @@ class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase):

         """
         fl = self.get_fl()
-        model = fl.transport_response_simplified(['0099ff/dialogflowphp'])
+        model = fl.transport_response_simplified(["0099ff/dialogflowphp"])
         assert len(model) == 1
         for key, values in model[0].items():
             assert values == expected_model[key]

-    @patch('swh.lister.packagist.lister.utils.create_task_dict')
+    @patch("swh.lister.packagist.lister.utils.create_task_dict")
     def test_task_dict(self, mock_create_tasks):
         """Test the task creation of lister

         """
         fl = self.get_fl()
-        fl.task_dict(origin_type='packagist', origin_url='https://abc',
-                     name='test_pack')
+        fl.task_dict(
+            origin_type="packagist", origin_url="https://abc", name="test_pack"
+        )
         mock_create_tasks.assert_called_once_with(
-            'load-packagist', 'recurring', 'test_pack', 'https://abc',
-            retries_left=3)
+            "load-packagist", "recurring", "test_pack", "https://abc", retries_left=3
+        )


 def test_compute_package_url():
-    expected_url = 'https://repo.packagist.org/p/hello.json'
-    actual_url = compute_package_url('hello')
+    expected_url = "https://repo.packagist.org/p/hello.json"
+    actual_url = compute_package_url("hello")
     assert actual_url == expected_url


 def test_packagist_lister(lister_packagist, requests_mock_datadir):
     lister_packagist.run()

-    r = lister_packagist.scheduler.search_tasks(task_type='load-packagist')
+    r = lister_packagist.scheduler.search_tasks(task_type="load-packagist")
     assert len(r) == 5

     for row in r:
-        assert row['type'] == 'load-packagist'
+        assert row["type"] == "load-packagist"
         # arguments check
-        args = row['arguments']['args']
+        args = row["arguments"]["args"]
         assert len(args) == 2

         package = args[0]
@@ -95,8 +98,8 @@ def test_packagist_lister(lister_packagist, requests_mock_datadir):
         assert url == expected_url

         # kwargs
-        kwargs = row['arguments']['kwargs']
+        kwargs = row["arguments"]["kwargs"]
         assert kwargs == {}

-        assert row['policy'] == 'recurring'
-        assert row['priority'] is None
+        assert row["policy"] == "recurring"
+        assert row["priority"] is None
diff --git a/swh/lister/packagist/tests/test_tasks.py b/swh/lister/packagist/tests/test_tasks.py
index cbe807d..7c89b5b 100644
--- a/swh/lister/packagist/tests/test_tasks.py
+++ b/swh/lister/packagist/tests/test_tasks.py
@@ -6,22 +6,20 @@ from unittest.mock import patch


 def test_ping(swh_app, celery_session_worker):
-    res = swh_app.send_task(
-        'swh.lister.packagist.tasks.ping')
+    res = swh_app.send_task("swh.lister.packagist.tasks.ping")
     assert res
     res.wait()
     assert res.successful()
-    assert res.result == 'OK'
+    assert res.result == "OK"


-@patch('swh.lister.packagist.tasks.PackagistLister')
+@patch("swh.lister.packagist.tasks.PackagistLister")
 def test_lister(lister, swh_app, celery_session_worker):
     # setup the mocked PackagistLister
     lister.return_value = lister
     lister.run.return_value = None

-    res = swh_app.send_task(
-        'swh.lister.packagist.tasks.PackagistListerTask')
+    res = swh_app.send_task("swh.lister.packagist.tasks.PackagistListerTask")
     assert res
     res.wait()
     assert res.successful()
diff --git a/swh/lister/phabricator/__init__.py b/swh/lister/phabricator/__init__.py
index aeaee0a..3f5ff29 100644
--- a/swh/lister/phabricator/__init__.py
+++ b/swh/lister/phabricator/__init__.py
@@ -7,7 +7,8 @@ def register():
     from .models import PhabricatorModel
     from .lister import PhabricatorLister

-    return {'models': [PhabricatorModel],
-            'lister': PhabricatorLister,
-            'task_modules': ['%s.tasks' % __name__],
-            }
+    return {
+        "models": [PhabricatorModel],
+        "lister": PhabricatorLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
index bfb4a95..89487ae 100644
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -21,11 +21,10 @@ logger = logging.getLogger(__name__)

 class PhabricatorLister(IndexingHttpLister):
-    PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s'
-    DEFAULT_URL = \
-        'https://forge.softwareheritage.org/api/diffusion.repository.search'
+    PATH_TEMPLATE = "?order=oldest&attachments[uris]=1&after=%s"
+    DEFAULT_URL = "https://forge.softwareheritage.org/api/diffusion.repository.search"
     MODEL = PhabricatorModel
-    LISTER_NAME = 'phabricator'
+    LISTER_NAME = "phabricator"

     def __init__(self, url=None, instance=None, override_config=None):
         super().__init__(url=url, override_config=override_config)
@@ -48,11 +47,14 @@ class PhabricatorLister(IndexingHttpLister):
         creds = self.request_instance_credentials()
         if not creds:
             raise ValueError(
-                'Phabricator forge needs authentication credential to list.')
-        api_token = random.choice(creds)['password']
+                "Phabricator forge needs authentication credential to list."
+            )
+        api_token = random.choice(creds)["password"]

-        return {'headers': self.request_headers() or {},
-                'params': {'api.token': api_token}}
+        return {
+            "headers": self.request_headers() or {},
+            "params": {"api.token": api_token},
+        }

     def request_headers(self):
         """
@@ -60,39 +62,39 @@ class PhabricatorLister(IndexingHttpLister):
         Phabricator API

         """
         headers = super().request_headers()
-        headers['Accept'] = 'application/json'
+        headers["Accept"] = "application/json"
         return headers

-    def get_model_from_repo(
-            self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-        url = get_repo_url(repo['attachments']['uris']['uris'])
+    def get_model_from_repo(self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        url = get_repo_url(repo["attachments"]["uris"]["uris"])
         if url is None:
             return None
         return {
-            'uid': url,
-            'indexable': repo['id'],
-            'name': repo['fields']['shortName'],
-            'full_name': repo['fields']['name'],
-            'html_url': url,
-            'origin_url': url,
-            'origin_type': repo['fields']['vcs'],
-            'instance': self.instance,
+            "uid": url,
+            "indexable": repo["id"],
+            "name": repo["fields"]["shortName"],
+            "full_name": repo["fields"]["name"],
+            "html_url": url,
+            "origin_url": url,
+            "origin_type": repo["fields"]["vcs"],
+            "instance": self.instance,
         }

-    def get_next_target_from_response(
-            self, response: Response) -> Optional[int]:
-        body = response.json()['result']['cursor']
-        if body['after'] and body['after'] != 'null':
-            return int(body['after'])
+    def get_next_target_from_response(self, response: Response) -> Optional[int]:
+        body = response.json()["result"]["cursor"]
+        if body["after"] and body["after"] != "null":
+            return int(body["after"])
         return None

     def transport_response_simplified(
-            self, response: Response) -> List[Optional[Dict[str, Any]]]:
+        self, response: Response
+    ) -> List[Optional[Dict[str, Any]]]:
         repos = response.json()
-        if repos['result'] is None:
+        if repos["result"] is None:
             raise ValueError(
-                'Problem during information fetch: %s' % repos['error_code'])
-        repos = repos['result']['data']
+                "Problem during information fetch: %s" % repos["error_code"]
+            )
+        repos = repos["result"]["data"]
         return [self.get_model_from_repo(repo) for repo in repos]

     def filter_before_inject(self, models_list):
@@ -103,8 +105,7 @@ class PhabricatorLister(IndexingHttpLister):
         models_list = [m for m in models_list if m is not None]
         return super().filter_before_inject(models_list)

-    def disable_deleted_repo_tasks(
-            self, index: int, next_index: int, keep_these: str):
+    def disable_deleted_repo_tasks(self, index: int, next_index: int, keep_these: str):
         """
         (Overrides) Fix provided index value to avoid:

@@ -113,7 +114,7 @@ class PhabricatorLister(IndexingHttpLister):
         """
         # First call to the Phabricator API uses an empty 'after' parameter,
         # so set the index to 0 to avoid database query error
-        if index == '':
+        if index == "":
             index = 0
         # Next listed repository ids are strictly greater than the 'after'
         # parameter, so increment the index to avoid disabling the latest
@@ -121,8 +122,7 @@ class PhabricatorLister(IndexingHttpLister):
         # the Phabricator API
         else:
             index = index + 1
-        return super().disable_deleted_repo_tasks(index, next_index,
-                                                  keep_these)
+        return super().disable_deleted_repo_tasks(index, next_index, keep_these)

     def db_first_index(self) -> Optional[int]:
         """
@@ -172,19 +172,18 @@ def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[int]:
     """
     processed_urls = defaultdict(dict)  # type: Dict[str, Any]
     for uri in attachments:
-        protocol = uri['fields']['builtin']['protocol']
-        url = uri['fields']['uri']['effective']
-        identifier = uri['fields']['builtin']['identifier']
-        if protocol in ('http', 'https'):
+        protocol = uri["fields"]["builtin"]["protocol"]
+        url = uri["fields"]["uri"]["effective"]
+        identifier = uri["fields"]["builtin"]["identifier"]
+        if protocol in ("http", "https"):
             processed_urls[protocol][identifier] = url
         elif protocol is None:
-            for protocol in ('https', 'http'):
+            for protocol in ("https", "http"):
                 if url.startswith(protocol):
-                    processed_urls[protocol]['undefined'] = url
+                    processed_urls[protocol]["undefined"] = url
                     break
-    for protocol in ['https', 'http']:
-        for identifier in ['shortname', 'callsign', 'id', 'undefined']:
-            if (protocol in processed_urls and
-                    identifier in processed_urls[protocol]):
+    for protocol in ["https", "http"]:
+        for identifier in ["shortname", "callsign", "id", "undefined"]:
+            if protocol in processed_urls and identifier in processed_urls[protocol]:
                 return processed_urls[protocol][identifier]
     return None
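The lookup order in get_repo_url above encodes two preferences: https URIs win over http, and within one protocol a shortname identifier beats callsign, then id, then undefined. A toy illustration (the forge host and repository names are made up):

    attachments = [
        {"fields": {"builtin": {"protocol": "http", "identifier": "callsign"},
                    "uri": {"effective": "http://forge.example.org/diffusion/TEST/"}}},
        {"fields": {"builtin": {"protocol": "https", "identifier": "shortname"},
                    "uri": {"effective": "https://forge.example.org/source/test.git"}}},
    ]
    assert get_repo_url(attachments) == "https://forge.example.org/source/test.git"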
diff --git a/swh/lister/phabricator/models.py b/swh/lister/phabricator/models.py
index 96cc497..676be83 100644
--- a/swh/lister/phabricator/models.py
+++ b/swh/lister/phabricator/models.py
@@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase

 class PhabricatorModel(IndexingModelBase):
     """a Phabricator repository"""
-    __tablename__ = 'phabricator_repo'
+
+    __tablename__ = "phabricator_repo"

     uid = Column(String, primary_key=True)
     indexable = Column(Integer, index=True)
diff --git a/swh/lister/phabricator/tasks.py b/swh/lister/phabricator/tasks.py
index 614f4f2..69e562c 100644
--- a/swh/lister/phabricator/tasks.py
+++ b/swh/lister/phabricator/tasks.py
@@ -7,12 +7,12 @@ from celery import shared_task
 from swh.lister.phabricator.lister import PhabricatorLister


-@shared_task(name=__name__ + '.FullPhabricatorLister')
+@shared_task(name=__name__ + ".FullPhabricatorLister")
 def list_phabricator_full(**lister_args):
     """Full update of a Phabricator instance"""
     return PhabricatorLister(**lister_args).run()


-@shared_task(name=__name__ + '.ping')
+@shared_task(name=__name__ + ".ping")
 def _ping():
-    return 'OK'
+    return "OK"
diff --git a/swh/lister/phabricator/tests/conftest.py b/swh/lister/phabricator/tests/conftest.py
index 22de766..2713ce9 100644
--- a/swh/lister/phabricator/tests/conftest.py
+++ b/swh/lister/phabricator/tests/conftest.py
@@ -10,17 +10,12 @@ from swh.lister.core.tests.conftest import *  # noqa

 @pytest.fixture
 def lister_phabricator(swh_listers):
-    lister = swh_listers['phabricator']
+    lister = swh_listers["phabricator"]

     # Amend the credentials
     lister.config = {
-        'cache_responses': False,
-        'credentials': {
-            'phabricator': {
-                lister.instance: [{
-                    'password': 'foo'
-                }]
-            }}
+        "cache_responses": False,
+        "credentials": {"phabricator": {lister.instance: [{"password": "foo"}]}},
     }
     return lister
diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py
index dcf76c0..6b95af0 100644
--- a/swh/lister/phabricator/tests/test_lister.py
+++ b/swh/lister/phabricator/tests/test_lister.py
@@ -21,12 +21,11 @@ logger = logging.getLogger(__name__)
 class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
     Lister = PhabricatorLister
     # first request will have the after parameter empty
-    test_re = re.compile(r'\&after=([^?&]*)')
-    lister_subdir = 'phabricator'
-    good_api_response_file = 'data/api_first_response.json'
-    good_api_response_undefined_protocol = \
-        'data/api_response_undefined_protocol.json'
-    bad_api_response_file = 'data/api_empty_response.json'
+    test_re = re.compile(r"\&after=([^?&]*)")
+    lister_subdir = "phabricator"
+    good_api_response_file = "data/api_first_response.json"
+    good_api_response_undefined_protocol = "data/api_response_undefined_protocol.json"
+    bad_api_response_file = "data/api_empty_response.json"
     # first_index must be retrieved through a bootstrap process for Phabricator
     first_index = None
     last_index = 12
@@ -40,7 +39,7 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
         """
         m = self.test_re.search(request.path_url)
         idx = m.group(1)
-        if idx not in ('', 'None'):
+        if idx not in ("", "None"):
             return int(idx)

     def get_fl(self, override_config=None):
@@ -48,41 +47,42 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):

         """
         if override_config or self.fl is None:
-            credentials = {'phabricator': {'fake': [
-                {'password': 'toto'}
-            ]}}
-            override_config = dict(credentials=credentials,
-                                   **(override_config or {}))
-            self.fl = self.Lister(url='https://fakeurl', instance='fake',
-                                  override_config=override_config)
+            credentials = {"phabricator": {"fake": [{"password": "toto"}]}}
+            override_config = dict(credentials=credentials, **(override_config or {}))
+            self.fl = self.Lister(
+                url="https://fakeurl", instance="fake", override_config=override_config
+            )
             self.fl.INITIAL_BACKOFF = 1

         self.fl.reset_backoff()
         return self.fl

     def test_get_repo_url(self):
-        f = open('swh/lister/%s/tests/%s' % (self.lister_subdir,
-                                             self.good_api_response_file))
+        f = open(
+            "swh/lister/%s/tests/%s" % (self.lister_subdir, self.good_api_response_file)
+        )
         api_response = json.load(f)
-        repos = api_response['result']['data']
+        repos = api_response["result"]["data"]
         for repo in repos:
             self.assertEqual(
-                'https://forge.softwareheritage.org/source/%s.git' %
-                (repo['fields']['shortName']),
-                get_repo_url(repo['attachments']['uris']['uris']))
+                "https://forge.softwareheritage.org/source/%s.git"
+                % (repo["fields"]["shortName"]),
+                get_repo_url(repo["attachments"]["uris"]["uris"]),
+            )

-        f = open('swh/lister/%s/tests/%s' %
-                 (self.lister_subdir,
-                  self.good_api_response_undefined_protocol))
+        f = open(
+            "swh/lister/%s/tests/%s"
+            % (self.lister_subdir, self.good_api_response_undefined_protocol)
+        )
         repo = json.load(f)
         self.assertEqual(
-            'https://svn.blender.org/svnroot/bf-blender/',
-            get_repo_url(repo['attachments']['uris']['uris']))
+            "https://svn.blender.org/svnroot/bf-blender/",
+            get_repo_url(repo["attachments"]["uris"]["uris"]),
+        )

     @requests_mock.Mocker()
     def test_scheduled_tasks(self, http_mocker):
-        self.scheduled_tasks_test('data/api_next_response.json', 23,
-                                  http_mocker)
+        self.scheduled_tasks_test("data/api_next_response.json", 23, http_mocker)

     @requests_mock.Mocker()
     def test_scheduled_tasks_multiple_instances(self, http_mocker):
@@ -92,19 +92,14 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):

         # list first Phabricator instance
         fl.run()

-        fl.instance = 'other_fake'
-        fl.config['credentials'] = {
-            'phabricator': {
-                'other_fake': [{
-                    'password': 'foo'
-                }]
-            }
+        fl.instance = "other_fake"
+        fl.config["credentials"] = {
+            "phabricator": {"other_fake": [{"password": "foo"}]}
         }

         # list second Phabricator instance hosting repositories having
         # same ids as those listed from the first instance
-        self.good_api_response_file = \
-            'data/api_first_response_other_instance.json'
+        self.good_api_response_file = "data/api_first_response_other_instance.json"
         self.last_index = 13
         fl.run()

@@ -113,28 +108,28 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):

         # check tasks are not disabled
         for task in self.scheduler_tasks:
-            self.assertTrue(task['status'] != 'disabled')
+            self.assertTrue(task["status"] != "disabled")


 def test_phabricator_lister(lister_phabricator, requests_mock_datadir):
     lister = lister_phabricator
     assert lister.url == lister.DEFAULT_URL
-    assert lister.instance == 'forge.softwareheritage.org'
+    assert lister.instance == "forge.softwareheritage.org"
     lister.run()

-    r = lister.scheduler.search_tasks(task_type='load-git')
+    r = lister.scheduler.search_tasks(task_type="load-git")
     assert len(r) == 10

     for row in r:
-        assert row['type'] == 'load-git'
+        assert row["type"] == "load-git"
         # arguments check
-        args = row['arguments']['args']
+        args = row["arguments"]["args"]
         assert len(args) == 0

         # kwargs
-        kwargs = row['arguments']['kwargs']
-        url = kwargs['url']
+        kwargs = row["arguments"]["kwargs"]
+        url = kwargs["url"]
         assert lister.instance in url

-        assert row['policy'] == 'recurring'
-        assert row['priority'] is None
+        assert row["policy"] == "recurring"
+        assert row["priority"] is None
diff --git a/swh/lister/phabricator/tests/test_tasks.py b/swh/lister/phabricator/tests/test_tasks.py
index bf8f307..38e1686 100644
--- a/swh/lister/phabricator/tests/test_tasks.py
+++ b/swh/lister/phabricator/tests/test_tasks.py
@@ -4,9 +4,8 @@

 def test_ping(swh_app, celery_session_worker):
-    res = swh_app.send_task(
-        'swh.lister.phabricator.tasks.ping')
+    res = swh_app.send_task("swh.lister.phabricator.tasks.ping")
     assert res
     res.wait()
     assert res.successful()
-    assert res.result == 'OK'
+    assert res.result == "OK"
diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py
index 0f845c3..6266e58 100644
--- a/swh/lister/pypi/__init__.py
+++ b/swh/lister/pypi/__init__.py
@@ -7,7 +7,8 @@ def register():
     from .models import PyPIModel
     from .lister import PyPILister

-    return {'models': [PyPIModel],
-            'lister': PyPILister,
-            'task_modules': ['%s.tasks' % __name__],
-            }
+    return {
+        "models": [PyPIModel],
+        "lister": PyPILister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index 0f22ae0..e7223e7 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -18,12 +18,12 @@ from requests import Response

 class PyPILister(ListerOnePageApiTransport, SimpleLister):
     MODEL = PyPIModel
-    LISTER_NAME = 'pypi'
-    PAGE = 'https://pypi.org/simple/'
-    instance = 'pypi'  # As of today only the main pypi.org is used
+    LISTER_NAME = "pypi"
+    PAGE = "https://pypi.org/simple/"
+    instance = "pypi"  # As of today only the main pypi.org is used

     def __init__(self, override_config=None):
-        ListerOnePageApiTransport .__init__(self)
+        ListerOnePageApiTransport.__init__(self)
         SimpleLister.__init__(self, override_config=override_config)

     def task_dict(self, origin_type: str, origin_url: str, **kwargs):
@@ -33,17 +33,16 @@ class PyPILister(ListerOnePageApiTransport, SimpleLister):
         needed for the ingestion task creation.

         """
-        _type = 'load-%s' % origin_type
-        _policy = kwargs.get('policy', 'recurring')
-        return utils.create_task_dict(
-            _type, _policy, url=origin_url)
+        _type = "load-%s" % origin_type
+        _policy = kwargs.get("policy", "recurring")
+        return utils.create_task_dict(_type, _policy, url=origin_url)

     def list_packages(self, response: Response) -> list:
         """(Override) List the actual pypi origins from the response.

         """
         result = xmltodict.parse(response.content)
-        _packages = [p['#text'] for p in result['html']['body']['a']]
+        _packages = [p["#text"] for p in result["html"]["body"]["a"]]
         random.shuffle(_packages)
         return _packages

@@ -51,7 +50,7 @@ class PyPILister(ListerOnePageApiTransport, SimpleLister):
         """Returns origin_url

         """
-        return 'https://pypi.org/project/%s/' % repo_name
+        return "https://pypi.org/project/%s/" % repo_name

     def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]:
         """(Override) Transform from repository representation to model

         """
         origin_url = self.origin_url(repo_name)
         return {
-            'uid': origin_url,
-            'name': repo_name,
-            'full_name': repo_name,
-            'html_url': origin_url,
-            'origin_url': origin_url,
-            'origin_type': 'pypi',
+            "uid": origin_url,
+            "name": repo_name,
+            "full_name": repo_name,
+            "html_url": origin_url,
+            "origin_url": origin_url,
+            "origin_type": "pypi",
         }
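A note on list_packages above: PyPI's simple index is plain HTML with one &lt;a&gt; element per package, and xmltodict.parse maps repeated elements to a list of dicts whose text content sits under the "#text" key. A toy two-entry illustration (package names chosen arbitrarily; with a single anchor xmltodict would hand back a dict rather than a list, which the full index never triggers):

    import xmltodict

    page = b"<html><body><a href='/simple/lxml/'>lxml</a><a href='/simple/requests/'>requests</a></body></html>"
    result = xmltodict.parse(page)
    assert [p["#text"] for p in result["html"]["body"]["a"]] == ["lxml", "requests"]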
diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py
index f34eef9..a6ec5ff 100644
--- a/swh/lister/pypi/models.py
+++ b/swh/lister/pypi/models.py
@@ -11,6 +11,7 @@ class PyPIModel(ModelBase):
     """a PyPI repository representation

     """
-    __tablename__ = 'pypi_repo'
+
+    __tablename__ = "pypi_repo"

     uid = Column(String, primary_key=True)
diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py
index b59e6b0..a6ef7f3 100644
--- a/swh/lister/pypi/tasks.py
+++ b/swh/lister/pypi/tasks.py
@@ -7,12 +7,12 @@ from celery import shared_task
 from .lister import PyPILister


-@shared_task(name=__name__ + '.PyPIListerTask')
+@shared_task(name=__name__ + ".PyPIListerTask")
 def list_pypi(**lister_args):
-    'Full update of the PyPI (python) registry'
+    "Full update of the PyPI (python) registry"
     return PyPILister(**lister_args).run()


-@shared_task(name=__name__ + '.ping')
+@shared_task(name=__name__ + ".ping")
 def _ping():
-    return 'OK'
+    return "OK"
diff --git a/swh/lister/pypi/tests/conftest.py b/swh/lister/pypi/tests/conftest.py
index 50a4239..658fdcb 100644
--- a/swh/lister/pypi/tests/conftest.py
+++ b/swh/lister/pypi/tests/conftest.py
@@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import *  # noqa

 @pytest.fixture
 def lister_pypi(swh_listers):
-    lister = swh_listers['pypi']
+    lister = swh_listers["pypi"]

     # Add the load-pypi task type in the scheduler backend
-    lister.scheduler.create_task_type({
-        'type': 'load-pypi',
-        'description': 'Load PyPI package',
-        'backend_name': 'swh.loader.package.tasks.LoadPyPI',
-        'default_interval': '1 day',
-    })
+    lister.scheduler.create_task_type(
+        {
+            "type": "load-pypi",
+            "description": "Load PyPI package",
+            "backend_name": "swh.loader.package.tasks.LoadPyPI",
+            "default_interval": "1 day",
+        }
+    )

     return lister
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
index 6f7fc4d..6338130 100644
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -7,21 +7,21 @@
 def test_pypi_lister(lister_pypi, requests_mock_datadir):
     lister_pypi.run()

-    r = lister_pypi.scheduler.search_tasks(task_type='load-pypi')
+    r = lister_pypi.scheduler.search_tasks(task_type="load-pypi")
     assert len(r) == 4

     for row in r:
-        assert row['type'] == 'load-pypi'
+        assert row["type"] == "load-pypi"
         # arguments check
-        args = row['arguments']['args']
+        args = row["arguments"]["args"]
         assert len(args) == 0

         # kwargs
-        kwargs = row['arguments']['kwargs']
+        kwargs = row["arguments"]["kwargs"]
         assert len(kwargs) == 1
-        origin_url = kwargs['url']
-        assert 'https://pypi.org/project' in origin_url
+        origin_url = kwargs["url"]
+        assert "https://pypi.org/project" in origin_url

-        assert row['policy'] == 'recurring'
-        assert row['priority'] is None
+        assert row["policy"] == "recurring"
+        assert row["priority"] is None
diff --git a/swh/lister/pypi/tests/test_tasks.py b/swh/lister/pypi/tests/test_tasks.py
index ab7032b..89ffeac 100644
--- a/swh/lister/pypi/tests/test_tasks.py
+++ b/swh/lister/pypi/tests/test_tasks.py
@@ -2,22 +2,20 @@ from unittest.mock import patch


 def test_ping(swh_app, celery_session_worker):
-    res = swh_app.send_task(
-        'swh.lister.pypi.tasks.ping')
+    res = swh_app.send_task("swh.lister.pypi.tasks.ping")
     assert res
     res.wait()
     assert res.successful()
-    assert res.result == 'OK'
+    assert res.result == "OK"


-@patch('swh.lister.pypi.tasks.PyPILister')
+@patch("swh.lister.pypi.tasks.PyPILister")
 def test_lister(lister, swh_app, celery_session_worker):
     # setup the mocked PypiLister
     lister.return_value = lister
     lister.run.return_value = None

-    res = swh_app.send_task(
-        'swh.lister.pypi.tasks.PyPIListerTask')
+    res = swh_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
     assert res
     res.wait()
     assert res.successful()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
index 3224c81..bc59895 100644
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -15,7 +15,7 @@ from .test_utils import init_db
 def test_get_lister_wrong_input():
     """Unsupported lister should raise"""
     with pytest.raises(ValueError) as e:
-        get_lister('unknown', 'db-url')
+        get_lister("unknown", "db-url")

     assert "Invalid lister" in str(e.value)

@@ -37,23 +37,22 @@ def test_get_lister_override():
     db_url = init_db().url()

     listers = {
-        'gitlab': 'https://other.gitlab.uni/api/v4/',
-        'phabricator': 'https://somewhere.org/api/diffusion.repository.search',
-        'cgit': 'https://some.where/cgit',
+        "gitlab": "https://other.gitlab.uni/api/v4/",
+        "phabricator": "https://somewhere.org/api/diffusion.repository.search",
+        "cgit": "https://some.where/cgit",
     }

     # check the override ends up defined in the lister
     for lister_name, url in listers.items():
         lst = get_lister(
-            lister_name, db_url, **{
-                'url': url,
-                'priority': 'high',
-                'policy': 'oneshot',
-            })
+            lister_name,
+            db_url,
+            **{"url": url, "priority": "high", "policy": "oneshot",}
+        )

         assert lst.url == url
-        assert lst.config['priority'] == 'high'
-        assert lst.config['policy'] == 'oneshot'
+        assert lst.config["priority"] == "high"
+        assert lst.config["policy"] == "oneshot"

     # check the default urls are used and not the override (since it's not
     # passed)
@@ -61,7 +60,7 @@ def test_get_lister_override():
         lst = get_lister(lister_name, db_url)

         # no override so this does not end up in lister's configuration
-        assert 'url' not in lst.config
-        assert 'priority' not in lst.config
-        assert 'oneshot' not in lst.config
+        assert "url" not in lst.config
+        assert "priority" not in lst.config
+        assert "oneshot" not in lst.config
         assert lst.url == lst.DEFAULT_URL
diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py
index 1fe7e7a..05966ce 100644
--- a/swh/lister/tests/test_utils.py
+++ b/swh/lister/tests/test_utils.py
@@ -10,7 +10,6 @@ from swh.lister import utils


 class UtilsTest(unittest.TestCase):
-
     def test_split_range(self):
         actual_ranges = list(utils.split_range(14, 5))
         self.assertEqual(actual_ranges, [(0, 5), (5, 10), (10, 14)])
@@ -33,6 +32,6 @@ def init_db():
         db object to ease db manipulation

     """
-    initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args']
-    initdb_args = ' '.join([initdb_args, '-E UTF-8'])
+    initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"]
+    initdb_args = " ".join([initdb_args, "-E UTF-8"])
     return Postgresql(initdb_args=initdb_args)
diff --git a/tox.ini b/tox.ini
index 7e3f601..ef77275 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist=flake8,mypy,py3
+envlist=black,flake8,mypy,py3

 [testenv]
 extras =
@@ -13,6 +13,13 @@ commands =
     !dev: --cov={envsitepackagesdir}/swh/lister/ --cov-branch \
     {envsitepackagesdir}/swh/lister/ {posargs}

+[testenv:black]
+skip_install = true
+deps =
+  black
+commands =
+  {envpython} -m black --check swh
+
 [testenv:flake8]
 skip_install = true
 deps =