diff --git a/docs/tutorial.rst b/docs/tutorial.rst index eafafd9..5e1b552 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -374,7 +374,7 @@ More about listers See current implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_ ). -.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py -.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py -.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py -.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py +.. _GitHub: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/github/lister.py +.. _Bitbucket: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/bitbucket/lister.py +.. _CGit: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/cgit/lister.py +.. _GitLab: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/gitlab/lister.py diff --git a/pyproject.toml b/pyproject.toml index 255dfdc..fc243a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ testing = {file = ["requirements-test.txt"]} "lister.debian" = "swh.lister.debian:register" "lister.dlang" = "swh.lister.dlang:register" "lister.elm" = "swh.lister.elm:register" +"lister.f-droid" = "swh.lister.f_droid:register" "lister.gitea" = "swh.lister.gitea:register" "lister.github" = "swh.lister.github:register" "lister.gitiles" = "swh.lister.gitiles:register" diff --git a/requirements-swh.txt b/requirements-swh.txt index 074e961..76a8876 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db] >= 3.4.0 -swh.scheduler >= 2.7.0 +swh.core[db] >= 4.0.0 +swh.scheduler >= 3.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 4cd12d0..7d25c75 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,7 +4,7 @@ pandas-stubs pytest >= 8.1 pytest-mock requests_mock -swh-scheduler[testing] >= 2.7.0 +swh.scheduler[pytest] >= 3.1.0 types-beautifulsoup4 types-click types-dateparser diff --git a/requirements.txt b/requirements.txt index bd20daf..87e2168 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ launchpadlib looseversion lxml mercurial -psycopg2 +psycopg pyreadr python_debian repomd diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index 4d31083..7021419 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -1,18 +1,17 @@ -# Copyright (C) 2018-2022 The Software Heritage developers +# Copyright (C) 2018-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from importlib.metadata import PackageNotFoundError, entry_points, version import logging -import pkg_resources - logger = logging.getLogger(__name__) try: - __version__ = pkg_resources.get_distribution("swh.lister").version -except pkg_resources.DistributionNotFound: + __version__ = version("swh-lister") +except PackageNotFoundError: __version__ = "devel" USER_AGENT_TEMPLATE = ( @@ -22,7 +21,7 @@ USER_AGENT_TEMPLATE = ( LISTERS = { entry_point.name.split(".", 1)[1]: entry_point - for entry_point in pkg_resources.iter_entry_points("swh.workers") + for entry_point in entry_points().select(group="swh.workers") if entry_point.name.split(".", 1)[0] == "lister" } diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py index 2624112..b34561b 100644 --- a/swh/lister/cpan/__init__.py +++ b/swh/lister/cpan/__init__.py @@ -58,10 +58,10 @@ You can follow lister execution by displaying logs of swh-lister service:: .. _cpan.org: https://cpan.org/ .. _metacpan.org: https://metacpan.org/ .. _http api endpoint: https://explorer.metacpan.org/?url=/release/ -.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950 +.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints -""" +""" # noqa: B950 def register(): diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index fe8b56a..d94834c 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022-2024 The Software Heritage developers +# Copyright (C) 2022-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,6 +18,7 @@ import iso8601 from looseversion import LooseVersion2 from swh.core.utils import grouper +from swh.model.hashutil import HASH_BLOCK_SIZE from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -119,7 +120,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): # Download the Db dump with self.http_request(self.DB_DUMP_URL, stream=True) as res: with open(archive_path, "wb") as out_file: - for chunk in res.iter_content(chunk_size=1024): + for chunk in res.iter_content(chunk_size=HASH_BLOCK_SIZE): out_file.write(chunk) # Extract the Db dump diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 0b300f3..cb64f15 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -200,6 +200,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): sum_name = "md5sum" if field_ in src_pkg: for entry in src_pkg[field_]: + if "name" not in entry: + continue name = entry["name"] files[name]["name"] = name files[name]["size"] = int(entry["size"], 10) diff --git a/swh/lister/dlang/__init__.py b/swh/lister/dlang/__init__.py index 930f95f..fcb9fd1 100644 --- a/swh/lister/dlang/__init__.py +++ b/swh/lister/dlang/__init__.py @@ -62,7 +62,7 @@ You can follow lister execution by displaying logs of swh-lister service:: .. _Dlang: https://dlang.org/ .. _DUB: https://code.dlang.org/ -.. _http api endpoint: https://code.dlang.org/api/packages/dump" +.. _http api endpoint: https://code.dlang.org/api/packages/dump """ diff --git a/swh/lister/f_droid/__init__.py b/swh/lister/f_droid/__init__.py new file mode 100644 index 0000000..9f428de --- /dev/null +++ b/swh/lister/f_droid/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2019-2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import FDroidLister + + return { + "lister": FDroidLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/f_droid/lister.py b/swh/lister/f_droid/lister.py new file mode 100644 index 0000000..ace1e9b --- /dev/null +++ b/swh/lister/f_droid/lister.py @@ -0,0 +1,80 @@ +# Copyright (C) 2021-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +from typing import Any, Dict, Iterator, List + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + + +from ..pattern import CredentialsType, StatelessLister +from datetime import datetime + +logger = logging.getLogger(__name__) + +FDroidListerPage = List[Dict[str, Any]] + + +class FDroidLister(StatelessLister[FDroidListerPage]): + """List origins from the FDroid.""" + + LISTER_NAME = "f_droid" + INSTANCE = "f_droid" + VISIT_TYPE = "f_droid" + REPO_INDEX_V2_URL = "https://f-droid.org/repo/index-v2.json" + PACKAGE_URL_TEMPLATE = "https://f-droid.org/packages/{package_id}" + + def __init__( + self, + scheduler: SchedulerInterface, + url: str = REPO_INDEX_V2_URL, + instance: str = INSTANCE, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + url=url, + instance=instance, + ) + + self.session.headers.update({"Accept": "application/json"}) + + def get_pages(self) -> Iterator[FDroidListerPage]: + data = self.http_request(self.url).json() + + if "packages" not in data or not isinstance(data["packages"], dict): + raise RuntimeError( + f"Invalid response from {self.url}: missing 'packages' key" + ) + + packages: Dict[str, Any] = data["packages"] + return iter([[packages]]) + + def get_origins_from_page(self, page: FDroidListerPage) -> Iterator[ListedOrigin]: + """Convert a page of FDroidLister repositories into a list of ListedOrigins""" + assert self.lister_obj.id is not None + + for item in page: + for id, value in item.items(): + metadata = value.get("metaData", {}) + versions = value.get("versions", {}) + last_updated = metadata.get("lastUpdated", None) + if last_updated is not None: + last_updated = datetime.fromtimestamp(last_updated) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=self.PACKAGE_URL_TEMPLATE.format( + package_id=id, + ), + last_update=last_updated, + extra_loader_arguments={ + "metadata": metadata, + "versions": versions, + }, + ) diff --git a/swh/lister/f_droid/tasks.py b/swh/lister/f_droid/tasks.py new file mode 100644 index 0000000..575c4e9 --- /dev/null +++ b/swh/lister/f_droid/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2018-2023 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from .lister import FDroidLister + + +@shared_task(name=f"{__name__}.FDroidListerTask") +def list_pypi(**lister_args): + "Full listing of the FDroid registry" + lister = FDroidLister.from_configfile(**lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py index e429756..8845ab0 100644 --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -13,10 +13,10 @@ logger = logging.getLogger(__name__) class GiteaLister(GogsLister): """List origins from Gitea. - Gitea API documentation: https://try.gitea.io/api/swagger + Gitea API documentation: https://gitea.com/api/swagger The API does pagination and provides navigation URLs through the 'Link' header. The default value for page size is the maximum value observed on the instances - accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/.""" + accessible at https://gitea.com/api/v1/ and https://codeberg.org/api/v1/.""" LISTER_NAME = "gitea" diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 986f3d6..8712d36 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -52,7 +52,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). - .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits + .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#handle-rate-limit-errors-appropriately Args: diff --git a/swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz b/swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz index 34b3b28..2142473 100644 Binary files a/swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz and b/swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz differ diff --git a/swh/lister/gnu/tests/data/tree.json b/swh/lister/gnu/tests/data/tree.json index e4a99d4..1f2bb9f 100644 --- a/swh/lister/gnu/tests/data/tree.json +++ b/swh/lister/gnu/tests/data/tree.json @@ -69,5 +69,6 @@ {"type":"file","name":"xboard-4.2.5.tar.gz","size":1055502,"time":"1008466945"}, {"type":"file","name":"xboard-4.2.6.tar.gz","size":1057625,"time":"1012641715"}, {"type":"file","name":"xboard-4.2.7.tar.gz","size":1318110,"time":"1070057764"} - ]} + ]}, + {"type":"directory","name":"no-contents","size":4096,"time":"1254860068"} ] diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py index ec48cf0..26e4f2b 100644 --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -61,7 +61,7 @@ class GNUTree: for directory in raw_data["contents"]: if directory["name"] not in self.top_level_directories: continue - infos = directory["contents"] + infos = directory.get("contents", []) for info in infos: if info["type"] == "directory": package_url = "%s/%s/%s/" % ( @@ -69,7 +69,9 @@ class GNUTree: directory["name"], info["name"], ) - package_artifacts = find_artifacts(info["contents"], package_url) + package_artifacts = find_artifacts( + info.get("contents", []), package_url + ) if package_artifacts != []: repo_details = { "name": info["name"], @@ -146,7 +148,7 @@ def find_artifacts( # It will recursively check for artifacts in all sub-folders elif filetype == "directory": tarballs_in_dir = find_artifacts( - info_file["contents"], url + filename + "/" + info_file.get("contents", []), url + filename + "/" ) artifacts.extend(tarballs_in_dir) diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index fea1952..8095237 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2023 The Software Heritage developers +# Copyright (C) 2019-2025 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timezone import logging from random import shuffle -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterable, Iterator, List, Optional import iso8601 import requests @@ -21,7 +21,7 @@ from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) -PackagistPageType = List[str] +PackagistPageType = Iterable[str] class NotModifiedSinceLastVisit(ValueError): diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index 400a992..aa5593c 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -15,7 +15,7 @@ import tempfile from typing import Any, Dict, Iterator, Optional, Tuple from bs4 import BeautifulSoup -import psycopg2 +import psycopg from testing.postgresql import Postgresql from swh.scheduler.interface import SchedulerInterface @@ -87,20 +87,18 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): def create_rubygems_db( self, postgresql: Postgresql - ) -> Tuple[str, psycopg2._psycopg.connection]: + ) -> Tuple[str, psycopg.Connection[Any]]: logger.debug("Creating rubygems database") db_dsn = postgresql.dsn() db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) - db = psycopg2.connect(**db_dsn) - db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + db = psycopg.connect(autocommit=True, conninfo=postgresql.url()) with db.cursor() as cursor: cursor.execute(f"CREATE DATABASE {self.DB_NAME}") db_dsn["database"] = self.DB_NAME - db = psycopg2.connect(**db_dsn) - db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + db = psycopg.connect(conninfo=db_url, autocommit=True) with db.cursor() as cursor: cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py index 122c8c7..8b80b51 100644 --- a/swh/lister/rubygems/tests/test_lister.py +++ b/swh/lister/rubygems/tests/test_lister.py @@ -133,7 +133,6 @@ def network_requests_mock(datadir, requests_mock): ) -@pytest.mark.db def test_rubygems_lister(swh_scheduler, expected_listed_origins): lister = RubyGemsLister(scheduler=swh_scheduler) res = lister.run()