Compare commits

...

10 commits

Author SHA1 Message Date
01f2e56a53
add fdroid lister 2025-05-18 16:14:25 +08:00
Antoine Lambert
213a4a152f
crates: Bump chunk size when downloading database dump
It allows faster download of the database dump located at
https://static.crates.io/db-dump.tar.gz.
2025-04-15 12:17:57 +02:00
Antoine Lambert
ceb1b6450e gnu: Fix KeyError exception due to missing field in JSON data
The latest GNU JSON listing is missing the contents field for a directory,
so a KeyError exception was raised by the lister.
2025-04-04 12:03:10 +00:00
Nicolas Dandrimont
41c13438b4 Use swh-scheduler[pytest] instead of swh-scheduler[testing] 2025-03-31 18:57:11 +02:00
Pierre-Yves David
08fda328be Migration to psycopg3 2025-03-21 17:05:07 +01:00
Antoine Lambert
61cfd77da1
debian: Fix error since python-debian 1.0 release
Since the python-debian 1.0 release, calling Sources.iter_paragraphs
returns an extra paragraph that does not have the expected schema,
so make sure to ignore it.
2025-03-13 13:33:33 +01:00
Antoine Lambert
f2f9c7d19e Migrate from deprecated pkg_resources package to importlib.metadata 2025-02-26 08:59:30 +01:00
Antoine Lambert
6b4f84a384
packagist: Fix mypy error after typing added to grouper 2025-02-25 10:54:12 +01:00
Antoine Lambert
bde37867d8 docs: Fix broken external links
Those were spotted thanks to the sphinx linkcheck builder
2025-02-20 10:15:54 +00:00
Antoine Lambert
3771a411ae
tests: Remove no longer needed pytest custom marker named db
This was used at the time we were building debian packages for
swh components but we no longer do that.
2025-02-17 16:29:09 +01:00
21 changed files with 150 additions and 36 deletions

View file

@ -374,7 +374,7 @@ More about listers
See current implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_ ).
.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py
.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py
.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py
.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py
.. _GitHub: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/github/lister.py
.. _Bitbucket: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/bitbucket/lister.py
.. _CGit: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/cgit/lister.py
.. _GitLab: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/gitlab/lister.py

View file

@ -43,6 +43,7 @@ testing = {file = ["requirements-test.txt"]}
"lister.debian" = "swh.lister.debian:register"
"lister.dlang" = "swh.lister.dlang:register"
"lister.elm" = "swh.lister.elm:register"
"lister.f-droid" = "swh.lister.f_droid:register"
"lister.gitea" = "swh.lister.gitea:register"
"lister.github" = "swh.lister.github:register"
"lister.gitiles" = "swh.lister.gitiles:register"

View file

@ -1,2 +1,2 @@
swh.core[db] >= 3.4.0
swh.scheduler >= 2.7.0
swh.core[db] >= 4.0.0
swh.scheduler >= 3.0.0

View file

@ -4,7 +4,7 @@ pandas-stubs
pytest >= 8.1
pytest-mock
requests_mock
swh-scheduler[testing] >= 2.7.0
swh.scheduler[pytest] >= 3.1.0
types-beautifulsoup4
types-click
types-dateparser

View file

@ -7,7 +7,7 @@ launchpadlib
looseversion
lxml
mercurial
psycopg2
psycopg
pyreadr
python_debian
repomd

View file

@ -1,18 +1,17 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# Copyright (C) 2018-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from importlib.metadata import PackageNotFoundError, entry_points, version
import logging
import pkg_resources
logger = logging.getLogger(__name__)
try:
__version__ = pkg_resources.get_distribution("swh.lister").version
except pkg_resources.DistributionNotFound:
__version__ = version("swh-lister")
except PackageNotFoundError:
__version__ = "devel"
USER_AGENT_TEMPLATE = (
@ -22,7 +21,7 @@ USER_AGENT_TEMPLATE = (
LISTERS = {
entry_point.name.split(".", 1)[1]: entry_point
for entry_point in pkg_resources.iter_entry_points("swh.workers")
for entry_point in entry_points().select(group="swh.workers")
if entry_point.name.split(".", 1)[0] == "lister"
}

View file

@ -58,10 +58,10 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
"""
""" # noqa: B950
def register():

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022-2024 The Software Heritage developers
# Copyright (C) 2022-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -18,6 +18,7 @@ import iso8601
from looseversion import LooseVersion2
from swh.core.utils import grouper
from swh.model.hashutil import HASH_BLOCK_SIZE
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -119,7 +120,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
# Download the Db dump
with self.http_request(self.DB_DUMP_URL, stream=True) as res:
with open(archive_path, "wb") as out_file:
for chunk in res.iter_content(chunk_size=1024):
for chunk in res.iter_content(chunk_size=HASH_BLOCK_SIZE):
out_file.write(chunk)
# Extract the Db dump

View file

@ -200,6 +200,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
sum_name = "md5sum"
if field_ in src_pkg:
for entry in src_pkg[field_]:
if "name" not in entry:
continue
name = entry["name"]
files[name]["name"] = name
files[name]["size"] = int(entry["size"], 10)

View file

@ -62,7 +62,7 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _Dlang: https://dlang.org/
.. _DUB: https://code.dlang.org/
.. _http api endpoint: https://code.dlang.org/api/packages/dump"
.. _http api endpoint: https://code.dlang.org/api/packages/dump
"""

View file

@ -0,0 +1,12 @@
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the F-Droid lister plugin.

    Returns:
        A dict with the lister class under ``"lister"`` and the list of
        task module names under ``"task_modules"``.
    """
    # Imported lazily so that importing this package does not import the
    # lister module itself.
    from .lister import FDroidLister

    tasks_module = f"{__name__}.tasks"
    return dict(lister=FDroidLister, task_modules=[tasks_module])

View file

@ -0,0 +1,80 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from datetime import datetime
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type of one page produced by FDroidLister.get_pages: a list of dicts.
# get_pages wraps the index's ``packages`` dict in a single-element list, and
# get_origins_from_page iterates each dict's items as (package id, package data).
FDroidListerPage = List[Dict[str, Any]]
class FDroidLister(StatelessLister[FDroidListerPage]):
    """List origins from an F-Droid repository index.

    The lister downloads the JSON document found at ``url`` and emits one
    origin per entry of its top-level ``packages`` mapping.
    """

    LISTER_NAME = "f_droid"
    INSTANCE = "f_droid"
    VISIT_TYPE = "f_droid"

    REPO_INDEX_V2_URL = "https://f-droid.org/repo/index-v2.json"
    PACKAGE_URL_TEMPLATE = "https://f-droid.org/packages/{package_id}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = REPO_INDEX_V2_URL,
        instance: str = INSTANCE,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and ask the server for JSON responses.

        Args:
            scheduler: scheduler instance forwarded to the base lister
            url: URL of the repository index to download
            instance: name of the listed instance
            credentials: optional credentials forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )
        self.session.headers.update({"Accept": "application/json"})

    def get_pages(self) -> Iterator[FDroidListerPage]:
        """Download the index and return it as a single page.

        Returns:
            An iterator over exactly one page: a one-element list holding the
            ``packages`` mapping of the index.

        Raises:
            RuntimeError: if the response has no ``packages`` mapping.
        """
        data = self.http_request(self.url).json()

        # Guard against a top-level JSON value that is not an object, so the
        # error below is raised instead of an unrelated TypeError.
        packages = data.get("packages") if isinstance(data, dict) else None
        if not isinstance(packages, dict):
            raise RuntimeError(
                f"Invalid response from {self.url}: missing 'packages' key"
            )

        return iter([[packages]])

    def get_origins_from_page(self, page: FDroidListerPage) -> Iterator[ListedOrigin]:
        """Convert a page of F-Droid packages into ListedOrigin objects.

        Args:
            page: list of mappings from package id to package data

        Yields:
            One ListedOrigin per package, whose URL is built from
            ``PACKAGE_URL_TEMPLATE`` and whose loader arguments carry the
            package's metadata and versions.
        """
        # Local import: ``timezone`` is only needed here and the module only
        # imports ``datetime`` from the datetime package.
        from datetime import timezone

        assert self.lister_obj.id is not None

        for packages in page:
            for package_id, package in packages.items():
                # NOTE(review): the upstream index-v2 format appears to use the
                # lowercase ``metadata`` key; the historical ``metaData``
                # spelling is still accepted -- confirm against a live index.
                metadata = package.get("metadata") or package.get("metaData") or {}
                versions = package.get("versions", {})

                last_update = None
                raw_last_updated = metadata.get("lastUpdated")
                if isinstance(raw_last_updated, (int, float)):
                    try:
                        # F-Droid timestamps are milliseconds since the epoch;
                        # build a timezone-aware UTC datetime from them.
                        last_update = datetime.fromtimestamp(
                            raw_last_updated / 1000, tz=timezone.utc
                        )
                    except (OverflowError, OSError, ValueError):
                        # A bogus timestamp must not abort the whole listing.
                        logger.warning(
                            "Ignoring invalid lastUpdated value %r for package %s",
                            raw_last_updated,
                            package_id,
                        )

                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=self.VISIT_TYPE,
                    url=self.PACKAGE_URL_TEMPLATE.format(
                        package_id=package_id,
                    ),
                    last_update=last_update,
                    extra_loader_arguments={
                        "metadata": metadata,
                        "versions": versions,
                    },
                )

View file

@ -0,0 +1,19 @@
# Copyright (C) 2018-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from .lister import FDroidLister
@shared_task(name=f"{__name__}.FDroidListerTask")
def list_f_droid(**lister_args):
    """Run a full listing of the F-Droid package index.

    Args:
        **lister_args: keyword arguments forwarded to
            ``FDroidLister.from_configfile``

    Returns:
        The result of ``lister.run()`` converted with ``.dict()``.
    """
    lister = FDroidLister.from_configfile(**lister_args)
    return lister.run().dict()


# Backward-compatible alias: the task function was originally named after the
# PyPI lister it was copied from. The registered Celery task name is unchanged.
list_pypi = list_f_droid
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Liveness-check task: always returns the string ``"OK"``."""
    return "OK"

View file

@ -13,10 +13,10 @@ logger = logging.getLogger(__name__)
class GiteaLister(GogsLister):
"""List origins from Gitea.
Gitea API documentation: https://try.gitea.io/api/swagger
Gitea API documentation: https://gitea.com/api/swagger
The API does pagination and provides navigation URLs through the 'Link' header.
The default value for page size is the maximum value observed on the instances
accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/."""
accessible at https://gitea.com/api/v1/ and https://codeberg.org/api/v1/."""
LISTER_NAME = "gitea"

View file

@ -52,7 +52,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
When the credentials aren't set in the lister config, the lister can run in
anonymous mode too (e.g. for testing purposes).
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#handle-rate-limit-errors-appropriately
Args:

View file

@ -69,5 +69,6 @@
{"type":"file","name":"xboard-4.2.5.tar.gz","size":1055502,"time":"1008466945"},
{"type":"file","name":"xboard-4.2.6.tar.gz","size":1057625,"time":"1012641715"},
{"type":"file","name":"xboard-4.2.7.tar.gz","size":1318110,"time":"1070057764"}
]}
]},
{"type":"directory","name":"no-contents","size":4096,"time":"1254860068"}
]

View file

@ -61,7 +61,7 @@ class GNUTree:
for directory in raw_data["contents"]:
if directory["name"] not in self.top_level_directories:
continue
infos = directory["contents"]
infos = directory.get("contents", [])
for info in infos:
if info["type"] == "directory":
package_url = "%s/%s/%s/" % (
@ -69,7 +69,9 @@ class GNUTree:
directory["name"],
info["name"],
)
package_artifacts = find_artifacts(info["contents"], package_url)
package_artifacts = find_artifacts(
info.get("contents", []), package_url
)
if package_artifacts != []:
repo_details = {
"name": info["name"],
@ -146,7 +148,7 @@ def find_artifacts(
# It will recursively check for artifacts in all sub-folders
elif filetype == "directory":
tarballs_in_dir = find_artifacts(
info_file["contents"], url + filename + "/"
info_file.get("contents", []), url + filename + "/"
)
artifacts.extend(tarballs_in_dir)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019-2023 The Software Heritage developers
# Copyright (C) 2019-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,7 +7,7 @@ from dataclasses import dataclass
from datetime import datetime, timezone
import logging
from random import shuffle
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterable, Iterator, List, Optional
import iso8601
import requests
@ -21,7 +21,7 @@ from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
PackagistPageType = List[str]
PackagistPageType = Iterable[str]
class NotModifiedSinceLastVisit(ValueError):

View file

@ -15,7 +15,7 @@ import tempfile
from typing import Any, Dict, Iterator, Optional, Tuple
from bs4 import BeautifulSoup
import psycopg2
import psycopg
from testing.postgresql import Postgresql
from swh.scheduler.interface import SchedulerInterface
@ -87,20 +87,18 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
def create_rubygems_db(
self, postgresql: Postgresql
) -> Tuple[str, psycopg2._psycopg.connection]:
) -> Tuple[str, psycopg.Connection[Any]]:
logger.debug("Creating rubygems database")
db_dsn = postgresql.dsn()
db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)
db = psycopg2.connect(**db_dsn)
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
db = psycopg.connect(autocommit=True, conninfo=postgresql.url())
with db.cursor() as cursor:
cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
db_dsn["database"] = self.DB_NAME
db = psycopg2.connect(**db_dsn)
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
db = psycopg.connect(conninfo=db_url, autocommit=True)
with db.cursor() as cursor:
cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")

View file

@ -133,7 +133,6 @@ def network_requests_mock(datadir, requests_mock):
)
@pytest.mark.db
def test_rubygems_lister(swh_scheduler, expected_listed_origins):
lister = RubyGemsLister(scheduler=swh_scheduler)
res = lister.run()