Compare commits

..

No commits in common. "01f2e56a53c9967474621a1504aa30ba9a53108c" and "db00f23ec0ebd2c2a1a46787ddc7ae03c9603952" have entirely different histories.

21 changed files with 36 additions and 150 deletions

View file

@ -374,7 +374,7 @@ More about listers
See current implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_ ).
.. _GitHub: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/github/lister.py
.. _Bitbucket: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/bitbucket/lister.py
.. _CGit: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/cgit/lister.py
.. _GitLab: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/gitlab/lister.py
.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py
.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py
.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py
.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py

View file

@ -43,7 +43,6 @@ testing = {file = ["requirements-test.txt"]}
"lister.debian" = "swh.lister.debian:register"
"lister.dlang" = "swh.lister.dlang:register"
"lister.elm" = "swh.lister.elm:register"
"lister.f-droid" = "swh.lister.f_droid:register"
"lister.gitea" = "swh.lister.gitea:register"
"lister.github" = "swh.lister.github:register"
"lister.gitiles" = "swh.lister.gitiles:register"

View file

@ -1,2 +1,2 @@
swh.core[db] >= 4.0.0
swh.scheduler >= 3.0.0
swh.core[db] >= 3.4.0
swh.scheduler >= 2.7.0

View file

@ -4,7 +4,7 @@ pandas-stubs
pytest >= 8.1
pytest-mock
requests_mock
swh.scheduler[pytest] >= 3.1.0
swh-scheduler[testing] >= 2.7.0
types-beautifulsoup4
types-click
types-dateparser

View file

@ -7,7 +7,7 @@ launchpadlib
looseversion
lxml
mercurial
psycopg
psycopg2
pyreadr
python_debian
repomd

View file

@ -1,17 +1,18 @@
# Copyright (C) 2018-2025 The Software Heritage developers
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from importlib.metadata import PackageNotFoundError, entry_points, version
import logging
import pkg_resources
logger = logging.getLogger(__name__)
try:
__version__ = version("swh-lister")
except PackageNotFoundError:
__version__ = pkg_resources.get_distribution("swh.lister").version
except pkg_resources.DistributionNotFound:
__version__ = "devel"
USER_AGENT_TEMPLATE = (
@ -21,7 +22,7 @@ USER_AGENT_TEMPLATE = (
LISTERS = {
entry_point.name.split(".", 1)[1]: entry_point
for entry_point in entry_points().select(group="swh.workers")
for entry_point in pkg_resources.iter_entry_points("swh.workers")
if entry_point.name.split(".", 1)[0] == "lister"
}

View file

@ -58,10 +58,10 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
""" # noqa: B950
"""
def register():

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022-2025 The Software Heritage developers
# Copyright (C) 2022-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -18,7 +18,6 @@ import iso8601
from looseversion import LooseVersion2
from swh.core.utils import grouper
from swh.model.hashutil import HASH_BLOCK_SIZE
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -120,7 +119,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
# Download the Db dump
with self.http_request(self.DB_DUMP_URL, stream=True) as res:
with open(archive_path, "wb") as out_file:
for chunk in res.iter_content(chunk_size=HASH_BLOCK_SIZE):
for chunk in res.iter_content(chunk_size=1024):
out_file.write(chunk)
# Extract the Db dump

View file

@ -200,8 +200,6 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
sum_name = "md5sum"
if field_ in src_pkg:
for entry in src_pkg[field_]:
if "name" not in entry:
continue
name = entry["name"]
files[name]["name"] = name
files[name]["size"] = int(entry["size"], 10)

View file

@ -62,7 +62,7 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _Dlang: https://dlang.org/
.. _DUB: https://code.dlang.org/
.. _http api endpoint: https://code.dlang.org/api/packages/dump
.. _http api endpoint: https://code.dlang.org/api/packages/dump"
"""

View file

@ -1,12 +0,0 @@
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the FDroid lister plugin.

    Returns:
        A dict with the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list naming this package's ``tasks``
        module.
    """
    # Imported lazily so that importing this package does not load the lister.
    from .lister import FDroidLister

    task_module = f"{__name__}.tasks"
    return {"lister": FDroidLister, "task_modules": [task_module]}

View file

@ -1,80 +0,0 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from datetime import datetime
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type of one lister page: a list of dictionaries with string keys.
FDroidListerPage = List[Dict[str, Any]]
class FDroidLister(StatelessLister[FDroidListerPage]):
    """List origins from the FDroid.

    The repository index is fetched with a single HTTP request; its
    ``packages`` mapping is emitted as one page and every entry of that mapping
    becomes one listed origin.
    """

    LISTER_NAME = "f_droid"
    INSTANCE = "f_droid"
    VISIT_TYPE = "f_droid"

    REPO_INDEX_V2_URL = "https://f-droid.org/repo/index-v2.json"
    PACKAGE_URL_TEMPLATE = "https://f-droid.org/packages/{package_id}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = REPO_INDEX_V2_URL,
        instance: str = INSTANCE,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and ask the server for JSON responses.

        Args:
            scheduler: scheduler interface forwarded to the base lister
            url: URL of the repository index to fetch
            instance: instance name forwarded to the base lister
            credentials: credentials forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )
        self.session.headers.update({"Accept": "application/json"})

    def get_pages(self) -> Iterator[FDroidListerPage]:
        """Fetch the index and return an iterator over a single page.

        Returns:
            An iterator yielding one page: a one-element list holding the
            ``packages`` mapping of the index.

        Raises:
            RuntimeError: if the response is not a JSON object holding a
                ``packages`` mapping.
        """
        data = self.http_request(self.url).json()
        # Guard against a payload that is not a JSON object at all, so that a
        # malformed response is always reported through the same RuntimeError.
        packages = data.get("packages") if isinstance(data, dict) else None
        if not isinstance(packages, dict):
            raise RuntimeError(
                f"Invalid response from {self.url}: missing 'packages' key"
            )
        return iter([[packages]])

    def get_origins_from_page(self, page: FDroidListerPage) -> Iterator[ListedOrigin]:
        """Convert a page of FDroidLister repositories into ListedOrigins.

        Args:
            page: list of mappings from package identifier to package details

        Yields:
            One ListedOrigin per package, whose URL is built from
            ``PACKAGE_URL_TEMPLATE`` and whose loader arguments carry the
            package ``metaData`` and ``versions`` entries.
        """
        # Local import: the module only imports ``datetime`` from ``datetime``.
        from datetime import timezone

        assert self.lister_obj.id is not None

        for packages in page:
            for package_id, package in packages.items():
                metadata = package.get("metaData", {})
                versions = package.get("versions", {})

                last_updated = metadata.get("lastUpdated")
                if last_updated is not None:
                    # Build a timezone-aware value: a naive datetime would be
                    # expressed in the local time of the host running the
                    # lister.
                    # NOTE(review): fromtimestamp() expects seconds; confirm
                    # the index does not report milliseconds here.
                    last_updated = datetime.fromtimestamp(
                        last_updated, tz=timezone.utc
                    )

                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=self.VISIT_TYPE,
                    url=self.PACKAGE_URL_TEMPLATE.format(package_id=package_id),
                    last_update=last_updated,
                    extra_loader_arguments={
                        "metadata": metadata,
                        "versions": versions,
                    },
                )

View file

@ -1,19 +0,0 @@
# Copyright (C) 2018-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from .lister import FDroidLister
@shared_task(name=f"{__name__}.FDroidListerTask")
def list_pypi(**lister_args):
    """Full listing of the FDroid registry.

    The keyword arguments are forwarded to ``FDroidLister.from_configfile``;
    the result of the lister run is returned as a dict.
    """
    # NOTE(review): the function name looks like a leftover from another
    # lister; the task name itself is set explicitly by the decorator above.
    return FDroidLister.from_configfile(**lister_args).run().dict()
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Trivial task that always answers ``"OK"``."""
    return "OK"

View file

@ -13,10 +13,10 @@ logger = logging.getLogger(__name__)
class GiteaLister(GogsLister):
"""List origins from Gitea.
Gitea API documentation: https://gitea.com/api/swagger
Gitea API documentation: https://try.gitea.io/api/swagger
The API does pagination and provides navigation URLs through the 'Link' header.
The default value for page size is the maximum value observed on the instances
accessible at https://gitea.com/api/v1/ and https://codeberg.org/api/v1/."""
accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/."""
LISTER_NAME = "gitea"

View file

@ -52,7 +52,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
When the credentials aren't set in the lister config, the lister can run in
anonymous mode too (e.g. for testing purposes).
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#handle-rate-limit-errors-appropriately
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits
Args:

View file

@ -69,6 +69,5 @@
{"type":"file","name":"xboard-4.2.5.tar.gz","size":1055502,"time":"1008466945"},
{"type":"file","name":"xboard-4.2.6.tar.gz","size":1057625,"time":"1012641715"},
{"type":"file","name":"xboard-4.2.7.tar.gz","size":1318110,"time":"1070057764"}
]},
{"type":"directory","name":"no-contents","size":4096,"time":"1254860068"}
]}
]

View file

@ -61,7 +61,7 @@ class GNUTree:
for directory in raw_data["contents"]:
if directory["name"] not in self.top_level_directories:
continue
infos = directory.get("contents", [])
infos = directory["contents"]
for info in infos:
if info["type"] == "directory":
package_url = "%s/%s/%s/" % (
@ -69,9 +69,7 @@ class GNUTree:
directory["name"],
info["name"],
)
package_artifacts = find_artifacts(
info.get("contents", []), package_url
)
package_artifacts = find_artifacts(info["contents"], package_url)
if package_artifacts != []:
repo_details = {
"name": info["name"],
@ -148,7 +146,7 @@ def find_artifacts(
# It will recursively check for artifacts in all sub-folders
elif filetype == "directory":
tarballs_in_dir = find_artifacts(
info_file.get("contents", []), url + filename + "/"
info_file["contents"], url + filename + "/"
)
artifacts.extend(tarballs_in_dir)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019-2025 The Software Heritage developers
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,7 +7,7 @@ from dataclasses import dataclass
from datetime import datetime, timezone
import logging
from random import shuffle
from typing import Any, Dict, Iterable, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional
import iso8601
import requests
@ -21,7 +21,7 @@ from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
PackagistPageType = Iterable[str]
PackagistPageType = List[str]
class NotModifiedSinceLastVisit(ValueError):

View file

@ -15,7 +15,7 @@ import tempfile
from typing import Any, Dict, Iterator, Optional, Tuple
from bs4 import BeautifulSoup
import psycopg
import psycopg2
from testing.postgresql import Postgresql
from swh.scheduler.interface import SchedulerInterface
@ -87,18 +87,20 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
def create_rubygems_db(
self, postgresql: Postgresql
) -> Tuple[str, psycopg.Connection[Any]]:
) -> Tuple[str, psycopg2._psycopg.connection]:
logger.debug("Creating rubygems database")
db_dsn = postgresql.dsn()
db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)
db = psycopg.connect(autocommit=True, conninfo=postgresql.url())
db = psycopg2.connect(**db_dsn)
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
with db.cursor() as cursor:
cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
db_dsn["database"] = self.DB_NAME
db = psycopg.connect(conninfo=db_url, autocommit=True)
db = psycopg2.connect(**db_dsn)
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
with db.cursor() as cursor:
cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")

View file

@ -133,6 +133,7 @@ def network_requests_mock(datadir, requests_mock):
)
@pytest.mark.db
def test_rubygems_lister(swh_scheduler, expected_listed_origins):
lister = RubyGemsLister(scheduler=swh_scheduler)
res = lister.run()