Compare commits
10 commits
db00f23ec0
...
01f2e56a53
Author | SHA1 | Date | |
---|---|---|---|
01f2e56a53 | |||
![]() |
213a4a152f | ||
![]() |
ceb1b6450e | ||
![]() |
41c13438b4 | ||
![]() |
08fda328be | ||
![]() |
61cfd77da1 | ||
![]() |
f2f9c7d19e | ||
![]() |
6b4f84a384 | ||
![]() |
bde37867d8 | ||
![]() |
3771a411ae |
21 changed files with 150 additions and 36 deletions
|
@ -374,7 +374,7 @@ More about listers
|
|||
|
||||
See the currently implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_).
|
||||
|
||||
.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py
|
||||
.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py
|
||||
.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py
|
||||
.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py
|
||||
.. _GitHub: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/github/lister.py
|
||||
.. _Bitbucket: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/bitbucket/lister.py
|
||||
.. _CGit: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/cgit/lister.py
|
||||
.. _GitLab: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/gitlab/lister.py
|
||||
|
|
|
@ -43,6 +43,7 @@ testing = {file = ["requirements-test.txt"]}
|
|||
"lister.debian" = "swh.lister.debian:register"
|
||||
"lister.dlang" = "swh.lister.dlang:register"
|
||||
"lister.elm" = "swh.lister.elm:register"
|
||||
"lister.f-droid" = "swh.lister.f_droid:register"
|
||||
"lister.gitea" = "swh.lister.gitea:register"
|
||||
"lister.github" = "swh.lister.github:register"
|
||||
"lister.gitiles" = "swh.lister.gitiles:register"
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
swh.core[db] >= 3.4.0
|
||||
swh.scheduler >= 2.7.0
|
||||
swh.core[db] >= 4.0.0
|
||||
swh.scheduler >= 3.0.0
|
||||
|
|
|
@ -4,7 +4,7 @@ pandas-stubs
|
|||
pytest >= 8.1
|
||||
pytest-mock
|
||||
requests_mock
|
||||
swh-scheduler[testing] >= 2.7.0
|
||||
swh.scheduler[pytest] >= 3.1.0
|
||||
types-beautifulsoup4
|
||||
types-click
|
||||
types-dateparser
|
||||
|
|
|
@ -7,7 +7,7 @@ launchpadlib
|
|||
looseversion
|
||||
lxml
|
||||
mercurial
|
||||
psycopg2
|
||||
psycopg
|
||||
pyreadr
|
||||
python_debian
|
||||
repomd
|
||||
|
|
|
@ -1,18 +1,17 @@
|
|||
# Copyright (C) 2018-2022 The Software Heritage developers
|
||||
# Copyright (C) 2018-2025 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from importlib.metadata import PackageNotFoundError, entry_points, version
|
||||
import logging
|
||||
|
||||
import pkg_resources
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
try:
|
||||
__version__ = pkg_resources.get_distribution("swh.lister").version
|
||||
except pkg_resources.DistributionNotFound:
|
||||
__version__ = version("swh-lister")
|
||||
except PackageNotFoundError:
|
||||
__version__ = "devel"
|
||||
|
||||
USER_AGENT_TEMPLATE = (
|
||||
|
@ -22,7 +21,7 @@ USER_AGENT_TEMPLATE = (
|
|||
|
||||
LISTERS = {
|
||||
entry_point.name.split(".", 1)[1]: entry_point
|
||||
for entry_point in pkg_resources.iter_entry_points("swh.workers")
|
||||
for entry_point in entry_points().select(group="swh.workers")
|
||||
if entry_point.name.split(".", 1)[0] == "lister"
|
||||
}
|
||||
|
||||
|
|
|
@ -58,10 +58,10 @@ You can follow lister execution by displaying logs of swh-lister service::
|
|||
.. _cpan.org: https://cpan.org/
|
||||
.. _metacpan.org: https://metacpan.org/
|
||||
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
|
||||
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
|
||||
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
|
||||
|
||||
|
||||
"""
|
||||
""" # noqa: B950
|
||||
|
||||
|
||||
def register():
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2022-2024 The Software Heritage developers
|
||||
# Copyright (C) 2022-2025 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -18,6 +18,7 @@ import iso8601
|
|||
from looseversion import LooseVersion2
|
||||
|
||||
from swh.core.utils import grouper
|
||||
from swh.model.hashutil import HASH_BLOCK_SIZE
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -119,7 +120,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
|
|||
# Download the Db dump
|
||||
with self.http_request(self.DB_DUMP_URL, stream=True) as res:
|
||||
with open(archive_path, "wb") as out_file:
|
||||
for chunk in res.iter_content(chunk_size=1024):
|
||||
for chunk in res.iter_content(chunk_size=HASH_BLOCK_SIZE):
|
||||
out_file.write(chunk)
|
||||
|
||||
# Extract the Db dump
|
||||
|
|
|
@ -200,6 +200,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
|
|||
sum_name = "md5sum"
|
||||
if field_ in src_pkg:
|
||||
for entry in src_pkg[field_]:
|
||||
if "name" not in entry:
|
||||
continue
|
||||
name = entry["name"]
|
||||
files[name]["name"] = name
|
||||
files[name]["size"] = int(entry["size"], 10)
|
||||
|
|
|
@ -62,7 +62,7 @@ You can follow lister execution by displaying logs of swh-lister service::
|
|||
|
||||
.. _Dlang: https://dlang.org/
|
||||
.. _DUB: https://code.dlang.org/
|
||||
.. _http api endpoint: https://code.dlang.org/api/packages/dump"
|
||||
.. _http api endpoint: https://code.dlang.org/api/packages/dump
|
||||
"""
|
||||
|
||||
|
||||
|
|
12
swh/lister/f_droid/__init__.py
Normal file
12
swh/lister/f_droid/__init__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Copyright (C) 2019-2021 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
def register():
    """Describe the F-Droid lister plugin.

    Returns a mapping with the lister class under ``"lister"`` and the list of
    task module names under ``"task_modules"``.
    """
    # Imported here rather than at module level, as in the original.
    from .lister import FDroidLister

    task_modules = [f"{__name__}.tasks"]
    return {"lister": FDroidLister, "task_modules": task_modules}
|
80
swh/lister/f_droid/lister.py
Normal file
80
swh/lister/f_droid/lister.py
Normal file
|
@ -0,0 +1,80 @@
|
|||
# Copyright (C) 2021-2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FDroidListerPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class FDroidLister(StatelessLister[FDroidListerPage]):
    """List origins from the F-Droid repository index (``index-v2.json``).

    The whole index is fetched with a single HTTP request; each entry of its
    ``packages`` mapping becomes one ``ListedOrigin``.
    """

    # NOTE(review): the entry point for this lister is registered as
    # "lister.f-droid" (hyphen) while LISTER_NAME uses an underscore; confirm
    # which spelling is expected before relying on either.
    LISTER_NAME = "f_droid"
    INSTANCE = "f_droid"
    VISIT_TYPE = "f_droid"
    REPO_INDEX_V2_URL = "https://f-droid.org/repo/index-v2.json"
    PACKAGE_URL_TEMPLATE = "https://f-droid.org/packages/{package_id}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = REPO_INDEX_V2_URL,
        instance: str = INSTANCE,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and ask the server for JSON responses.

        Args:
            scheduler: scheduler instance forwarded to the base lister
            url: location of the repository index to list
            instance: name of the listed instance
            credentials: optional credentials forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session.headers.update({"Accept": "application/json"})

    def get_pages(self) -> Iterator[FDroidListerPage]:
        """Fetch the index and return a single page holding its packages.

        Raises:
            RuntimeError: if the response has no ``packages`` mapping.
        """
        data = self.http_request(self.url).json()

        # Guard against a non-mapping JSON root as well, so that a malformed
        # index is always reported as RuntimeError rather than TypeError.
        packages = data.get("packages") if isinstance(data, dict) else None
        if not isinstance(packages, dict):
            raise RuntimeError(
                f"Invalid response from {self.url}: missing 'packages' key"
            )

        return iter([[packages]])

    @staticmethod
    def _parse_last_updated(package_id: str, raw: Any):
        """Convert an index ``lastUpdated`` value to a UTC-aware datetime.

        The index stores timestamps as milliseconds since the Unix epoch, so
        the value is scaled to seconds before conversion. Returns ``None``
        when the value is absent or cannot be converted.
        """
        # Local import: the module only imports ``datetime`` from ``datetime``.
        from datetime import timezone

        if raw is None:
            return None
        try:
            return datetime.fromtimestamp(raw / 1000, tz=timezone.utc)
        except (TypeError, ValueError, OverflowError, OSError):
            # One bad timestamp must not abort the whole listing.
            logger.warning(
                "Ignoring invalid lastUpdated value %r for package %s",
                raw,
                package_id,
            )
            return None

    def get_origins_from_page(self, page: FDroidListerPage) -> Iterator[ListedOrigin]:
        """Convert a page of F-Droid packages into ListedOrigins.

        Each page item is a mapping from package id to its index entry; the
        entry's metadata and versions are passed on to the loader unchanged.
        """
        assert self.lister_obj.id is not None

        for packages in page:
            for package_id, package in packages.items():
                # NOTE(review): published index-v2 files appear to spell this
                # key "metadata"; the original "metaData" spelling is kept as
                # a fallback. Confirm against the index format reference.
                metadata = package.get("metadata") or package.get("metaData") or {}
                versions = package.get("versions", {})
                last_updated = self._parse_last_updated(
                    package_id, metadata.get("lastUpdated")
                )

                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=self.VISIT_TYPE,
                    url=self.PACKAGE_URL_TEMPLATE.format(package_id=package_id),
                    last_update=last_updated,
                    extra_loader_arguments={
                        "metadata": metadata,
                        "versions": versions,
                    },
                )
|
19
swh/lister/f_droid/tasks.py
Normal file
19
swh/lister/f_droid/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2018-2023 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from .lister import FDroidLister
|
||||
|
||||
|
||||
@shared_task(name=f"{__name__}.FDroidListerTask")
def list_f_droid(**lister_args):
    """Full listing of the F-Droid repository index.

    Builds an ``FDroidLister`` from the configuration file, forwarding
    ``lister_args`` to it, runs it and returns the resulting stats as a dict.
    """
    lister = FDroidLister.from_configfile(**lister_args)
    return lister.run().dict()


# Backward-compatible alias: the function was first added under this name,
# copy-pasted from another lister. The Celery task name above is unchanged.
list_pypi = list_f_droid
|
||||
|
||||
|
||||
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Health-check task: always answers with the string ``"OK"``."""
    reply = "OK"
    return reply
|
|
@ -13,10 +13,10 @@ logger = logging.getLogger(__name__)
|
|||
class GiteaLister(GogsLister):
|
||||
"""List origins from Gitea.
|
||||
|
||||
Gitea API documentation: https://try.gitea.io/api/swagger
|
||||
Gitea API documentation: https://gitea.com/api/swagger
|
||||
|
||||
The API does pagination and provides navigation URLs through the 'Link' header.
|
||||
The default value for page size is the maximum value observed on the instances
|
||||
accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/."""
|
||||
accessible at https://gitea.com/api/v1/ and https://codeberg.org/api/v1/."""
|
||||
|
||||
LISTER_NAME = "gitea"
|
||||
|
|
|
@ -52,7 +52,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
|
|||
When the credentials aren't set in the lister config, the lister can run in
|
||||
anonymous mode too (e.g. for testing purposes).
|
||||
|
||||
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits
|
||||
.. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#handle-rate-limit-errors-appropriately
|
||||
|
||||
|
||||
Args:
|
||||
|
|
Binary file not shown.
|
@ -69,5 +69,6 @@
|
|||
{"type":"file","name":"xboard-4.2.5.tar.gz","size":1055502,"time":"1008466945"},
|
||||
{"type":"file","name":"xboard-4.2.6.tar.gz","size":1057625,"time":"1012641715"},
|
||||
{"type":"file","name":"xboard-4.2.7.tar.gz","size":1318110,"time":"1070057764"}
|
||||
]}
|
||||
]},
|
||||
{"type":"directory","name":"no-contents","size":4096,"time":"1254860068"}
|
||||
]
|
||||
|
|
|
@ -61,7 +61,7 @@ class GNUTree:
|
|||
for directory in raw_data["contents"]:
|
||||
if directory["name"] not in self.top_level_directories:
|
||||
continue
|
||||
infos = directory["contents"]
|
||||
infos = directory.get("contents", [])
|
||||
for info in infos:
|
||||
if info["type"] == "directory":
|
||||
package_url = "%s/%s/%s/" % (
|
||||
|
@ -69,7 +69,9 @@ class GNUTree:
|
|||
directory["name"],
|
||||
info["name"],
|
||||
)
|
||||
package_artifacts = find_artifacts(info["contents"], package_url)
|
||||
package_artifacts = find_artifacts(
|
||||
info.get("contents", []), package_url
|
||||
)
|
||||
if package_artifacts != []:
|
||||
repo_details = {
|
||||
"name": info["name"],
|
||||
|
@ -146,7 +148,7 @@ def find_artifacts(
|
|||
# It will recursively check for artifacts in all sub-folders
|
||||
elif filetype == "directory":
|
||||
tarballs_in_dir = find_artifacts(
|
||||
info_file["contents"], url + filename + "/"
|
||||
info_file.get("contents", []), url + filename + "/"
|
||||
)
|
||||
artifacts.extend(tarballs_in_dir)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2019-2023 The Software Heritage developers
|
||||
# Copyright (C) 2019-2025 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -7,7 +7,7 @@ from dataclasses import dataclass
|
|||
from datetime import datetime, timezone
|
||||
import logging
|
||||
from random import shuffle
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from typing import Any, Dict, Iterable, Iterator, List, Optional
|
||||
|
||||
import iso8601
|
||||
import requests
|
||||
|
@ -21,7 +21,7 @@ from ..pattern import CredentialsType, Lister
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PackagistPageType = List[str]
|
||||
PackagistPageType = Iterable[str]
|
||||
|
||||
|
||||
class NotModifiedSinceLastVisit(ValueError):
|
||||
|
|
|
@ -15,7 +15,7 @@ import tempfile
|
|||
from typing import Any, Dict, Iterator, Optional, Tuple
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import psycopg2
|
||||
import psycopg
|
||||
from testing.postgresql import Postgresql
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -87,20 +87,18 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
|
|||
|
||||
def create_rubygems_db(
|
||||
self, postgresql: Postgresql
|
||||
) -> Tuple[str, psycopg2._psycopg.connection]:
|
||||
) -> Tuple[str, psycopg.Connection[Any]]:
|
||||
logger.debug("Creating rubygems database")
|
||||
|
||||
db_dsn = postgresql.dsn()
|
||||
db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)
|
||||
db = psycopg2.connect(**db_dsn)
|
||||
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
db = psycopg.connect(autocommit=True, conninfo=postgresql.url())
|
||||
with db.cursor() as cursor:
|
||||
cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
|
||||
|
||||
db_dsn["database"] = self.DB_NAME
|
||||
|
||||
db = psycopg2.connect(**db_dsn)
|
||||
db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
db = psycopg.connect(conninfo=db_url, autocommit=True)
|
||||
with db.cursor() as cursor:
|
||||
cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")
|
||||
|
||||
|
|
|
@ -133,7 +133,6 @@ def network_requests_mock(datadir, requests_mock):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.db
|
||||
def test_rubygems_lister(swh_scheduler, expected_listed_origins):
|
||||
lister = RubyGemsLister(scheduler=swh_scheduler)
|
||||
res = lister.run()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue