Hackage: List origins from hackage.haskell.org, The Haskell Package Repository
Use the HTTP API endpoint to get package names and build origin URLs.
This commit is contained in:
parent
8ff418fbc2
commit
6696a8424a
13 changed files with 358 additions and 0 deletions
1
setup.py
1
setup.py
|
@ -69,6 +69,7 @@ setup(
|
|||
lister.gitlab=swh.lister.gitlab:register
|
||||
lister.gnu=swh.lister.gnu:register
|
||||
lister.golang=swh.lister.golang:register
|
||||
lister.hackage=swh.lister.hackage:register
|
||||
lister.launchpad=swh.lister.launchpad:register
|
||||
lister.npm=swh.lister.npm:register
|
||||
lister.opam=swh.lister.opam:register
|
||||
|
|
99
swh/lister/hackage/__init__.py
Normal file
99
swh/lister/hackage/__init__.py
Normal file
|
@ -0,0 +1,99 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
"""
|
||||
Hackage lister
|
||||
==============
|
||||
|
||||
The Hackage lister lists origins from `hackage.haskell.org`_, the `Haskell`_ Package
|
||||
Repository.
|
||||
|
||||
The registry provides an `http api`_ from which the lister retrieves package names
|
||||
and builds origin urls.
|
||||
|
||||
As of August 2022 `hackage.haskell.org`_ lists 15536 package names.
|
||||
|
||||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
To get a list of all package names we make a POST call to
|
||||
`https://hackage.haskell.org/packages/search` endpoint with some params given as
|
||||
json data.
|
||||
|
||||
Default params::
|
||||
|
||||
{
|
||||
"page": 0,
|
||||
"sortColumn": "default",
|
||||
"sortDirection": "ascending",
|
||||
"searchQuery": "(deprecated:any)",
|
||||
}
|
||||
|
||||
The page size is 50. The lister will make as many http api calls as needed to get
|
||||
all results.
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
||||
The result is paginated, each page is 50 records long.
|
||||
|
||||
Entry data set example::
|
||||
|
||||
{
|
||||
"description": "3D model parsers",
|
||||
"downloads": 6,
|
||||
"lastUpload": "2014-11-08T03:55:23.879047Z",
|
||||
"maintainers": [{"display": "capsjac", "uri": "/user/capsjac"}],
|
||||
"name": {"display": "3dmodels", "uri": "/package/3dmodels"},
|
||||
"tags": [
|
||||
{"display": "graphics", "uri": "/packages/tag/graphics"},
|
||||
{"display": "lgpl", "uri": "/packages/tag/lgpl"},
|
||||
{"display": "library", "uri": "/packages/tag/library"},
|
||||
],
|
||||
"votes": 1.5,
|
||||
}
|
||||
|
||||
Origins from page
|
||||
-----------------
|
||||
|
||||
The lister yields up to 50 origin urls per page (the last page may hold fewer).
|
||||
Each ListedOrigin has a `last_update` date set.
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
||||
Activate the virtualenv and run from within swh-lister directory::
|
||||
|
||||
pytest -s -vv --log-cli-level=DEBUG swh/lister/hackage/tests
|
||||
|
||||
Testing with Docker
|
||||
-------------------
|
||||
|
||||
Change directory to swh/docker then launch the docker environment::
|
||||
|
||||
docker compose up -d
|
||||
|
||||
Then schedule a Hackage listing task::
|
||||
|
||||
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-hackage
|
||||
|
||||
You can follow lister execution by displaying logs of swh-lister service::
|
||||
|
||||
docker compose logs -f swh-lister
|
||||
|
||||
.. _hackage.haskell.org: https://hackage.haskell.org/
|
||||
.. _Haskell: https://haskell.org/
|
||||
.. _http api: https://hackage.haskell.org/api
|
||||
"""
|
||||
|
||||
|
||||
def register():
    """Describe the Hackage lister plugin.

    Returns:
        A dict holding the lister class under ``"lister"`` and the list of
        task module names under ``"task_modules"``.
    """
    # Imported inside the function so that importing this package alone does
    # not import the lister module.
    from .lister import HackageLister

    task_module = f"{__name__}.tasks"
    return {"lister": HackageLister, "task_modules": [task_module]}
|
100
swh/lister/hackage/lister.py
Normal file
100
swh/lister/hackage/lister.py
Normal file
|
@ -0,0 +1,100 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
|
||||
import iso8601
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
HackageListerPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class HackageLister(StatelessLister[HackageListerPage]):
    """List Hackage (The Haskell Package Repository) origins.

    Package entries are retrieved from the paginated ``packages/search`` http
    api endpoint; one origin url is built from each package name.
    """

    LISTER_NAME = "hackage"
    VISIT_TYPE = "hackage"
    INSTANCE = "hackage"

    BASE_URL = "https://hackage.haskell.org/"
    PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search"
    PACKAGE_INFO_URL_PATTERN = "{base_url}package/{pkgname}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: Optional[str] = None,
    ):
        """Set up the lister.

        Args:
            scheduler: scheduler instance forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
            url: base url of the registry; defaults to ``BASE_URL`` when
                not provided (or empty)
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url if url else self.BASE_URL,
        )
        # Ensure to set this with same value as the http api search endpoint use
        # (50 as of august 2022)
        self.page_size: int = 50

    def _base_url(self) -> str:
        """Return the lister url with exactly the trailing slash the patterns expect."""
        # The url patterns append a relative path directly after ``base_url``,
        # so a user-provided url without a trailing "/" would otherwise give
        # e.g. "https://hostpackages/search".
        return self.url if self.url.endswith("/") else f"{self.url}/"

    def _fetch_page(self, page: int) -> Dict[str, Any]:
        """POST the search query for page number ``page`` and return the decoded json."""
        # A fresh dict per request: nothing is shared between calls.
        params = {
            "page": page,
            "sortColumn": "default",
            "sortDirection": "ascending",
            "searchQuery": "(deprecated:any)",
        }
        return self.http_request(
            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self._base_url()),
            method="POST",
            json=params,
        ).json()

    def get_pages(self) -> Iterator[HackageListerPage]:
        """Yield the content of each result page of the package search.

        It uses the http api endpoint `https://hackage.haskell.org/packages/search`
        to get a list of package names from which we build an origin url.

        Results are paginated: the first response gives the total number of
        results, from which the number of remaining pages to request is
        computed using ``self.page_size``.
        """
        data = self._fetch_page(0)

        nb_entries: int = data["numberOfResults"]
        (nb_pages, remainder) = divmod(nb_entries, self.page_size)
        if remainder:
            # A last, partially filled page holds the remaining entries.
            nb_pages += 1
        yield data["pageContents"]

        # Page 0 has already been yielded above.
        for page in range(1, nb_pages):
            yield self._fetch_page(page)["pageContents"]

    def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per entry of the given page.

        The origin url is built from the entry's package display name and
        ``last_update`` is parsed from its ``lastUpload`` date.
        """
        assert self.lister_obj.id is not None

        base_url = self._base_url()
        for entry in page:
            pkgname = entry["name"]["display"]
            last_update = iso8601.parse_date(entry["lastUpload"])
            url = self.PACKAGE_INFO_URL_PATTERN.format(
                base_url=base_url, pkgname=pkgname
            )
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=url,
                last_update=last_update,
            )
|
19
swh/lister/hackage/tasks.py
Normal file
19
swh/lister/hackage/tasks.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from swh.lister.hackage.lister import HackageLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".HackageListerTask")
def list_hackage(**lister_args):
    """Lister task for Hackage, the Haskell Package Repository.

    Builds a HackageLister from the configuration file with ``lister_args``,
    runs it and returns the resulting stats as a dict.
    """
    lister = HackageLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task returning a constant, used to check the task is reachable."""
    return "OK"
|
0
swh/lister/hackage/tests/__init__.py
Normal file
0
swh/lister/hackage/tests/__init__.py
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
|||
{"numberOfResults":51,"pageContents":[{"description":"Command-line program for type-checking and compiling Agda programs","downloads":20,"lastUpload":"2012-03-12T11:01:45Z","maintainers":[{"display":"NilsAndersDanielsson","uri":"/user/NilsAndersDanielsson"},{"display":"UlfNorell","uri":"/user/UlfNorell"}],"name":{"display":"Agda-executable","uri":"/package/Agda-executable"},"tags":[{"display":"dependent-types","uri":"/packages/tag/dependent-types"},{"display":"deprecated","uri":"/packages/tag/deprecated"},{"display":"program","uri":"/packages/tag/program"}],"votes":0}]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
100
swh/lister/hackage/tests/test_lister.py
Normal file
100
swh/lister/hackage/tests/test_lister.py
Normal file
|
@ -0,0 +1,100 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import functools
|
||||
import json
|
||||
from pathlib import Path
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
from swh.lister.hackage.lister import HackageLister
|
||||
|
||||
|
||||
def json_callback(request, context, datadir):
    """Callback for requests_mock that loads a json file regarding a page number.

    The fixture is read from ``<datadir>/<scheme>_<hostname>/<path>_<page>``
    where ``<path>`` is the request url path with its leading slash (and one
    trailing slash, if any) removed and remaining slashes replaced by ``_``,
    and ``<page>`` is the ``page`` value of the request json body.

    Args:
        request: the mocked request; its ``json()`` body and ``url`` are read
        context: response context given by requests_mock (unused)
        datadir: root directory of the test data files

    Returns:
        The decoded json content of the matching fixture file.
    """
    page = request.json()["page"]

    url = urlparse(unquote(request.url))
    dirname = f"{url.scheme}_{url.hostname}"
    filename = url.path[1:]
    if filename.endswith("/"):
        filename = filename[:-1]
    filename = filename.replace("/", "_")

    fixture = Path(datadir, dirname, f"{filename}_{page}")
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # encoding of the machine running the tests.
    return json.loads(fixture.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
    """Run the lister against the mocked search endpoint and check listed origins."""
    requests_mock.post(
        url="https://hackage.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )

    # Build the expected origins from the same fixture files the mock serves.
    fixture_dir = Path(datadir, "https_hackage.haskell.org")
    expected_origins = []
    for page in [0, 1, 2]:
        fixture = fixture_dir / f"packages_search_{page}"
        for entry in json.loads(fixture.read_text())["pageContents"]:
            pkgname = entry["name"]["display"]
            expected_origins.append(
                {"url": f"https://hackage.haskell.org/package/{pkgname}"}
            )

    lister = HackageLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert stats.pages == 3
    assert stats.origins == stats.pages * 50

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(listed) == len(expected_origins)

    actual = {(origin.visit_type, origin.url) for origin in listed}
    wanted = {("hackage", expected["url"]) for expected in expected_origins}
    assert actual == wanted
|
||||
|
||||
|
||||
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
    """Check that 49 results are returned as a single page."""
    requests_mock.post(
        url="https://fake49.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )
    lister = HackageLister(scheduler=swh_scheduler, url="https://fake49.haskell.org/")

    page_sizes = [len(page) for page in lister.get_pages()]

    # there should be 1 page with 49 entries
    assert page_sizes == [49]
|
||||
|
||||
|
||||
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
    """Check that 51 results are split over a full page plus a one-entry page."""
    requests_mock.post(
        url="https://fake51.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )
    lister = HackageLister(scheduler=swh_scheduler, url="https://fake51.haskell.org/")

    page_sizes = [len(page) for page in lister.get_pages()]

    # there should be 2 pages with 50 + 1 entries
    assert page_sizes == [50, 1]
|
33
swh/lister/hackage/tests/test_tasks.py
Normal file
33
swh/lister/hackage/tests/test_tasks.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_hackage_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Send the ping task and check that it succeeds with "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.hackage.tasks.ping")
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"
|
||||
|
||||
|
||||
def test_hackage_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Send the lister task with a mocked HackageLister and check its result."""
    # setup the mocked HackageLister
    expected_stats = ListerStats(pages=42, origins=42)
    mocked_lister = mocker.patch("swh.lister.hackage.tasks.HackageLister")
    # from_configfile() hands back the same mock so run() can be inspected on it
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.hackage.tasks.HackageListerTask"
    )
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue