Elm stateful lister

Use another Api endpoint that helps the lister to be stateful.
The Api endpoint used needs a ``since`` value that represents a
sequential index in the history.
The ``all_packages_count`` state stores a count which is then used as the
``since`` argument on the next run.
This commit is contained in:
Franck Bret 2023-12-21 09:54:29 +01:00
parent 4b1f49ac22
commit 82ee095128
7 changed files with 133 additions and 47 deletions

View file

@ -12,20 +12,24 @@ Elm lister
Additional packages for the language can be searched from the `Packages`_ website
and installed with `elm install`_ command. The Elm packages website also provides a
`Http Api endpoint`_ listing all available packages.
`Http Api endpoint`_ listing all package versions released after a given count of
package versions (a sequential index in the release history).
Elm origins are Git repositories hosted on Github. Each repository must provide its
packaged releases using the Github release system.
Elm origins are Git repositories hosted on GitHub. Each repository must provide its
packaged releases using the GitHub release system.
As of July 2023, `Packages`_ lists 1746 packages.
Origins retrieving strategy
---------------------------
To build a list of origins we make a GET request to the `Http Api endpoint`_ that returns
a Json array of objects.
To build a list of origins we make a GET request to the `Http Api endpoint`_ with a
``since`` argument as a sequential index in the history which returns a Json array
of strings.
Each string represents a new version for a package. The string is split to get the
``name`` of the package.
The origin url for each package is constructed with the information of corresponding
`name` entry which represents the suffix of Github repositories (org/project_name).
``name`` entry which represents the suffix of GitHub repositories (*org*/*project_name*).
Page listing
------------
@ -35,8 +39,8 @@ There is only one page listing all origins url.
Origins from page
-----------------
The lister is stateless and yields all origins url from one page. It is a list of package
repository url.
The lister is stateful and yields all new origin URLs from one page since the last run.
It is a list of package repository URLs.
Running tests
-------------
@ -63,7 +67,7 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _Elm: https://elm-lang.org/
.. _Packages: https://package.elm-lang.org/
.. _elm install: https://guide.elm-lang.org/install/elm.html#elm-install
.. _Http Api endpoint: https://package.elm-lang.org/search.json
.. _Http Api endpoint: https://package.elm-lang.org/all-packages/since/5000
"""

View file

@ -3,36 +3,47 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import logging
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, Optional, Set
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
ElmListerPage = List[Dict[str, Any]]
ElmListerPage = Set[str]
class ElmLister(StatelessLister[ElmListerPage]):
@dataclass
class ElmListerState:
    """Lister state kept between runs so that listing can be incremental"""

    # Count of all package versions seen so far. It is passed as the ``since``
    # argument of the API endpoint url on the next run; ``None`` means that no
    # count has been recorded yet.
    all_packages_count: Optional[int] = None
class ElmLister(Lister[ElmListerState, ElmListerPage]):
"""List Elm packages origins"""
LISTER_NAME = "elm"
VISIT_TYPE = "git" # Elm origins url are Git repositories
INSTANCE = "elm"
SEARCH_URL = "https://package.elm-lang.org/search.json"
BASE_URL = "https://package.elm-lang.org"
ALL_PACKAGES_URL_PATTERN = "{base_url}/all-packages/since/{since}"
REPO_URL_PATTERN = "https://github.com/{name}"
def __init__(
self,
scheduler: SchedulerInterface,
credentials: Optional[CredentialsType] = None,
url: str = SEARCH_URL,
url: str = BASE_URL,
instance: str = INSTANCE,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -47,25 +58,47 @@ class ElmLister(StatelessLister[ElmListerPage]):
max_pages=max_pages,
enable_origins=enable_origins,
)
self.all_packages_count: int = 0
self.session.headers.update({"Accept": "application/json"})
def state_from_dict(self, d: Dict[str, Any]) -> ElmListerState:
    """Rebuild an ``ElmListerState`` from its dict form.

    Each key of ``d`` is passed as a keyword argument to the dataclass
    constructor.
    """
    state = ElmListerState(**d)
    return state
def state_to_dict(self, state: ElmListerState) -> Dict[str, Any]:
    """Convert the lister state dataclass into a plain dict with ``asdict``."""
    serialized: Dict[str, Any] = asdict(state)
    return serialized
def get_pages(self) -> Iterator[ElmListerPage]:
    """Yield the single page of package names released since the last run.

    It uses the Http api endpoint
    ``https://package.elm-lang.org/all-packages/since/:since`` to get a list of
    package versions (``org/project_name@version`` strings) from which we get
    names corresponding to GitHub repository url suffixes.

    ``since`` is the ``all_packages_count`` value stored in the lister state,
    or 0 when no count has been stored yet.

    There is only one page that lists all origins urls.

    Yields:
        A set of package names; several versions of the same package collapse
        to one name.
    """
    since = self.state.all_packages_count or 0
    response = self.http_request(
        self.ALL_PACKAGES_URL_PATTERN.format(base_url=self.url, since=since)
    )
    # Decode the body once: it is needed for both the count and the names.
    entries = response.json()
    # We'll save this to the state in finalize()
    self.all_packages_count = len(entries) + since
    # Keep only the part before "@", i.e. drop the version suffix.
    yield {entry.split("@")[0] for entry in entries}
def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances"""
assert self.lister_obj.id is not None
for entry in page:
name: str = entry["name"]
for name in page:
repo_url: str = self.REPO_URL_PATTERN.format(name=name)
yield ListedOrigin(
@ -74,3 +107,11 @@ class ElmLister(StatelessLister[ElmListerPage]):
url=repo_url,
last_update=None,
)
def finalize(self) -> None:
    """Save the package count gathered while listing into the lister state.

    The state is written, and ``updated`` set to True, only when no count was
    stored yet or when the new count is strictly greater than the stored one.
    """
    stored = self.state.all_packages_count
    if stored is not None and self.all_packages_count <= stored:
        # Nothing new since the stored count: leave the state untouched.
        return
    self.state.all_packages_count = self.all_packages_count
    self.updated = True

View file

@ -0,0 +1 @@
["mercurymedia/elm-ag-grid@20.0.0","elm-toulouse/cbor@3.4.0","elm-toulouse/cbor@3.3.0"]

View file

@ -0,0 +1 @@
["miniBill/elm-avataaars@1.1.1"]

View file

@ -1,20 +0,0 @@
[
{
"name": "elm/bytes",
"summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)",
"license": "BSD-3-Clause",
"version": "1.0.8"
},
{
"name": "STTR13/ziplist",
"summary": "List with a selected element that makes impossible state impossible.",
"license": "BSD-3-Clause",
"version": "1.4.2"
},
{
"name": "cuducos/elm-format-number",
"summary": "Format numbers as pretty strings",
"license": "BSD-3-Clause",
"version": "9.0.1"
}
]

View file

@ -5,10 +5,13 @@
from swh.lister.elm.lister import ElmLister
expected_origins = [
"https://github.com/STTR13/ziplist",
"https://github.com/elm/bytes",
"https://github.com/cuducos/elm-format-number",
expected_origins_since_0 = [
"https://github.com/elm-toulouse/cbor",
"https://github.com/mercurymedia/elm-ag-grid",
]
expected_origins_since_3 = [
"https://github.com/miniBill/elm-avataaars",
]
@ -17,10 +20,57 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
res = lister.run()
assert res.pages == 1
assert res.origins == 1 + 1 + 1
# 2 of the 3 entries are related to the same package so the origins count is 2
assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins_since_0)
assert {
(
scheduled.visit_type,
scheduled.url,
scheduled.last_update,
)
for scheduled in scheduler_origins
} == {("git", expected, None) for expected in expected_origins_since_0}
# Check that all_packages_count is set
assert lister.state.all_packages_count == 3 # 3 entries
def test_elm_lister_incremental(datadir, requests_mock_datadir, swh_scheduler):
# First run, since=0
lister = ElmLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 1
# 2 of the 3 entries are related to the same package so the origins count is 2
assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins_since_0)
assert {
(
scheduled.visit_type,
scheduled.url,
scheduled.last_update,
)
for scheduled in scheduler_origins
} == {("git", expected, None) for expected in expected_origins_since_0}
# Check that all_packages_count is set
assert lister.state.all_packages_count == 3 # 3 entries
# Second run, since=3
lister = ElmLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 1
assert res.origins == 1
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
expected_origins = expected_origins_since_0 + expected_origins_since_3
assert len(scheduler_origins) == len(expected_origins)
assert {
(
@ -30,3 +80,11 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
)
for scheduled in scheduler_origins
} == {("git", expected, None) for expected in expected_origins}
assert lister.state.all_packages_count == 4 # 4 entries
# Third run, since=4, nothing new
lister = ElmLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 1
assert res.origins == 0
assert lister.state.all_packages_count == 4 # 4 entries