Elm stateful lister
Use another API endpoint that lets the lister be stateful. The endpoint takes a ``since`` value, which is a sequential index into the history of published package versions. The ``all_packages_count`` state stores the number of entries seen so far, and that count is used as the ``since`` argument on the next run.
This commit is contained in:
parent
4b1f49ac22
commit
82ee095128
7 changed files with 133 additions and 47 deletions
|
@ -12,20 +12,24 @@ Elm lister
|
|||
|
||||
Additional packages for the language can be searched from the `Packages`_ website
|
||||
and installed with `elm install`_ command. The Elm packages website also provides a
|
||||
`Http Api endpoint`_ listing all available packages.
|
||||
`Http Api endpoint`_ listing all package versions published after a given count of
|
||||
package versions.
|
||||
|
||||
Elm origins are Git repositories hosted on Github. Each repository must provide its
|
||||
packaged releases using the Github release system.
|
||||
Elm origins are Git repositories hosted on GitHub. Each repository must provide its
|
||||
packaged releases using the GitHub release system.
|
||||
|
||||
As of July 2023, `Packages`_ lists 1746 packages.
|
||||
|
||||
Origins retrieving strategy
|
||||
---------------------------
|
||||
|
||||
To build a list of origins we make a GET request to the `Http Api endpoint`_ that returns
|
||||
a Json array of objects.
|
||||
To build a list of origins we make a GET request to the `Http Api endpoint`_ with a
|
||||
``since`` argument (a sequential index in the history), which returns a JSON array
|
||||
of strings.
|
||||
Each string represents a new version for a package. The string is split to get the
|
||||
``name`` of the package.
|
||||
The origin url for each package is constructed from the corresponding
|
||||
`name` entry which represents the suffix of Github repositories (org/project_name).
|
||||
``name`` entry which represents the suffix of GitHub repositories (*org*/*project_name*).
|
||||
|
||||
Page listing
|
||||
------------
|
||||
|
@ -35,8 +39,8 @@ There is only one page listing all origins url.
|
|||
Origins from page
|
||||
-----------------
|
||||
|
||||
The lister is stateless and yields all origins url from one page. It is a list of package
|
||||
repository url.
|
||||
The lister is stateful and yields all new origin urls from one page since the last run.
|
||||
It is a list of package repository urls.
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
@ -63,7 +67,7 @@ You can follow lister execution by displaying logs of swh-lister service::
|
|||
.. _Elm: https://elm-lang.org/
|
||||
.. _Packages: https://package.elm-lang.org/
|
||||
.. _elm install: https://guide.elm-lang.org/install/elm.html#elm-install
|
||||
.. _Http Api endpoint: https://package.elm-lang.org/search.json
|
||||
.. _Http Api endpoint: https://package.elm-lang.org/all-packages/since/5000
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
@ -3,36 +3,47 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from typing import Any, Dict, Iterator, Optional, Set
|
||||
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
from ..pattern import CredentialsType, Lister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
ElmListerPage = List[Dict[str, Any]]
|
||||
ElmListerPage = Set[str]
|
||||
|
||||
|
||||
class ElmLister(StatelessLister[ElmListerPage]):
|
||||
@dataclass
class ElmListerState:
    """Store lister state for incremental mode operations.

    The state holds a single counter that is converted to and from a plain
    dictionary by the lister, so a later run can resume after the entries
    already seen.
    """

    all_packages_count: Optional[int] = None
    """Number of package version entries seen so far, used as the ``since``
    argument of the API endpoint url.

    This counts one entry per ``name@version`` string returned by the API, not
    one per package. It is ``None`` when no count has been stored yet.
    """
|
||||
|
||||
|
||||
class ElmLister(Lister[ElmListerState, ElmListerPage]):
|
||||
"""List Elm packages origins"""
|
||||
|
||||
LISTER_NAME = "elm"
|
||||
VISIT_TYPE = "git" # Elm origins url are Git repositories
|
||||
INSTANCE = "elm"
|
||||
|
||||
SEARCH_URL = "https://package.elm-lang.org/search.json"
|
||||
|
||||
BASE_URL = "https://package.elm-lang.org"
|
||||
ALL_PACKAGES_URL_PATTERN = "{base_url}/all-packages/since/{since}"
|
||||
REPO_URL_PATTERN = "https://github.com/{name}"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
scheduler: SchedulerInterface,
|
||||
credentials: Optional[CredentialsType] = None,
|
||||
url: str = SEARCH_URL,
|
||||
url: str = BASE_URL,
|
||||
instance: str = INSTANCE,
|
||||
max_origins_per_page: Optional[int] = None,
|
||||
max_pages: Optional[int] = None,
|
||||
|
@ -47,25 +58,47 @@ class ElmLister(StatelessLister[ElmListerPage]):
|
|||
max_pages=max_pages,
|
||||
enable_origins=enable_origins,
|
||||
)
|
||||
self.all_packages_count: int = 0
|
||||
self.session.headers.update({"Accept": "application/json"})
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> ElmListerState:
    """Build an ``ElmListerState`` from its dictionary form.

    Every key of ``d`` is passed as a keyword argument to the dataclass.
    """
    restored = ElmListerState(**d)
    return restored
|
||||
|
||||
def state_to_dict(self, state: ElmListerState) -> Dict[str, Any]:
    """Convert ``state`` to a plain dictionary using ``dataclasses.asdict``."""
    serialized = asdict(state)
    return serialized
|
||||
|
||||
def get_pages(self) -> Iterator[ElmListerPage]:
    """Yield the single page of package names published since the last run.

    It uses the Http api endpoint
    ``https://package.elm-lang.org/all-packages/since/:since`` to get a list of
    package versions, from which we get names corresponding to GitHub repository
    url suffixes. ``since`` is the count stored in the lister state, or 0 when
    no count has been stored yet.

    There is only one page, which lists all new origin names. Several versions
    of the same package collapse into one name because the page is a set.
    """
    # A missing (None) or zero stored count both mean "start from the beginning".
    since = self.state.all_packages_count or 0

    response = self.http_request(
        self.ALL_PACKAGES_URL_PATTERN.format(base_url=self.url, since=since)
    )
    # Decode the body once and reuse it for both the count and the names.
    entries = response.json()

    # We'll save this to the state in finalize()
    self.all_packages_count = len(entries) + since

    # Each entry looks like "org/project@version"; keep the part before "@".
    yield {entry.split("@")[0] for entry in entries}
|
||||
|
||||
def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]:
|
||||
"""Iterate on all pages and yield ListedOrigin instances"""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for entry in page:
|
||||
name: str = entry["name"]
|
||||
for name in page:
|
||||
repo_url: str = self.REPO_URL_PATTERN.format(name=name)
|
||||
|
||||
yield ListedOrigin(
|
||||
|
@ -74,3 +107,11 @@ class ElmLister(StatelessLister[ElmListerPage]):
|
|||
url=repo_url,
|
||||
last_update=None,
|
||||
)
|
||||
|
||||
def finalize(self) -> None:
    """Save the entry count gathered during this run into the lister state.

    The stored count is overwritten, and the lister flagged as updated, only
    when no count was stored yet or when this run's count is larger than the
    stored one.
    """
    stored = self.state.all_packages_count
    # Nothing to record: a count already exists and this run did not exceed it.
    if stored is not None and self.all_packages_count <= stored:
        return

    self.state.all_packages_count = self.all_packages_count
    self.updated = True
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
["mercurymedia/elm-ag-grid@20.0.0","elm-toulouse/cbor@3.4.0","elm-toulouse/cbor@3.3.0"]
|
|
@ -0,0 +1 @@
|
|||
["miniBill/elm-avataaars@1.1.1"]
|
|
@ -0,0 +1 @@
|
|||
[]
|
|
@ -1,20 +0,0 @@
|
|||
[
|
||||
{
|
||||
"name": "elm/bytes",
|
||||
"summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)",
|
||||
"license": "BSD-3-Clause",
|
||||
"version": "1.0.8"
|
||||
},
|
||||
{
|
||||
"name": "STTR13/ziplist",
|
||||
"summary": "List with a selected element that makes impossible state impossible.",
|
||||
"license": "BSD-3-Clause",
|
||||
"version": "1.4.2"
|
||||
},
|
||||
{
|
||||
"name": "cuducos/elm-format-number",
|
||||
"summary": "Format numbers as pretty strings",
|
||||
"license": "BSD-3-Clause",
|
||||
"version": "9.0.1"
|
||||
}
|
||||
]
|
|
@ -5,10 +5,13 @@
|
|||
|
||||
from swh.lister.elm.lister import ElmLister
|
||||
|
||||
expected_origins = [
|
||||
"https://github.com/STTR13/ziplist",
|
||||
"https://github.com/elm/bytes",
|
||||
"https://github.com/cuducos/elm-format-number",
|
||||
expected_origins_since_0 = [
|
||||
"https://github.com/elm-toulouse/cbor",
|
||||
"https://github.com/mercurymedia/elm-ag-grid",
|
||||
]
|
||||
|
||||
expected_origins_since_3 = [
|
||||
"https://github.com/miniBill/elm-avataaars",
|
||||
]
|
||||
|
||||
|
||||
|
@ -17,10 +20,57 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
|
|||
res = lister.run()
|
||||
|
||||
assert res.pages == 1
|
||||
assert res.origins == 1 + 1 + 1
|
||||
# 2 of the 3 entries are related to the same package so the origins count is 2
|
||||
assert res.origins == 2
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
|
||||
assert len(scheduler_origins) == len(expected_origins_since_0)
|
||||
assert {
|
||||
(
|
||||
scheduled.visit_type,
|
||||
scheduled.url,
|
||||
scheduled.last_update,
|
||||
)
|
||||
for scheduled in scheduler_origins
|
||||
} == {("git", expected, None) for expected in expected_origins_since_0}
|
||||
|
||||
# Check that all_packages_count is set
|
||||
assert lister.state.all_packages_count == 3 # 3 entries
|
||||
|
||||
|
||||
def test_elm_lister_incremental(datadir, requests_mock_datadir, swh_scheduler):
|
||||
# First run, since=0
|
||||
lister = ElmLister(scheduler=swh_scheduler)
|
||||
res = lister.run()
|
||||
|
||||
assert res.pages == 1
|
||||
# 2 of the 3 entries are related to the same package so the origins count is 2
|
||||
assert res.origins == 2
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
|
||||
assert len(scheduler_origins) == len(expected_origins_since_0)
|
||||
assert {
|
||||
(
|
||||
scheduled.visit_type,
|
||||
scheduled.url,
|
||||
scheduled.last_update,
|
||||
)
|
||||
for scheduled in scheduler_origins
|
||||
} == {("git", expected, None) for expected in expected_origins_since_0}
|
||||
|
||||
# Check that all_packages_count is set
|
||||
assert lister.state.all_packages_count == 3 # 3 entries
|
||||
|
||||
# Second run, since=3
|
||||
lister = ElmLister(scheduler=swh_scheduler)
|
||||
res = lister.run()
|
||||
assert res.pages == 1
|
||||
assert res.origins == 1
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
expected_origins = expected_origins_since_0 + expected_origins_since_3
|
||||
assert len(scheduler_origins) == len(expected_origins)
|
||||
assert {
|
||||
(
|
||||
|
@ -30,3 +80,11 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
|
|||
)
|
||||
for scheduled in scheduler_origins
|
||||
} == {("git", expected, None) for expected in expected_origins}
|
||||
assert lister.state.all_packages_count == 4 # 4 entries
|
||||
|
||||
# Third run, since=4, nothing new
|
||||
lister = ElmLister(scheduler=swh_scheduler)
|
||||
res = lister.run()
|
||||
assert res.pages == 1
|
||||
assert res.origins == 0
|
||||
assert lister.state.all_packages_count == 4 # 4 entries
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue