crates: Speed up listing by processing crates in batches
Instead of putting a single crate and its versions info on each page, pack up to 1000 crates per page to significantly speed up the listing process.
commit 42e76ee62e (parent c6aa490fc1)
2 changed files with 54 additions and 48 deletions
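To illustrate the batching idea, here is a minimal sketch using itertools to stand in for swh.core.utils.grouper, which is assumed here to yield successive chunks of up to n items from an iterable:

from itertools import islice
from typing import Iterator, List

def grouper_sketch(items, n: int) -> Iterator[List]:
    # Stand-in for swh.core.utils.grouper: yield successive chunks of up to n items.
    it = iter(items)
    while chunk := list(islice(it, n)):
        yield chunk

# Before: one page per crate, i.e. len(dataset) pages.
# After: one page per chunk, i.e. ceil(len(dataset) / 1000) pages.
dataset = {f"crate-{i}": {"versions": {"1.0.0": {}}} for i in range(2500)}
assert len(list(grouper_sketch(dataset.items(), 1000))) == 3  # 1000 + 1000 + 500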
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -17,6 +17,7 @@ from urllib.parse import urlparse
 import iso8601
 from looseversion import LooseVersion2
 
+from swh.core.utils import grouper
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -25,7 +26,7 @@ from ..pattern import CredentialsType, Lister
 logger = logging.getLogger(__name__)
 
 # Aliasing the page results returned by `get_pages` method from the lister.
-CratesListerPage = List[Dict[str, Any]]
+CratesListerPage = List[List[Dict[str, Any]]]
 
 
 @dataclass
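The redefined CratesListerPage alias encodes the new page layout: a page is no longer a flat list of version entries for one crate, but a list of per-crate lists. An illustration with invented entries:

from typing import Any, Dict, List

# Old shape: one crate per page; the page lists that crate's versions.
old_page: List[Dict[str, Any]] = [
    {"name": "rand", "version": "0.8.4"},
    {"name": "rand", "version": "0.8.5"},
]

# New shape: up to 1000 crates per page; one inner list per crate.
new_page: List[List[Dict[str, Any]]] = [
    [{"name": "rand", "version": "0.8.4"}, {"name": "rand", "version": "0.8.5"}],
    [{"name": "serde", "version": "1.0.0"}],
]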
@@ -198,17 +199,21 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
         logger.debug("Found %s crates in crates_index", len(dataset))
 
-        # Each entry from dataset will correspond to a page
-        for name, item in dataset.items():
+        # a page contains up to 1000 crates with versions info
+        for crates in grouper(dataset.items(), 1000):
             page = []
-            # sort crate versions
-            versions = sorted(item["versions"].keys(), key=LooseVersion2)
-
-            for version in versions:
-                v = item["versions"][version]
-                v["name"] = name
-                v["version"] = version
-                page.append(self.page_entry_dict(v))
+            for name, item in crates:
+                crate_versions = []
+                # sort crate versions
+                versions = sorted(item["versions"].keys(), key=LooseVersion2)
+
+                for version in versions:
+                    v = item["versions"][version]
+                    v["name"] = name
+                    v["version"] = version
+                    crate_versions.append(self.page_entry_dict(v))
+
+                page.append(crate_versions)
 
             yield page
         self.all_crates_processed = True
 
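Within each page, versions are still sorted per crate with LooseVersion2 as the sort key. A quick check of the ordering this gives, assuming looseversion's usual numeric comparison of dotted components:

from looseversion import LooseVersion2

versions = ["0.10.0", "0.2.0", "0.9.1"]
# Components compare numerically, so "0.10.0" sorts after "0.9.1".
assert sorted(versions, key=LooseVersion2) == ["0.2.0", "0.9.1", "0.10.0"]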
@@ -217,45 +222,46 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
         """Iterate on all crate pages and yield ListedOrigin instances."""
         assert self.lister_obj.id is not None
 
-        url = self.CRATE_URL_PATTERN.format(crate=page[0]["name"])
-        last_update = page[0]["last_update"]
+        for crate_versions in page:
+            url = self.CRATE_URL_PATTERN.format(crate=crate_versions[0]["name"])
+            last_update = crate_versions[0]["last_update"]
 
-        artifacts = []
-        crates_metadata = []
+            artifacts = []
+            crates_metadata = []
 
-        for entry in page:
-            # Build an artifact entry following original-artifacts-json specification
-            # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json  # noqa: B950
-            artifacts.append(
-                {
-                    "version": entry["version"],
-                    "filename": entry["filename"],
-                    "url": entry["crate_file"],
-                    "checksums": {
-                        "sha256": entry["checksum"],
-                    },
-                }
-            )
+            for entry in crate_versions:
+                # Build an artifact entry following original-artifacts-json specification
+                # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json  # noqa: B950
+                artifacts.append(
+                    {
+                        "version": entry["version"],
+                        "filename": entry["filename"],
+                        "url": entry["crate_file"],
+                        "checksums": {
+                            "sha256": entry["checksum"],
+                        },
+                    }
+                )
 
-            crates_metadata.append(
-                {
-                    "version": entry["version"],
-                    "yanked": entry["yanked"],
-                    "last_update": entry["last_update"],
-                }
-            )
+                crates_metadata.append(
+                    {
+                        "version": entry["version"],
+                        "yanked": entry["yanked"],
+                        "last_update": entry["last_update"],
+                    }
+                )
 
-        yield ListedOrigin(
-            lister_id=self.lister_obj.id,
-            visit_type=self.VISIT_TYPE,
-            url=url,
-            last_update=iso8601.parse_date(last_update),
-            extra_loader_arguments={
-                "artifacts": artifacts,
-                "crates_metadata": crates_metadata,
-            },
-        )
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=url,
+                last_update=iso8601.parse_date(last_update),
+                extra_loader_arguments={
+                    "artifacts": artifacts,
+                    "crates_metadata": crates_metadata,
+                },
+            )
 
     def finalize(self) -> None:
         if not self.state.index_last_update and self.all_crates_processed:
             last = iso8601.parse_date(self.index_metadata["timestamp"])
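get_origins_from_page now iterates over the per-crate inner lists and yields one ListedOrigin per crate, as before; only the iteration level moved. A sketch of the loader arguments built for a hypothetical single-version crate (all field values invented):

entry = {
    "name": "rand",
    "version": "0.8.5",
    "filename": "rand-0.8.5.crate",
    "crate_file": "https://example.invalid/crates/rand/rand-0.8.5.crate",
    "checksum": "0" * 64,  # placeholder sha256
    "yanked": False,
    "last_update": "2024-01-01T00:00:00+00:00",
}

extra_loader_arguments = {
    "artifacts": [
        {
            "version": entry["version"],
            "filename": entry["filename"],
            "url": entry["crate_file"],
            "checksums": {"sha256": entry["checksum"]},
        }
    ],
    "crates_metadata": [
        {
            "version": entry["version"],
            "yanked": entry["yanked"],
            "last_update": entry["last_update"],
        }
    ],
}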
@@ -175,7 +175,7 @@ def test_crates_lister(datadir, tmp_path, swh_scheduler, requests_mock_datadir):
     lister = CratesLister(scheduler=swh_scheduler)
     res = lister.run()
 
-    assert res.pages == 3
+    assert res.pages == 1
     assert res.origins == 3
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
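The updated assertion follows directly from the batching: the fixture's 3 crates fit in a single batch of 1000, so one page is emitted while the origin count is unchanged:

import math

crates_in_fixture = 3  # matches the unchanged `res.origins == 3`
assert math.ceil(crates_in_fixture / 1000) == 1  # expected res.pages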
@@ -205,7 +205,7 @@ def test_crates_lister_incremental(
     lister = CratesLister(scheduler=swh_scheduler)
     first = lister.run()
 
-    assert first.pages == 3
+    assert first.pages == 1
     assert first.origins == 3
 
     second = lister.run()