aur: Store packages index in memory instead of disk

Simplify the code that downloads the packages index: gzip and deflate
transfer-encodings are automatically decoded by requests, and there is
no point streaming a response of only a couple of megabytes to disk,
so keep the HTTP response in memory instead.

Also add more debug logs to track lister execution.
Author: Antoine Lambert
Date: 2022-09-02 15:36:20 +02:00
Parent: 7638f2028b
Commit: 92baa2b45c
3 changed files with 23 additions and 45 deletions
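For illustration, a minimal sketch of what the new in-memory download boils down to, assuming the index lives at https://aur.archlinux.org/packages-meta-v1.json.gz (the diff below only shows the format string, so the base URL is an assumption here) and that network access is available:

    import requests

    INDEX_URL = "https://aur.archlinux.org/packages-meta-v1.json.gz"

    # requests advertises Accept-Encoding: gzip, deflate and transparently
    # decodes the response body, so the couple-of-megabytes index can be
    # parsed straight from memory: no temporary file, no manual gunzip.
    packages = requests.get(INDEX_URL).json()
    print(len(packages), "packages in the AUR index")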

swh/lister/aur/lister.py

@@ -2,13 +2,10 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 import datetime
-import gzip
-import json
 import logging
-from pathlib import Path
-import shutil
-from typing import Any, Dict, Iterator, Optional
+from typing import Any, Dict, Iterator, List, Optional
 
 import requests
@@ -47,8 +44,6 @@ class AurLister(StatelessLister[AurListerPage]):
     PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
     PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"
-    DESTINATION_PATH = Path("/tmp/aur_archive")
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
@@ -61,7 +56,7 @@ class AurLister(StatelessLister[AurListerPage]):
             url=self.BASE_URL,
         )
 
-    def download_index_archive(self) -> Path:
+    def download_packages_index(self) -> List[Dict[str, Any]]:
         """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string,
         and download the archive to self.DESTINATION_PATH
@@ -69,16 +64,7 @@ class AurLister(StatelessLister[AurListerPage]):
             a directory Path where the archive has been downloaded to.
         """
         url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
-        filename = url.split("/")[-1]
-        destination = self.DESTINATION_PATH / filename
-        self.DESTINATION_PATH.mkdir(exist_ok=True)
-
-        response = requests.get(url, stream=True)
-        destination.write_bytes(response.raw.read())
-
-        assert destination.exists()
-        return destination
+        return requests.get(url).json()
 
     def get_pages(self) -> Iterator[AurListerPage]:
         """Yield an iterator which returns 'page'
@@ -88,27 +74,21 @@ class AurLister(StatelessLister[AurListerPage]):
         a canonical 'snapshot_url' from which a tar.gz archive of the package can
         be downloaded.
         """
-        index = self.download_index_archive()
-
-        with gzip.open(index, "rb") as f:
-            assert f.readable()
-            file_content = f.read()
-            packages = json.loads(file_content)
-
-        assert packages
-        counter: int = 0
+        packages = self.download_packages_index()
+        logger.debug("Found %s AUR packages in aur_index", len(packages))
 
         for package in packages:
             # Exclude lines where Name differs from PackageBase as they represents
             # split package and they don't have resolvable snapshots url
             if package["Name"] == package["PackageBase"]:
+                logger.debug("Processing AUR package %s", package["Name"])
                 pkgname = package["PackageBase"]
                 version = package["Version"]
                 project_url = package["URL"]
                 last_modified = datetime.datetime.fromtimestamp(
                     float(package["LastModified"]), tz=datetime.timezone.utc
                 ).isoformat()
-                counter += 1
                 yield {
                     "pkgname": pkgname,
                     "version": version,
@@ -121,7 +101,6 @@ class AurLister(StatelessLister[AurListerPage]):
                     "project_url": project_url,
                     "last_modified": last_modified,
                 }
-        logger.debug("Found %s AUR packages in aur_index", counter)
 
     def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
         """Iterate on all pages and yield ListedOrigin instances.
@@ -163,11 +142,3 @@ class AurLister(StatelessLister[AurListerPage]):
                 "aur_metadata": aur_metadata,
             },
         )
-
-    def finalize(self) -> None:
-        # Cleanup by removing the repository directory
-        if self.DESTINATION_PATH.exists():
-            shutil.rmtree(self.DESTINATION_PATH)
-            logger.debug(
-                "Successfully removed %s directory", str(self.DESTINATION_PATH)
-            )

swh/lister/aur/tests/test_lister.py

@@ -2,6 +2,11 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+import gzip
+import json
+import os
+
 from swh.lister.aur.lister import AurLister
 
 expected_origins = [
@@ -92,13 +97,22 @@ expected_origins = [
 ]
 
-def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
+def test_aur_lister(datadir, swh_scheduler, requests_mock):
     lister = AurLister(scheduler=swh_scheduler)
 
+    packages_index_filename = "packages-meta-v1.json.gz"
+
+    # simulate requests behavior: gzip and deflate transfer-encodings are automatically decoded
+    with gzip.open(os.path.join(datadir, packages_index_filename), "rb") as f:
+        requests_mock.get(
+            f"{lister.BASE_URL}/{packages_index_filename}", json=json.loads(f.read())
+        )
+
     res = lister.run()
 
     assert res.pages == 4
     assert res.origins == 4
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     assert [
@@ -116,10 +130,3 @@ def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
         )
         for expected in sorted(expected_origins, key=lambda expected: expected["url"])
     ]
-
-
-def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler):
-    lister = AurLister(scheduler=swh_scheduler)
-    lister.run()
-
-    # Repository directory should not exists after the lister runs
-    assert not lister.DESTINATION_PATH.exists()
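The mocking pattern introduced in test_aur_lister can be reused standalone; a minimal sketch with the requests-mock library (the requests_mock pytest fixture used above wraps the same Mocker API), again assuming the aur.archlinux.org base URL and using a made-up one-package payload:

    import requests
    import requests_mock

    # The json= kwarg serves an already-decoded JSON body, i.e. what requests
    # would hand back after undoing a gzip content-encoding; this is why the
    # test gunzips its .gz fixture before registering the mock.
    with requests_mock.Mocker() as m:
        m.get(
            "https://aur.archlinux.org/packages-meta-v1.json.gz",
            json=[{"Name": "hello", "PackageBase": "hello"}],
        )
        packages = requests.get(
            "https://aur.archlinux.org/packages-meta-v1.json.gz"
        ).json()
        assert packages[0]["Name"] == "hello"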