aur: Store packages index in memory instead of disk

Simplify the code that downloads the packages index: gzip and deflate
transfer-encodings are automatically decoded by requests, and there is
no point streaming a response of only a couple of megabytes to disk,
so keep the HTTP response in memory instead.

Also add more debug logs to track lister execution.
Author: Antoine Lambert
Date: 2022-09-02 15:36:20 +02:00
Parent: 7638f2028b
Commit: 92baa2b45c
3 changed files with 23 additions and 45 deletions
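For illustration, a minimal sketch of what the new in-memory download boils down to, assuming the index lives at https://aur.archlinux.org/packages-meta-v1.json.gz (the diff below only shows the format string, so the base URL is an assumption here) and that network access is available:

    import requests

    INDEX_URL = "https://aur.archlinux.org/packages-meta-v1.json.gz"

    # requests advertises Accept-Encoding: gzip, deflate and transparently
    # decodes the response body, so the couple-of-megabytes index can be
    # parsed straight from memory: no temporary file, no manual gunzip.
    packages = requests.get(INDEX_URL).json()
    print(len(packages), "packages in the AUR index")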

swh/lister/aur/lister.py

@@ -2,13 +2,10 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 import datetime
-import gzip
-import json
 import logging
-from pathlib import Path
-import shutil
-from typing import Any, Dict, Iterator, Optional
+from typing import Any, Dict, Iterator, List, Optional
 
 import requests
@@ -47,8 +44,6 @@ class AurLister(StatelessLister[AurListerPage]):
     PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
     PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"
-    DESTINATION_PATH = Path("/tmp/aur_archive")
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
@@ -61,7 +56,7 @@ class AurLister(StatelessLister[AurListerPage]):
             url=self.BASE_URL,
         )
 
-    def download_index_archive(self) -> Path:
+    def download_packages_index(self) -> List[Dict[str, Any]]:
         """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string,
         and download the archive to self.DESTINATION_PATH
@@ -69,16 +64,7 @@ class AurLister(StatelessLister[AurListerPage]):
             a directory Path where the archive has been downloaded to.
         """
         url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
-        filename = url.split("/")[-1]
-        destination = self.DESTINATION_PATH / filename
-        self.DESTINATION_PATH.mkdir(exist_ok=True)
-
-        response = requests.get(url, stream=True)
-        destination.write_bytes(response.raw.read())
-
-        assert destination.exists()
-        return destination
+        return requests.get(url).json()
 
     def get_pages(self) -> Iterator[AurListerPage]:
         """Yield an iterator which returns 'page'
@@ -88,27 +74,21 @@ class AurLister(StatelessLister[AurListerPage]):
         a canonical 'snapshot_url' from which a tar.gz archive of the package can
         be downloaded.
         """
-        index = self.download_index_archive()
-
-        with gzip.open(index, "rb") as f:
-            assert f.readable()
-            file_content = f.read()
-            packages = json.loads(file_content)
-
-        assert packages
-        counter: int = 0
+        packages = self.download_packages_index()
+        logger.debug("Found %s AUR packages in aur_index", len(packages))
 
         for package in packages:
             # Exclude lines where Name differs from PackageBase as they represents
             # split package and they don't have resolvable snapshots url
             if package["Name"] == package["PackageBase"]:
+                logger.debug("Processing AUR package %s", package["Name"])
                 pkgname = package["PackageBase"]
                 version = package["Version"]
                 project_url = package["URL"]
                 last_modified = datetime.datetime.fromtimestamp(
                     float(package["LastModified"]), tz=datetime.timezone.utc
                 ).isoformat()
-                counter += 1
                 yield {
                     "pkgname": pkgname,
                     "version": version,
@@ -121,7 +101,6 @@ class AurLister(StatelessLister[AurListerPage]):
                     "project_url": project_url,
                     "last_modified": last_modified,
                 }
-        logger.debug("Found %s AUR packages in aur_index", counter)
 
     def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
         """Iterate on all pages and yield ListedOrigin instances.
@@ -163,11 +142,3 @@ class AurLister(StatelessLister[AurListerPage]):
                 "aur_metadata": aur_metadata,
             },
         )
-
-    def finalize(self) -> None:
-        # Cleanup by removing the repository directory
-        if self.DESTINATION_PATH.exists():
-            shutil.rmtree(self.DESTINATION_PATH)
-            logger.debug(
-                "Successfully removed %s directory", str(self.DESTINATION_PATH)
-            )

swh/lister/aur/tests/test_lister.py

@@ -2,6 +2,11 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+import gzip
+import json
+import os
+
 from swh.lister.aur.lister import AurLister
 
 expected_origins = [
@@ -92,13 +97,22 @@ expected_origins = [
 ]
 
-def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
+def test_aur_lister(datadir, swh_scheduler, requests_mock):
     lister = AurLister(scheduler=swh_scheduler)
 
+    packages_index_filename = "packages-meta-v1.json.gz"
+
+    # simulate requests behavior: gzip and deflate transfer-encodings are automatically decoded
+    with gzip.open(os.path.join(datadir, packages_index_filename), "rb") as f:
+        requests_mock.get(
+            f"{lister.BASE_URL}/{packages_index_filename}", json=json.loads(f.read())
+        )
+
     res = lister.run()
 
     assert res.pages == 4
     assert res.origins == 4
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     assert [
@@ -116,10 +130,3 @@ def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
         )
         for expected in sorted(expected_origins, key=lambda expected: expected["url"])
     ]
-
-
-def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler):
-    lister = AurLister(scheduler=swh_scheduler)
-    lister.run()
-
-    # Repository directory should not exists after the lister runs
-    assert not lister.DESTINATION_PATH.exists()
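The mocking pattern introduced in test_aur_lister can be reused standalone; a minimal sketch with the requests-mock library (the requests_mock pytest fixture used above wraps the same Mocker API), again assuming the aur.archlinux.org base URL and using a made-up one-package payload:

    import requests
    import requests_mock

    # The json= kwarg serves an already-decoded JSON body, i.e. what requests
    # would hand back after undoing a gzip content-encoding; this is why the
    # test gunzips its .gz fixture before registering the mock.
    with requests_mock.Mocker() as m:
        m.get(
            "https://aur.archlinux.org/packages-meta-v1.json.gz",
            json=[{"Name": "hello", "PackageBase": "hello"}],
        )
        packages = requests.get(
            "https://aur.archlinux.org/packages-meta-v1.json.gz"
        ).json()
        assert packages[0]["Name"] == "hello"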