From 2e6e282d4464da4668eaf0f99fad8139a5d9a653 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Wed, 5 Oct 2022 15:58:50 +0200 Subject: [PATCH] nixguix: Deal with manifest entries without an integrity field In that case, this falls back to using the "outputHash" field, which is equivalent to the integrity field except that it is used for the "recursive" outputHashMode. This adds the necessary assertions around this case so correct data is sent to loaders as well. Related to T3781 --- swh/lister/nixguix/lister.py | 30 +++++++++++++++---- .../nixguix/tests/data/guix-swh_sources.json | 10 +++++++ .../tests/data/nixpkgs-swh_sources.json | 10 +++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 5c1983a..59976c3 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -16,6 +16,7 @@ Artifacts can be of types: """ import base64 +import binascii from dataclasses import dataclass from enum import Enum import logging @@ -362,11 +363,20 @@ class NixGuixLister(StatelessLister[PageResult]): yield built_artifact continue + outputHash = artifact.get("outputHash") integrity = artifact.get("integrity") - if integrity is None: - logger.warning("Skipping url <%s>: missing integrity field", origin) + if integrity is None and outputHash is None: + logger.warning( + "Skipping url <%s>: missing integrity and outputHash field", + origin, + ) continue + # Falls back to outputHash field if integrity is missing + if integrity is None and outputHash: + # We'll deal with outputHash as integrity field + integrity = outputHash + try: is_tar, origin = is_tarball(urls, self.session) except ArtifactNatureMistyped: @@ -396,10 +406,18 @@ class NixGuixLister(StatelessLister[PageResult]): # convert into a dict of checksums. 
This only parses the # `hash-expression` (hash-) as defined in # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute - chksum_algo, chksum_b64 = integrity.split("-") - checksums: Dict[str, str] = { - chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() - } + try: + chksum_algo, chksum_b64 = integrity.split("-") + checksums: Dict[str, str] = { + chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() + } + except binascii.Error: + logger.exception( + "Skipping url: <%s>: integrity computation failure for <%s>", + url, + artifact, + ) + continue # The 'outputHashMode' attribute determines how the hash is computed. It # must be one of the following two values: diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json index fdb9c4a..096ea7e 100644 --- a/swh/lister/nixguix/tests/data/guix-swh_sources.json +++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json @@ -27,6 +27,16 @@ ], "integrity": "sha256-lV3xiWUZmSnt4LW0ni/sUyC/bbtaxkTzvFLFtJKLuI4=" }, + { + "outputHash": "sha256-9uF0fYl4Zz/Ia2UKx7CBi8ZU8jfWoBfy2QSgTSwXo5A", + "outputHashAlgo": null, + "outputHashMode": "recursive", + "type": "url", + "urls": [ + "https://github.com/figiel/hosts/archive/v1.0.0.tar.gz" + ], + "inferredFetcher": "fetchzip" + }, { "type": "url", "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ], diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json index 9deb967..bb1943c 100644 --- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -79,6 +79,16 @@ "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", "svn_revision": 39057 }, + { + "outputHash": "sha256-LxVcYj2WKHbhNu5x/DFkxQPOYrVkNvwiE/qcODq52Lc=", + "outputHashAlgo": null, + "outputHashMode": "recursive", + "type": "url", + "urls": [ + 
"https://github.com/julian-klode/triehash/archive/debian/0.3-3.tar.gz" + ], + "inferredFetcher": "fetchzip" + }, { "type": "url", "urls": [