From 93f17d4d9cb103d39448d4eed2031a825e3c0125 Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Fri, 3 Dec 2021 16:09:36 +0100 Subject: [PATCH] debian: Provide last_update to produced ListedOrigin models Use the value of the "Last-Modified" header from the HTTP response resulting from the debian sources index HTTP request. It will prevent the creation of loading tasks for debian packages with no changes since the last listing. Related to T2400 --- swh/lister/debian/lister.py | 6 ++++++ swh/lister/debian/tests/test_lister.py | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 32049c7..2537235 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -7,6 +7,7 @@ import bz2 from collections import defaultdict from dataclasses import dataclass, field +from email.utils import parsedate_to_datetime import gzip from itertools import product import logging @@ -134,6 +135,10 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): response = requests.get(url, stream=True) logging.debug("Fetched URL: %s, status code: %s", url, response.status_code) if response.status_code == 200: + last_modified = response.headers.get("Last-Modified") + self.last_sources_update = ( + parsedate_to_datetime(last_modified) if last_modified else None + ) decompressor = decompressors.get(compression) if decompressor: data = decompressor(response.raw).readlines() @@ -224,6 +229,7 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): url=origin_url, visit_type="deb", extra_loader_arguments={"packages": {}}, + last_update=self.last_sources_update, ) # origin will be yielded at the end of that method origins_to_send[origin_url] = self.listed_origins[origin_url] diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py index d741a02..1ce5de2 100644 --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -4,6 +4,7 
@@ # See top-level LICENSE file for more information from collections import defaultdict +from email.utils import formatdate import os from pathlib import Path from typing import Dict, List, Set, Tuple @@ -78,7 +79,11 @@ def _init_test( if compression: requests_mock.get(idx_url, status_code=404) else: - requests_mock.get(idx_url, text=sources) + requests_mock.get( + idx_url, + text=sources, + headers={"Last-Modified": formatdate(usegmt=True)}, + ) for idx_url, _ in lister.debian_index_urls(suite, _components[1]): requests_mock.get(idx_url, status_code=404) @@ -122,6 +127,7 @@ def _check_listed_origins( ] assert filtered_origins + assert filtered_origins[0].last_update is not None packages = filtered_origins[0].extra_loader_arguments["packages"] # check the version info are available assert package_version_key in packages