From ebdb959823bc004ff1766e97a409561dc48dcf3d Mon Sep 17 00:00:00 2001
From: Archit Agrawal
Date: Mon, 27 May 2019 22:04:03 +0530
Subject: [PATCH] swh.lister.gnu: Change download method of tree.json file to
 requests

Previously the gnu lister reused the tarball loader's code to download,
unzip and read the tree.json file. To make the code more concise, the
file is now downloaded directly with the requests library.
---
 swh/lister/gnu/lister.py | 118 +++++----------------------------------
 1 file changed, 14 insertions(+), 104 deletions(-)

diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
index bd821d4..f91dbf2 100644
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -5,100 +5,21 @@
 import random
 import gzip
 import json
-import os
 import requests
 
-from urllib.parse import urlparse
 from .models import GNUModel
 
 from swh.scheduler import utils
 from swh.lister.core.simple_lister import SimpleLister
-from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
-
-
-class LocalResponse:
-    """Local Response class with iter_content api
-
-    """
-    def __init__(self, path):
-        self.path = path
-
-    def iter_content(self, chunk_size=None):
-        with open(self.path, 'rb') as f:
-            while True:
-                chunk = f.read(chunk_size)
-                if not chunk:
-                    break
-                yield chunk
-
-
-class ArchiveFetcher:
-    """Http/Local client in charge of downloading archives from a
-    remote/local server.
-
-    Args:
-        temp_directory (str): Path to the temporary disk location used
-                              for downloading the release artifacts
-
-    """
-    def __init__(self, temp_directory=None):
-        self.temp_directory = os.getcwd()
-        self.session = requests.session()
-        self.params = {
-            'headers': {
-                'User-Agent': 'Software Heritage Lister ( __devl__)'
-            }
-        }
-
-    def download(self, url):
-        """Download the remote tarball url locally.
-
-        Args:
-            url (str): Url (file or http*)
-
-        Raises:
-            ValueError in case of failing to query
-
-        Returns:
-            Tuple of local (filepath, hashes of filepath)
-
-        """
-        url_parsed = urlparse(url)
-        if url_parsed.scheme == 'file':
-            path = url_parsed.path
-            response = LocalResponse(path)
-            length = os.path.getsize(path)
-        else:
-            response = self.session.get(url, **self.params, stream=True)
-            if response.status_code != 200:
-                raise ValueError("Fail to query '%s'. Reason: %s" % (
-                    url, response.status_code))
-            length = int(response.headers['content-length'])
-
-        filepath = os.path.join(self.temp_directory, os.path.basename(url))
-
-        h = MultiHash(length=length)
-        with open(filepath, 'wb') as f:
-            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
-                h.update(chunk)
-                f.write(chunk)
-
-        actual_length = os.path.getsize(filepath)
-        if length != actual_length:
-            raise ValueError('Error when checking size: %s != %s' % (
-                length, actual_length))
-
-        return filepath
-
-
-class GNULister(SimpleLister, ArchiveFetcher):
+class GNULister(SimpleLister):
     MODEL = GNUModel
     LISTER_NAME = 'gnu'
     TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
 
     def __init__(self, override_config=None):
         SimpleLister.__init__(self, override_config=override_config)
-        ArchiveFetcher.__init__(self, override_config=override_config)
 
     def task_dict(self, origin_type, origin_url, **kwargs):
         """(Override)
@@ -106,7 +27,6 @@ class GNULister(SimpleLister, ArchiveFetcher):
 
         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.
-
         """
         _type = 'load-%s' % origin_type
         _policy = 'recurring'
@@ -116,26 +36,18 @@ class GNULister(SimpleLister, ArchiveFetcher):
             _type, _policy, project_name, origin_url,
             project_metadata_url=project_metadata_url)
 
-    def download_file(self):
+    def get_file(self):
         '''
-        Downloads tree.json file and returns its location
+        Downloads and unzips the tree.json.gz file and returns its
+        content in JSON format
 
         Returns
-            File path of the downloaded file
+            File content in JSON format
         '''
-        file_path, hash_dict = self.download(self.TREE_URL)
-        return file_path
-
-    def read_downloaded_file(self, file_path):
-        '''
-        Reads the downloaded file content and convert it into json format
-
-        Returns
-            File content in json format
-        '''
-        with gzip.GzipFile(file_path, 'r') as fin:
-            response = json.loads(fin.read().decode('utf-8'))
-        return response
+        response = requests.get('https://ftp.gnu.org/tree.json.gz',
+                                allow_redirects=True)
+        uncompressed_content = gzip.decompress(response.content)
+        return json.loads(uncompressed_content.decode('utf-8'))
 
     def safely_issue_request(self, identifier):
         '''(Override)Make network request with to download the file which
@@ -146,17 +58,15 @@ class GNULister(SimpleLister, ArchiveFetcher):
         Returns:
             server response
         '''
-        file_path = self.download_file()
-        response = self.read_downloaded_file(file_path)
+        response = self.get_file()
         return response
 
     def list_packages(self, response):
         """(Override) List the actual gnu origins with their names and
         time last updated from the response.
-
         """
         response = clean_up_response(response)
-        _packages = []
+        packages = []
         for directory in response:
             content = directory['contents']
             for repo in content:
@@ -167,9 +77,9 @@ class GNULister(SimpleLister, ArchiveFetcher):
                                                      repo['name']),
                         'time_modified': repo['time']
                     }
-                    _packages.append(repo_details)
-        random.shuffle(_packages)
-        return _packages
+                    packages.append(repo_details)
+        random.shuffle(packages)
+        return packages
 
     def _get_project_url(self, dir_name, package_name):
         """Returns project_url