swh.lister.gnu: Change download method of tree.json file to requests

Previously the gnu lister used the same code as the tarball loader
to download, unzip and read the tree.json file.
To make the code concise, the download method was changed to use the
requests library.
This commit is contained in:
Archit Agrawal 2019-05-27 22:04:03 +05:30
parent 151f6cd223
commit ebdb959823

View file

@ -5,100 +5,21 @@
import random
import gzip
import json
import os
import requests
from urllib.parse import urlparse
from .models import GNUModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
class LocalResponse:
    """Adapter exposing a local file through the ``iter_content``
    API of a ``requests`` response object.
    """
    def __init__(self, path):
        # Path of the local file to stream
        self.path = path

    def iter_content(self, chunk_size=None):
        """Yield successive chunks read from ``self.path``.

        With ``chunk_size=None`` the whole file is yielded in one chunk
        (``file.read(None)`` reads to EOF).
        """
        with open(self.path, 'rb') as stream:
            while True:
                block = stream.read(chunk_size)
                if not block:
                    return
                yield block
class ArchiveFetcher:
    """Http/Local client in charge of downloading archives from a
    remote/local server.

    Args:
        temp_directory (str): Path to the temporary disk location used
            for downloading the release artifacts

    """
    def __init__(self, temp_directory=None):
        # Honor the caller-provided directory; the original ignored the
        # parameter and always used the current working directory.
        self.temp_directory = temp_directory or os.getcwd()
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Lister ( __devl__)'
            }
        }

    def download(self, url):
        """Download the remote tarball url locally.

        Args:
            url (str): Url (file or http*)

        Raises:
            ValueError in case of failing to query

        Returns:
            Tuple of local (filepath, hashes of filepath)

        """
        url_parsed = urlparse(url)
        if url_parsed.scheme == 'file':
            # Local file: stream it through the same iter_content API.
            path = url_parsed.path
            response = LocalResponse(path)
            length = os.path.getsize(path)
        else:
            response = self.session.get(url, **self.params, stream=True)
            if response.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, response.status_code))
            length = int(response.headers['content-length'])

        filepath = os.path.join(self.temp_directory, os.path.basename(url))

        # Hash while writing so the file is read only once.
        h = MultiHash(length=length)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                h.update(chunk)
                f.write(chunk)

        # Detect truncated downloads against the announced length.
        actual_length = os.path.getsize(filepath)
        if length != actual_length:
            raise ValueError('Error when checking size: %s != %s' % (
                length, actual_length))

        # Return the hashes alongside the path, as the docstring states
        # and as callers (``file_path, hash_dict = self.download(...)``)
        # expect; the original dropped the computed hashes.
        return filepath, h.hexdigest()
# NOTE(review): the diff residue showed two class statements and an
# __init__ forwarding override_config to ArchiveFetcher.__init__, whose
# only parameter is temp_directory (a TypeError at instantiation). The
# requests-based lister no longer needs ArchiveFetcher, so the plain
# SimpleLister base and its inherited __init__ are kept.
class GNULister(SimpleLister):
    """Lister for GNU origins, driven by the tree.json.gz file
    published on ftp.gnu.org.
    """
    MODEL = GNUModel
    LISTER_NAME = 'gnu'
    # Single source of truth for the listing URL.
    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
# NOTE(review): diff-page residue — the hunk headers below cut out part
# of this method's body (the `project_name` / `project_metadata_url`
# assignments and the `utils.create_task_dict(` call opener are not
# visible). Kept byte-identical; not runnable as shown.
def task_dict(self, origin_type, origin_url, **kwargs):
    """(Override)
@ -106,7 +27,6 @@ class GNULister(SimpleLister, ArchiveFetcher):
    This is overridden from the lister_base as more information is
    needed for the ingestion task creation.
    """
    # Scheduler task type, e.g. 'load-tar' for origin_type 'tar'.
    _type = 'load-%s' % origin_type
    _policy = 'recurring'
@ -116,26 +36,18 @@ class GNULister(SimpleLister, ArchiveFetcher):
    _type, _policy, project_name, origin_url,
    project_metadata_url=project_metadata_url)
def get_file(self):
    '''
    Downloads and unzips the tree.json.gz file and returns its
    content parsed as JSON.

    Returns
        File content in JSON format

    Raises
        requests.exceptions.HTTPError: if the HTTP download fails
    '''
    # Use the class-level constant rather than duplicating the URL
    # inline (the original hard-coded it here a second time).
    response = requests.get(self.TREE_URL, allow_redirects=True)
    # Fail loudly on HTTP errors instead of attempting to gunzip an
    # error page.
    response.raise_for_status()
    uncompressed_content = gzip.decompress(response.content)
    return json.loads(uncompressed_content.decode('utf-8'))
def safely_issue_request(self, identifier):
    '''(Override) Make a network request to download the file which
    contains all the package listings.

    Args:
        identifier: unused here; kept for API compatibility with the
            lister base class.

    Returns:
        server response
    '''
    # The old two-step path (download to disk, then read back) is
    # replaced by the single requests-based fetch in get_file().
    response = self.get_file()
    return response
# NOTE(review): diff-page residue — this span interleaves the
# pre-change implementation (accumulator `_packages`) with the
# post-change one (`packages`), and the hunk header below cut out the
# lines building `repo_details` (its opening and 'name'/'origin_url'
# keys are not visible). Kept byte-identical; not runnable as shown.
def list_packages(self, response):
    """(Override) List the actual gnu origins with their names and
    time last updated from the response.
    """
    response = clean_up_response(response)
    _packages = []
    packages = []
    for directory in response:
        content = directory['contents']
        for repo in content:
@ -167,9 +77,9 @@ class GNULister(SimpleLister, ArchiveFetcher):
            repo['name']),
        'time_modified': repo['time']
        }
    _packages.append(repo_details)
    # Shuffle so origins are ingested in random order.
    random.shuffle(_packages)
    return _packages
    packages.append(repo_details)
    random.shuffle(packages)
    return packages
def _get_project_url(self, dir_name, package_name):
"""Returns project_url