swh.lister.gnu: Change download method of tree.json file to requests
Previously the gnu lister used the same code as the tarball loader to download, unzip and read the tree.json file. To make the code concise, the download method was changed to use the requests library.
This commit is contained in:
parent
151f6cd223
commit
ebdb959823
1 changed files with 14 additions and 104 deletions
|
@ -5,100 +5,21 @@
|
|||
import random
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from .models import GNUModel
|
||||
|
||||
from swh.scheduler import utils
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
|
||||
|
||||
|
||||
class LocalResponse:
    """Minimal stand-in for a ``requests`` response backed by a local file.

    Only exposes the ``iter_content`` API, so ``file://`` urls can flow
    through the same download code path as HTTP responses.

    Args:
        path (str): Path of the local file to stream.

    """
    def __init__(self, path):
        self.path = path

    def iter_content(self, chunk_size=None):
        """Yield the file's bytes in chunks of ``chunk_size``.

        A ``chunk_size`` of ``None`` yields the whole file as one chunk.

        """
        with open(self.path, 'rb') as f:
            # iter() with a b'' sentinel stops at EOF, exactly like the
            # explicit read/break loop it replaces.
            yield from iter(lambda: f.read(chunk_size), b'')
|
||||
|
||||
|
||||
class ArchiveFetcher:
    """Http/Local client in charge of downloading archives from a
    remote/local server.

    Args:
        temp_directory (str): Path to the temporary disk location used
            for downloading the release artifacts; defaults to the
            current working directory when not provided.

    """
    def __init__(self, temp_directory=None):
        # Bug fix: honor the temp_directory argument instead of always
        # overriding it with the current working directory.
        self.temp_directory = temp_directory or os.getcwd()
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Lister ( __devl__)'
            }
        }

    def download(self, url):
        """Download the remote tarball url locally.

        Args:
            url (str): Url (file or http*)

        Raises:
            ValueError in case of failing to query, or when the
            downloaded size does not match the announced length

        Returns:
            Tuple of local (filepath, hashes of filepath)

        """
        url_parsed = urlparse(url)
        if url_parsed.scheme == 'file':
            path = url_parsed.path
            response = LocalResponse(path)
            length = os.path.getsize(path)
        else:
            response = self.session.get(url, **self.params, stream=True)
            if response.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, response.status_code))
            # NOTE(review): assumes the server always sends a
            # content-length header — a chunked response would raise
            # KeyError here; confirm against the target servers.
            length = int(response.headers['content-length'])

        filepath = os.path.join(self.temp_directory, os.path.basename(url))

        # Hash while streaming to disk so the file is read only once.
        h = MultiHash(length=length)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                h.update(chunk)
                f.write(chunk)

        actual_length = os.path.getsize(filepath)
        if length != actual_length:
            raise ValueError('Error when checking size: %s != %s' % (
                length, actual_length))

        # Bug fix: return the (filepath, hashes) tuple the docstring
        # promises — callers unpack two values
        # (``file_path, hash_dict = self.download(...)``); previously the
        # computed hashes were silently discarded.
        return filepath, h.hexdigest()
|
||||
|
||||
|
||||
class GNULister(SimpleLister, ArchiveFetcher):
|
||||
class GNULister(SimpleLister):
|
||||
MODEL = GNUModel
|
||||
LISTER_NAME = 'gnu'
|
||||
TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
|
||||
|
||||
def __init__(self, override_config=None):
|
||||
SimpleLister.__init__(self, override_config=override_config)
|
||||
ArchiveFetcher.__init__(self, override_config=override_config)
|
||||
|
||||
def task_dict(self, origin_type, origin_url, **kwargs):
|
||||
"""(Override)
|
||||
|
@ -106,7 +27,6 @@ class GNULister(SimpleLister, ArchiveFetcher):
|
|||
|
||||
This is overridden from the lister_base as more information is
|
||||
needed for the ingestion task creation.
|
||||
|
||||
"""
|
||||
_type = 'load-%s' % origin_type
|
||||
_policy = 'recurring'
|
||||
|
@ -116,26 +36,18 @@ class GNULister(SimpleLister, ArchiveFetcher):
|
|||
_type, _policy, project_name, origin_url,
|
||||
project_metadata_url=project_metadata_url)
|
||||
|
||||
def get_file(self):
    '''Download and unzip the tree.json.gz file and return its content
    parsed as JSON.

    Raises:
        requests.HTTPError: if the server answers with an error status.

    Returns:
        File content in JSON format (the deserialized tree.json
        structure).

    '''
    # Use the class-level TREE_URL constant rather than duplicating the
    # hard-coded url here, so there is a single place to update it.
    response = requests.get(self.TREE_URL, allow_redirects=True)
    # Fail loudly on a bad HTTP status instead of handing an error page
    # to gzip.decompress, which would raise a confusing BadGzipFile.
    response.raise_for_status()
    uncompressed_content = gzip.decompress(response.content)
    return json.loads(uncompressed_content.decode('utf-8'))
|
||||
|
||||
def safely_issue_request(self, identifier):
|
||||
'''(Override)Make network request with to download the file which
|
||||
|
@ -146,17 +58,15 @@ class GNULister(SimpleLister, ArchiveFetcher):
|
|||
Returns:
|
||||
server response
|
||||
'''
|
||||
file_path = self.download_file()
|
||||
response = self.read_downloaded_file(file_path)
|
||||
response = self.get_file()
|
||||
return response
|
||||
|
||||
def list_packages(self, response):
|
||||
"""(Override) List the actual gnu origins with their names and
|
||||
time last updated from the response.
|
||||
|
||||
"""
|
||||
response = clean_up_response(response)
|
||||
_packages = []
|
||||
packages = []
|
||||
for directory in response:
|
||||
content = directory['contents']
|
||||
for repo in content:
|
||||
|
@ -167,9 +77,9 @@ class GNULister(SimpleLister, ArchiveFetcher):
|
|||
repo['name']),
|
||||
'time_modified': repo['time']
|
||||
}
|
||||
_packages.append(repo_details)
|
||||
random.shuffle(_packages)
|
||||
return _packages
|
||||
packages.append(repo_details)
|
||||
random.shuffle(packages)
|
||||
return packages
|
||||
|
||||
def _get_project_url(self, dir_name, package_name):
|
||||
"""Returns project_url
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue