lister: Type correctly the 'indexable' column

instead of converting that column as a string

As a side effect, bitbucket wise, we provided improperly the after query
parameter as a date not url encoded. This resulted in improper api response from
bitbucket's (we received from time to time the same next index as the current
one).

Related T1826
This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-06-25 14:06:33 +02:00
parent b99617f976
commit 3d473c307c
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
8 changed files with 85 additions and 44 deletions

View file

@ -6,7 +6,6 @@ import logging
import iso8601
from datetime import datetime
from urllib import parse
from swh.lister.bitbucket.models import BitBucketModel
@ -15,7 +14,6 @@ from swh.lister.core.indexing_lister import IndexingHttpLister
logger = logging.getLogger(__name__)
DEFAULT_BITBUCKET_PAGE = 10
@ -24,7 +22,7 @@ class BitBucketLister(IndexingHttpLister):
MODEL = BitBucketModel
LISTER_NAME = 'bitbucket'
instance = 'bitbucket'
default_min_bound = datetime.utcfromtimestamp(0).isoformat()
default_min_bound = datetime.utcfromtimestamp(0)
def __init__(self, api_baseurl, override_config=None, per_page=100):
super().__init__(
@ -39,7 +37,7 @@ class BitBucketLister(IndexingHttpLister):
def get_model_from_repo(self, repo):
return {
'uid': repo['uuid'],
'indexable': repo['created_on'],
'indexable': iso8601.parse_date(repo['created_on']),
'name': repo['name'],
'full_name': repo['full_name'],
'html_url': repo['links']['html']['href'],
@ -48,38 +46,38 @@ class BitBucketLister(IndexingHttpLister):
}
def get_next_target_from_response(self, response):
"""This will read the 'next' link from the api response if any
and return it as a datetime.
Args:
reponse (Response): requests' response from api call
Returns:
next date as a datetime
"""
body = response.json()
if 'next' in body:
return parse.unquote(body['next'].split('after=')[1])
next_ = body.get('next')
if next_ is not None:
next_ = parse.urlparse(next_)
return iso8601.parse_date(parse.parse_qs(next_.query)['after'][0])
def transport_response_simplified(self, response):
repos = response.json()['values']
return [self.get_model_from_repo(repo) for repo in repos]
def request_uri(self, identifier):
identifier = parse.quote(identifier.isoformat())
return super().request_uri(identifier or '1970-01-01')
def is_within_bounds(self, inner, lower=None, upper=None):
# values are expected to be str dates
try:
inner = iso8601.parse_date(inner)
if lower:
lower = iso8601.parse_date(lower)
if upper:
upper = iso8601.parse_date(upper)
if lower is None and upper is None:
return True
elif lower is None:
ret = inner <= upper
elif upper is None:
ret = inner >= lower
else:
ret = lower <= inner <= upper
except Exception as e:
logger.error(str(e) + ': %s, %s, %s',
('inner=%s%s' % (type(inner), inner)),
('lower=%s%s' % (type(lower), lower)),
('upper=%s%s' % (type(upper), upper)))
raise
# values are expected to be datetimes
if lower is None and upper is None:
ret = True
elif lower is None:
ret = inner <= upper
elif upper is None:
ret = inner >= lower
else:
ret = lower <= inner <= upper
return ret

View file

@ -2,7 +2,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from sqlalchemy import Column, String, DateTime
from swh.lister.core.models import IndexingModelBase
@ -12,4 +12,4 @@ class BitBucketModel(IndexingModelBase):
__tablename__ = 'bitbucket_repo'
uid = Column(String, primary_key=True)
indexable = Column(String, index=True)
indexable = Column(DateTime(timezone=True), index=True)

View file

@ -1,29 +1,63 @@
# Copyright (C) 2017-2018 the Software Heritage developers
# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import unittest
from datetime import timedelta
from urllib.parse import unquote
import iso8601
import requests_mock
from swh.lister.bitbucket.lister import BitBucketLister
from swh.lister.core.tests.test_lister import HttpListerTester
def convert_type(req_index):
"""Convert the req_index to its right type according to the model's
"indexable" column.
"""
return iso8601.parse_date(unquote(req_index))
class BitBucketListerTester(HttpListerTester, unittest.TestCase):
Lister = BitBucketLister
test_re = re.compile(r'/repositories\?after=([^?&]+)')
lister_subdir = 'bitbucket'
good_api_response_file = 'api_response.json'
bad_api_response_file = 'api_empty_response.json'
first_index = '2008-07-12T07:44:01.476818+00:00'
last_index = '2008-07-19T06:16:43.044743+00:00'
first_index = convert_type('2008-07-12T07:44:01.476818+00:00')
last_index = convert_type('2008-07-19T06:16:43.044743+00:00')
entries_per_page = 10
convert_type = staticmethod(convert_type)
@requests_mock.Mocker()
def test_fetch_none_nodb(self, http_mocker):
"""Overridden because index is not an integer nor a string
"""
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
self.disable_scheduler(fl)
self.disable_db(fl)
# stores no results
fl.run(min_bound=self.first_index - timedelta(days=3),
max_bound=self.first_index)
def test_is_within_bounds(self):
fl = self.get_fl()
self.assertTrue(fl.is_within_bounds(
'2008-07-15', self.first_index, self.last_index))
iso8601.parse_date('2008-07-15'),
self.first_index, self.last_index))
self.assertFalse(fl.is_within_bounds(
'2008-07-20', self.first_index, self.last_index))
iso8601.parse_date('2008-07-20'),
self.first_index, self.last_index))
self.assertFalse(fl.is_within_bounds(
'2008-07-11', self.first_index, self.last_index))
iso8601.parse_date('2008-07-11'),
self.first_index, self.last_index))

View file

@ -38,6 +38,12 @@ class HttpListerTesterBase(abc.ABC):
first_index = AbstractAttribute('First index in good_api_response')
entries_per_page = AbstractAttribute('Number of results in good response')
LISTER_NAME = 'fake-lister'
convert_type = str
"""static method used to convert the "request_index" to its right type (for
indexing listers for example, this is in accordance with the model's
"indexable" column).
"""
# May need to override this if the headers are used for something
def response_headers(self, request):
@ -66,7 +72,7 @@ class HttpListerTesterBase(abc.ABC):
def request_index(self, request):
m = self.test_re.search(request.path_url)
if m and (len(m.groups()) > 0):
return m.group(1)
return self.convert_type(m.group(1))
def mock_response(self, request, context):
self.fl.reset_backoff()
@ -76,7 +82,7 @@ class HttpListerTesterBase(abc.ABC):
context.headers.update(custom_headers)
req_index = self.request_index(request)
if req_index == str(self.first_index):
if req_index == self.first_index:
response_file = self.good_api_response_file
else:
response_file = self.bad_api_response_file

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2018 the Software Heritage developers
# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -19,10 +19,11 @@ class GitHubListerTester(HttpListerTester, unittest.TestCase):
first_index = 26
last_index = 368
entries_per_page = 100
convert_type = int
def response_headers(self, request):
headers = {'X-RateLimit-Remaining': '1'}
if self.request_index(request) == str(self.first_index):
if self.request_index(request) == self.first_index:
headers.update({
'Link': '<https://api.github.com/repositories?since=367>;'
' rel="next",'

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2018 the Software Heritage developers
# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -18,10 +18,11 @@ class GitLabListerTester(HttpListerTesterBase, unittest.TestCase):
bad_api_response_file = 'api_empty_response.json'
first_index = 1
entries_per_page = 10
convert_type = int
def response_headers(self, request):
headers = {'RateLimit-Remaining': '1'}
if self.request_index(request) == str(self.first_index):
if self.request_index(request) == self.first_index:
headers.update({
'x-next-page': '3',
})

View file

@ -1,4 +1,4 @@
# Copyright (C) 2018 the Software Heritage developers
# Copyright (C) 2018-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -33,7 +33,7 @@ class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase):
lister_subdir = 'npm'
good_api_response_file = 'api_inc_response.json'
bad_api_response_file = 'api_inc_empty_response.json'
first_index = 6920642
first_index = '6920642'
entries_per_page = 100
@requests_mock.Mocker()

View file

@ -21,6 +21,7 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
first_index = 1
last_index = 12
entries_per_page = 10
convert_type = int
def get_fl(self, override_config=None):
"""(Override) Retrieve an instance of fake lister (fl).