Enable black

- blackify all the python files,
- enable black in pre-commit,
- add a black tox environment.
David Douard 2020-04-08 16:31:22 +02:00
parent 1ae75166c7
commit 93a4d8b784
97 changed files with 1734 additions and 1642 deletions
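The tox.ini hunk itself is not among the excerpts below; as a sketch only (environment name, pin, and target path are assumed, not taken from the commit), the black tox environment mentioned above would look roughly like:

    [testenv:black]
    skip_install = true
    deps =
      black==19.10b0
    commands =
      {envpython} -m black --check swh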


@@ -23,6 +23,11 @@ repos:
language: system
types: [python]
- repo: https://github.com/python/black
rev: 19.10b0
hooks:
- id: black
# unfortunately, we are far from being able to enable this...
# - repo: https://github.com/PyCQA/pydocstyle.git
# rev: 4.0.0
@@ -34,14 +39,3 @@ repos:
# language: python
# types: [python]
# black requires py3.6+
#- repo: https://github.com/python/black
# rev: 19.3b0
# hooks:
# - id: black
# language_version: python3
#- repo: https://github.com/asottile/blacken-docs
# rev: v1.0.0-1
# hooks:
# - id: blacken-docs
# additional_dependencies: [black==19.3b0]
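With the hook above in place, the standard pre-commit workflow applies (shown for context; these are stock pre-commit commands, not part of the commit):

    pre-commit install
    pre-commit run black --all-files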

setup.cfg (new file, +6 lines)

@@ -0,0 +1,6 @@
[flake8]
# E203: whitespace before ':' <https://github.com/psf/black/issues/315>
# E231: missing whitespace after ','
# W503: line break before binary operator <https://github.com/psf/black/issues/52>
ignore = E203,E231,W503
max-line-length = 88
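These suppressions are needed because black deliberately emits code that flake8's defaults reject; a minimal illustration (variable names invented):

    # Black's output for a complex slice: flake8's default E203 flags the
    # space before ":" even though PEP 8 permits it here.
    chunk = items[offset + 1 : offset + limit]

    # Black breaks long expressions *before* binary operators, which the
    # default W503 rejects.
    total = (
        first_value
        + second_value
        - third_value
    )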


@@ -12,15 +12,15 @@ from io import open
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
def parse_requirements(name=None):
if name:
reqf = 'requirements-%s.txt' % name
reqf = "requirements-%s.txt" % name
else:
reqf = 'requirements.txt'
reqf = "requirements.txt"
requirements = []
if not path.exists(reqf):
@@ -29,28 +29,28 @@ def parse_requirements(name=None):
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith('#'):
if not line or line.startswith("#"):
continue
requirements.append(line)
return requirements
setup(
name='swh.lister',
description='Software Heritage lister',
name="swh.lister",
description="Software Heritage lister",
long_description=long_description,
long_description_content_type='text/markdown',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DLSGH/',
long_description_content_type="text/markdown",
author="Software Heritage developers",
author_email="swh-devel@inria.fr",
url="https://forge.softwareheritage.org/diffusion/DLSGH/",
packages=find_packages(),
install_requires=parse_requirements() + parse_requirements('swh'),
tests_require=parse_requirements('test'),
setup_requires=['vcversioner'],
extras_require={'testing': parse_requirements('test')},
install_requires=parse_requirements() + parse_requirements("swh"),
tests_require=parse_requirements("test"),
setup_requires=["vcversioner"],
extras_require={"testing": parse_requirements("test")},
vcversioner={},
include_package_data=True,
entry_points='''
entry_points="""
[swh.cli.subcommands]
lister=swh.lister.cli:lister
[swh.workers]
@@ -65,7 +65,7 @@ setup(
lister.packagist=swh.lister.packagist:register
lister.phabricator=swh.lister.phabricator:register
lister.pypi=swh.lister.pypi:register
''',
""",
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
@@ -74,8 +74,8 @@ setup(
"Development Status :: 5 - Production/Stable",
],
project_urls={
'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
'Funding': 'https://www.softwareheritage.org/donate',
'Source': 'https://forge.softwareheritage.org/source/swh-lister',
"Bug Reports": "https://forge.softwareheritage.org/maniphest",
"Funding": "https://www.softwareheritage.org/donate",
"Source": "https://forge.softwareheritage.org/source/swh-lister",
},
)


@@ -11,17 +11,19 @@ logger = logging.getLogger(__name__)
try:
__version__ = pkg_resources.get_distribution('swh.lister').version
__version__ = pkg_resources.get_distribution("swh.lister").version
except pkg_resources.DistributionNotFound:
__version__ = 'devel'
__version__ = "devel"
USER_AGENT_TEMPLATE = 'Software Heritage Lister (%s)'
USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)"
USER_AGENT = USER_AGENT_TEMPLATE % __version__
LISTERS = {entry_point.name.split('.', 1)[1]: entry_point
for entry_point in pkg_resources.iter_entry_points('swh.workers')
if entry_point.name.split('.', 1)[0] == 'lister'}
LISTERS = {
entry_point.name.split(".", 1)[1]: entry_point
for entry_point in pkg_resources.iter_entry_points("swh.workers")
if entry_point.name.split(".", 1)[0] == "lister"
}
SUPPORTED_LISTERS = list(LISTERS)
@@ -41,12 +43,13 @@ def get_lister(lister_name, db_url=None, **conf):
"""
if lister_name not in LISTERS:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
(lister_name, SUPPORTED_LISTERS))
"Invalid lister %s: only supported listers are %s"
% (lister_name, SUPPORTED_LISTERS)
)
if db_url:
conf['lister'] = {'cls': 'local', 'args': {'db': db_url}}
conf["lister"] = {"cls": "local", "args": {"db": db_url}}
registry_entry = LISTERS[lister_name].load()()
lister_cls = registry_entry['lister']
lister_cls = registry_entry["lister"]
lister = lister_cls(override_config=conf)
return lister
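A usage sketch for the get_lister helper reformatted above (the lister name and database URL are illustrative):

    from swh.lister import get_lister

    lister = get_lister("bitbucket", db_url="postgresql:///lister")
    lister.run()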


@@ -7,7 +7,8 @@ def register():
from .models import BitBucketModel
from .lister import BitBucketLister
return {'models': [BitBucketModel],
'lister': BitBucketLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [BitBucketModel],
"lister": BitBucketLister,
"task_modules": ["%s.tasks" % __name__],
}


@@ -19,34 +19,33 @@ logger = logging.getLogger(__name__)
class BitBucketLister(IndexingHttpLister):
PATH_TEMPLATE = '/repositories?after=%s'
PATH_TEMPLATE = "/repositories?after=%s"
MODEL = BitBucketModel
LISTER_NAME = 'bitbucket'
DEFAULT_URL = 'https://api.bitbucket.org/2.0'
instance = 'bitbucket'
LISTER_NAME = "bitbucket"
DEFAULT_URL = "https://api.bitbucket.org/2.0"
instance = "bitbucket"
default_min_bound = datetime.fromtimestamp(0, timezone.utc) # type: Any
def __init__(self, url: str = None,
override_config=None, per_page: int = 100) -> None:
def __init__(
self, url: str = None, override_config=None, per_page: int = 100
) -> None:
super().__init__(url=url, override_config=override_config)
per_page = self.config.get('per_page', per_page)
per_page = self.config.get("per_page", per_page)
self.PATH_TEMPLATE = '%s&pagelen=%s' % (
self.PATH_TEMPLATE, per_page)
self.PATH_TEMPLATE = "%s&pagelen=%s" % (self.PATH_TEMPLATE, per_page)
def get_model_from_repo(self, repo: Dict) -> Dict[str, Any]:
return {
'uid': repo['uuid'],
'indexable': iso8601.parse_date(repo['created_on']),
'name': repo['name'],
'full_name': repo['full_name'],
'html_url': repo['links']['html']['href'],
'origin_url': repo['links']['clone'][0]['href'],
'origin_type': repo['scm'],
"uid": repo["uuid"],
"indexable": iso8601.parse_date(repo["created_on"]),
"name": repo["name"],
"full_name": repo["full_name"],
"html_url": repo["links"]["html"]["href"],
"origin_url": repo["links"]["clone"][0]["href"],
"origin_type": repo["scm"],
}
def get_next_target_from_response(self, response: Response
) -> Optional[datetime]:
def get_next_target_from_response(self, response: Response) -> Optional[datetime]:
"""This will read the 'next' link from the api response if any
and return it as a datetime.
@@ -58,23 +57,23 @@ class BitBucketLister(IndexingHttpLister):
"""
body = response.json()
next_ = body.get('next')
next_ = body.get("next")
if next_ is not None:
next_ = parse.urlparse(next_)
return iso8601.parse_date(parse.parse_qs(next_.query)['after'][0])
return iso8601.parse_date(parse.parse_qs(next_.query)["after"][0])
return None
def transport_response_simplified(self, response: Response
) -> List[Dict[str, Any]]:
repos = response.json()['values']
def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]:
repos = response.json()["values"]
return [self.get_model_from_repo(repo) for repo in repos]
def request_uri(self, identifier: datetime) -> str: # type: ignore
identifier_str = parse.quote(identifier.isoformat())
return super().request_uri(identifier_str or '1970-01-01')
return super().request_uri(identifier_str or "1970-01-01")
def is_within_bounds(self, inner: int, lower: Optional[int] = None,
upper: Optional[int] = None) -> bool:
def is_within_bounds(
self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None
) -> bool:
# values are expected to be datetimes
if lower is None and upper is None:
ret = True


@@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase
class BitBucketModel(IndexingModelBase):
"""a BitBucket repository"""
__tablename__ = 'bitbucket_repo'
__tablename__ = "bitbucket_repo"
uid = Column(String, primary_key=True)
indexable = Column(DateTime(timezone=True), index=True)


@@ -10,20 +10,20 @@ from .lister import BitBucketLister
GROUP_SPLIT = 10000
@shared_task(name=__name__ + '.IncrementalBitBucketLister')
@shared_task(name=__name__ + ".IncrementalBitBucketLister")
def list_bitbucket_incremental(**lister_args):
'''Incremental update of the BitBucket forge'''
"""Incremental update of the BitBucket forge"""
lister = BitBucketLister(**lister_args)
return lister.run(min_bound=lister.db_last_index(), max_bound=None)
@shared_task(name=__name__ + '.RangeBitBucketLister')
@shared_task(name=__name__ + ".RangeBitBucketLister")
def _range_bitbucket_lister(start, end, **lister_args):
lister = BitBucketLister(**lister_args)
return lister.run(min_bound=start, max_bound=end)
@shared_task(name=__name__ + '.FullBitBucketRelister', bind=True)
@shared_task(name=__name__ + ".FullBitBucketRelister", bind=True)
def list_bitbucket_full(self, split=None, **lister_args):
"""Full update of the BitBucket forge
@@ -33,21 +33,22 @@ def list_bitbucket_full(self, split=None, **lister_args):
lister = BitBucketLister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')
self.log.info("Nothing to list")
return
random.shuffle(ranges)
promise = group(_range_bitbucket_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()
self.log.debug('%s OK (spawned %s subtasks)', (self.name, len(ranges)))
promise = group(
_range_bitbucket_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges
)()
self.log.debug("%s OK (spawned %s subtasks)", (self.name, len(ranges)))
try:
promise.save() # so that we can restore the GroupResult in tests
except (NotImplementedError, AttributeError):
self.log.info('Unable to call save_group with current result backend.')
self.log.info("Unable to call save_group with current result backend.")
# FIXME: what to do in terms of return here?
return promise.id
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@@ -26,12 +26,12 @@ def _convert_type(req_index):
class BitBucketListerTester(HttpListerTester, unittest.TestCase):
Lister = BitBucketLister
test_re = re.compile(r'/repositories\?after=([^?&]+)')
lister_subdir = 'bitbucket'
good_api_response_file = 'data/https_api.bitbucket.org/response.json'
bad_api_response_file = 'data/https_api.bitbucket.org/empty_response.json'
first_index = _convert_type('2008-07-12T07:44:01.476818+00:00')
last_index = _convert_type('2008-07-19T06:16:43.044743+00:00')
test_re = re.compile(r"/repositories\?after=([^?&]+)")
lister_subdir = "bitbucket"
good_api_response_file = "data/https_api.bitbucket.org/response.json"
bad_api_response_file = "data/https_api.bitbucket.org/empty_response.json"
first_index = _convert_type("2008-07-12T07:44:01.476818+00:00")
last_index = _convert_type("2008-07-19T06:16:43.044743+00:00")
entries_per_page = 10
convert_type = _convert_type
@@ -57,57 +57,64 @@ class BitBucketListerTester(HttpListerTester, unittest.TestCase):
self.disable_db(fl)
# stores no results
fl.run(min_bound=self.first_index - timedelta(days=3),
max_bound=self.first_index)
fl.run(
min_bound=self.first_index - timedelta(days=3), max_bound=self.first_index
)
def test_is_within_bounds(self):
fl = self.get_fl()
self.assertTrue(fl.is_within_bounds(
iso8601.parse_date('2008-07-15'),
self.first_index, self.last_index))
self.assertFalse(fl.is_within_bounds(
iso8601.parse_date('2008-07-20'),
self.first_index, self.last_index))
self.assertFalse(fl.is_within_bounds(
iso8601.parse_date('2008-07-11'),
self.first_index, self.last_index))
self.assertTrue(
fl.is_within_bounds(
iso8601.parse_date("2008-07-15"), self.first_index, self.last_index
)
)
self.assertFalse(
fl.is_within_bounds(
iso8601.parse_date("2008-07-20"), self.first_index, self.last_index
)
)
self.assertFalse(
fl.is_within_bounds(
iso8601.parse_date("2008-07-11"), self.first_index, self.last_index
)
)
def test_lister_bitbucket(swh_listers, requests_mock_datadir):
"""Simple bitbucket listing should create scheduled tasks (git, hg)
"""
lister = swh_listers['bitbucket']
lister = swh_listers["bitbucket"]
lister.run()
r = lister.scheduler.search_tasks(task_type='load-hg')
r = lister.scheduler.search_tasks(task_type="load-hg")
assert len(r) == 9
for row in r:
args = row['arguments']['args']
kwargs = row['arguments']['kwargs']
args = row["arguments"]["args"]
kwargs = row["arguments"]["kwargs"]
assert len(args) == 0
assert len(kwargs) == 1
url = kwargs['url']
url = kwargs["url"]
assert url.startswith('https://bitbucket.org')
assert url.startswith("https://bitbucket.org")
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None
r = lister.scheduler.search_tasks(task_type='load-git')
r = lister.scheduler.search_tasks(task_type="load-git")
assert len(r) == 1
for row in r:
args = row['arguments']['args']
kwargs = row['arguments']['kwargs']
args = row["arguments"]["args"]
kwargs = row["arguments"]["kwargs"]
assert len(args) == 0
assert len(kwargs) == 1
url = kwargs['url']
url = kwargs["url"]
assert url.startswith('https://bitbucket.org')
assert url.startswith("https://bitbucket.org")
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None


@@ -5,23 +5,21 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.bitbucket.tasks.ping')
res = swh_app.send_task("swh.lister.bitbucket.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.bitbucket.tasks.BitBucketLister')
@patch("swh.lister.bitbucket.tasks.BitBucketLister")
def test_incremental(lister, swh_app, celery_session_worker):
# setup the mocked BitbucketLister
lister.return_value = lister
lister.db_last_index.return_value = 42
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.bitbucket.tasks.IncrementalBitBucketLister')
res = swh_app.send_task("swh.lister.bitbucket.tasks.IncrementalBitBucketLister")
assert res
res.wait()
assert res.successful()
@@ -31,15 +29,15 @@ def test_incremental(lister, swh_app, celery_session_worker):
lister.run.assert_called_once_with(min_bound=42, max_bound=None)
@patch('swh.lister.bitbucket.tasks.BitBucketLister')
@patch("swh.lister.bitbucket.tasks.BitBucketLister")
def test_range(lister, swh_app, celery_session_worker):
# setup the mocked BitbucketLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.bitbucket.tasks.RangeBitBucketLister',
kwargs=dict(start=12, end=42))
"swh.lister.bitbucket.tasks.RangeBitBucketLister", kwargs=dict(start=12, end=42)
)
assert res
res.wait()
assert res.successful()
@@ -49,16 +47,14 @@ def test_range(lister, swh_app, celery_session_worker):
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@patch('swh.lister.bitbucket.tasks.BitBucketLister')
@patch("swh.lister.bitbucket.tasks.BitBucketLister")
def test_relister(lister, swh_app, celery_session_worker):
# setup the mocked BitbucketLister
lister.return_value = lister
lister.run.return_value = None
lister.db_partition_indices.return_value = [
(i, i+9) for i in range(0, 50, 10)]
lister.db_partition_indices.return_value = [(i, i + 9) for i in range(0, 50, 10)]
res = swh_app.send_task(
'swh.lister.bitbucket.tasks.FullBitBucketRelister')
res = swh_app.send_task("swh.lister.bitbucket.tasks.FullBitBucketRelister")
assert res
res.wait()
@@ -85,5 +81,6 @@ def test_relister(lister, swh_app, celery_session_worker):
# lister.run should have been called once per partition interval
for i in range(5):
assert (dict(min_bound=10*i, max_bound=10*i + 9),) \
in lister.run.call_args_list
assert (
dict(min_bound=10 * i, max_bound=10 * i + 9),
) in lister.run.call_args_list


@@ -7,7 +7,8 @@ def register():
from .models import CGitModel
from .lister import CGitLister
return {'models': [CGitModel],
'lister': CGitLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [CGitModel],
"lister": CGitLister,
"task_modules": ["%s.tasks" % __name__],
}


@@ -50,13 +50,13 @@ class CGitLister(ListerBase):
Args:
'https://git.savannah.gnu.org/git/elisp-es.git'
"""
MODEL = CGitModel
DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/'
LISTER_NAME = 'cgit'
DEFAULT_URL = "https://git.savannah.gnu.org/cgit/"
LISTER_NAME = "cgit"
url_prefix_present = True
def __init__(self, url=None, instance=None,
override_config=None):
def __init__(self, url=None, instance=None, override_config=None):
"""Lister class for CGit repositories.
Args:
@@ -69,7 +69,7 @@ class CGitLister(ListerBase):
super().__init__(override_config=override_config)
if url is None:
url = self.config.get('url', self.DEFAULT_URL)
url = self.config.get("url", self.DEFAULT_URL)
self.url = url
if not instance:
@@ -78,23 +78,22 @@ class CGitLister(ListerBase):
self.session = Session()
self.session.mount(self.url, HTTPAdapter(max_retries=3))
self.session.headers = {
'User-Agent': USER_AGENT,
"User-Agent": USER_AGENT,
}
def run(self) -> Dict[str, str]:
status = 'uneventful'
status = "uneventful"
total = 0
for repos in grouper(self.get_repos(), 10):
models = list(filter(None, (self.build_model(repo)
for repo in repos)))
models = list(filter(None, (self.build_model(repo) for repo in repos)))
injected_repos = self.inject_repo_data_into_db(models)
self.schedule_missing_tasks(models, injected_repos)
self.db_session.commit()
total += len(injected_repos)
logger.debug('Scheduled %s tasks for %s', total, self.url)
status = 'eventful'
logger.debug("Scheduled %s tasks for %s", total, self.url)
status = "eventful"
return {'status': status}
return {"status": status}
def get_repos(self) -> Generator[str, None, None]:
"""Generate git 'project' URLs found on the current CGit server
@@ -103,16 +102,16 @@ class CGitLister(ListerBase):
next_page = self.url
while next_page:
bs_idx = self.get_and_parse(next_page)
for tr in bs_idx.find(
'div', {"class": "content"}).find_all(
"tr", {"class": ""}):
yield urljoin(self.url, tr.find('a')['href'])
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
yield urljoin(self.url, tr.find("a")["href"])
try:
pager = bs_idx.find('ul', {'class': 'pager'})
current_page = pager.find('a', {'class': 'current'})
pager = bs_idx.find("ul", {"class": "pager"})
current_page = pager.find("a", {"class": "current"})
if current_page:
next_page = current_page.parent.next_sibling.a['href']
next_page = current_page.parent.next_sibling.a["href"]
next_page = urljoin(self.url, next_page)
except (AttributeError, KeyError):
# no pager, or no next page
@@ -123,28 +122,28 @@ class CGitLister(ListerBase):
return the repo description (dict) suitable for insertion in the db.
"""
bs = self.get_and_parse(repo_url)
urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
if not urls:
return None
# look for the http/https url, if any, and use it as origin_url
for url in urls:
if urlparse(url).scheme in ('http', 'https'):
if urlparse(url).scheme in ("http", "https"):
origin_url = url
break
else:
# otherwise, choose the first one
origin_url = urls[0]
return {'uid': repo_url,
'name': bs.find('a', title=re.compile('.+'))['title'],
'origin_type': 'git',
'instance': self.instance,
'origin_url': origin_url,
}
return {
"uid": repo_url,
"name": bs.find("a", title=re.compile(".+"))["title"],
"origin_type": "git",
"instance": self.instance,
"origin_url": origin_url,
}
def get_and_parse(self, url: str) -> BeautifulSoup:
"Get the given url and parse the retrieved HTML using BeautifulSoup"
return BeautifulSoup(self.session.get(url).text,
features='html.parser')
return BeautifulSoup(self.session.get(url).text, features="html.parser")


@@ -11,7 +11,8 @@ class CGitModel(ModelBase):
"""a CGit repository representation
"""
__tablename__ = 'cgit_repo'
__tablename__ = "cgit_repo"
uid = Column(String, primary_key=True)
instance = Column(String, index=True)


@@ -7,12 +7,12 @@ from celery import shared_task
from .lister import CGitLister
@shared_task(name=__name__ + '.CGitListerTask')
@shared_task(name=__name__ + ".CGitListerTask")
def list_cgit(**lister_args):
'''Lister task for CGit instances'''
"""Lister task for CGit instances"""
return CGitLister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@@ -7,38 +7,38 @@ from swh.lister import __version__
def test_lister_no_page(requests_mock_datadir, swh_listers):
lister = swh_listers['cgit']
lister = swh_listers["cgit"]
assert lister.url == 'https://git.savannah.gnu.org/cgit/'
assert lister.url == "https://git.savannah.gnu.org/cgit/"
repos = list(lister.get_repos())
assert len(repos) == 977
assert repos[0] == 'https://git.savannah.gnu.org/cgit/elisp-es.git/'
assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
# note the url below is NOT a subpath of /cgit/
assert repos[-1] == 'https://git.savannah.gnu.org/path/to/yetris.git/' # noqa
assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa
# note the url below is NOT on the same server
assert repos[-2] == 'http://example.org/cgit/xstarcastle.git/'
assert repos[-2] == "http://example.org/cgit/xstarcastle.git/"
def test_lister_model(requests_mock_datadir, swh_listers):
lister = swh_listers['cgit']
lister = swh_listers["cgit"]
repo = next(lister.get_repos())
model = lister.build_model(repo)
assert model == {
'uid': 'https://git.savannah.gnu.org/cgit/elisp-es.git/',
'name': 'elisp-es.git',
'origin_type': 'git',
'instance': 'git.savannah.gnu.org',
'origin_url': 'https://git.savannah.gnu.org/git/elisp-es.git'
}
"uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/",
"name": "elisp-es.git",
"origin_type": "git",
"instance": "git.savannah.gnu.org",
"origin_url": "https://git.savannah.gnu.org/git/elisp-es.git",
}
def test_lister_with_pages(requests_mock_datadir, swh_listers):
lister = swh_listers['cgit']
lister.url = 'https://git.tizen/cgit/'
lister = swh_listers["cgit"]
lister.url = "https://git.tizen/cgit/"
repos = list(lister.get_repos())
# we should have 16 repos (listed on 3 pages)
@@ -46,37 +46,37 @@ def test_lister_with_pages(requests_mock_datadir, swh_listers):
def test_lister_run(requests_mock_datadir, swh_listers):
lister = swh_listers['cgit']
lister.url = 'https://git.tizen/cgit/'
lister = swh_listers["cgit"]
lister.url = "https://git.tizen/cgit/"
lister.run()
r = lister.scheduler.search_tasks(task_type='load-git')
r = lister.scheduler.search_tasks(task_type="load-git")
assert len(r) == 16
for row in r:
assert row['type'] == 'load-git'
assert row["type"] == "load-git"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
url = kwargs['url']
assert url.startswith('https://git.tizen')
url = kwargs["url"]
assert url.startswith("https://git.tizen")
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None
def test_lister_requests(requests_mock_datadir, swh_listers):
lister = swh_listers['cgit']
lister.url = 'https://git.tizen/cgit/'
lister = swh_listers["cgit"]
lister.url = "https://git.tizen/cgit/"
lister.run()
assert len(requests_mock_datadir.request_history) != 0
for request in requests_mock_datadir.request_history:
assert 'User-Agent' in request.headers
user_agent = request.headers['User-Agent']
assert 'Software Heritage Lister' in user_agent
assert "User-Agent" in request.headers
user_agent = request.headers["User-Agent"]
assert "Software Heritage Lister" in user_agent
assert __version__ in user_agent


@@ -2,29 +2,27 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.cgit.tasks.ping')
res = swh_app.send_task("swh.lister.cgit.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.cgit.tasks.CGitLister')
@patch("swh.lister.cgit.tasks.CGitLister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked CGitLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.cgit.tasks.CGitListerTask',
kwargs=dict(url='https://git.kernel.org/', instance='kernel'))
"swh.lister.cgit.tasks.CGitListerTask",
kwargs=dict(url="https://git.kernel.org/", instance="kernel"),
)
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with(
url='https://git.kernel.org/',
instance='kernel')
lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel")
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with()


@@ -23,104 +23,123 @@ logger = logging.getLogger(__name__)
# value used when inserting a new task-type in the scheduler db will be the one
# under the 'full' key below (because it matches xxx_full).
DEFAULT_TASK_TYPE = {
'full': { # for tasks like 'list_xxx_full()'
'default_interval': '90 days',
'min_interval': '90 days',
'max_interval': '90 days',
'backoff_factor': 1
},
'*': { # value if no suffix matches
'default_interval': '1 day',
'min_interval': '1 day',
'max_interval': '1 day',
'backoff_factor': 1
},
}
"full": { # for tasks like 'list_xxx_full()'
"default_interval": "90 days",
"min_interval": "90 days",
"max_interval": "90 days",
"backoff_factor": 1,
},
"*": { # value if not suffix matches
"default_interval": "1 day",
"min_interval": "1 day",
"max_interval": "1 day",
"backoff_factor": 1,
},
}
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@click.option('--config-file', '-C', default=None,
type=click.Path(exists=True, dir_okay=False,),
help="Configuration file.")
@click.option('--db-url', '-d', default=None,
help='SQLAlchemy DB URL; see '
'<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # noqa
@click.group(name="lister", context_settings=CONTEXT_SETTINGS)
@click.option(
"--config-file",
"-C",
default=None,
type=click.Path(exists=True, dir_okay=False,),
help="Configuration file.",
)
@click.option(
"--db-url",
"-d",
default=None,
help="SQLAlchemy DB URL; see "
"<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>",
) # noqa
@click.pass_context
def lister(ctx, config_file, db_url):
'''Software Heritage Lister tools.'''
"""Software Heritage Lister tools."""
from swh.core import config
ctx.ensure_object(dict)
if not config_file:
config_file = os.environ.get('SWH_CONFIG_FILENAME')
config_file = os.environ.get("SWH_CONFIG_FILENAME")
conf = config.read(config_file)
if db_url:
conf['lister'] = {
'cls': 'local',
'args': {'db': db_url}
}
ctx.obj['config'] = conf
conf["lister"] = {"cls": "local", "args": {"db": db_url}}
ctx.obj["config"] = conf
@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
@click.option('--drop-tables', '-D', is_flag=True, default=False,
help='Drop tables before creating the database schema')
@lister.command(name="db-init", context_settings=CONTEXT_SETTINGS)
@click.option(
"--drop-tables",
"-D",
is_flag=True,
default=False,
help="Drop tables before creating the database schema",
)
@click.pass_context
def db_init(ctx, drop_tables):
"""Initialize the database model for given listers.
"""
cfg = ctx.obj['config']
lister_cfg = cfg['lister']
if lister_cfg['cls'] != 'local':
click.echo('A local lister configuration is required')
cfg = ctx.obj["config"]
lister_cfg = cfg["lister"]
if lister_cfg["cls"] != "local":
click.echo("A local lister configuration is required")
ctx.exit(1)
db_url = lister_cfg['args']['db']
db_url = lister_cfg["args"]["db"]
db_engine = create_engine(db_url)
registry = {}
for lister, entrypoint in LISTERS.items():
logger.info('Loading lister %s', lister)
logger.info("Loading lister %s", lister)
registry[lister] = entrypoint.load()()
logger.info('Initializing database')
logger.info("Initializing database")
initialize(db_engine, drop_tables)
for lister, entrypoint in LISTERS.items():
registry_entry = registry[lister]
init_hook = registry_entry.get('init')
init_hook = registry_entry.get("init")
if callable(init_hook):
logger.info('Calling init hook for %s', lister)
logger.info("Calling init hook for %s", lister)
init_hook(db_engine)
@lister.command(name='run', context_settings=CONTEXT_SETTINGS,
help='Trigger a full listing run for a particular forge '
'instance. The output of this listing results in '
'"oneshot" tasks in the scheduler db with a priority '
'defined by the user')
@click.option('--lister', '-l', help='Lister to run',
type=click.Choice(SUPPORTED_LISTERS))
@click.option('--priority', '-p', default='high',
type=click.Choice(['high', 'medium', 'low']),
help='Task priority for the listed repositories to ingest')
@click.argument('options', nargs=-1)
@lister.command(
name="run",
context_settings=CONTEXT_SETTINGS,
help="Trigger a full listing run for a particular forge "
"instance. The output of this listing results in "
'"oneshot" tasks in the scheduler db with a priority '
"defined by the user",
)
@click.option(
"--lister", "-l", help="Lister to run", type=click.Choice(SUPPORTED_LISTERS)
)
@click.option(
"--priority",
"-p",
default="high",
type=click.Choice(["high", "medium", "low"]),
help="Task priority for the listed repositories to ingest",
)
@click.argument("options", nargs=-1)
@click.pass_context
def run(ctx, lister, priority, options):
from swh.scheduler.cli.utils import parse_options
config = deepcopy(ctx.obj['config'])
config = deepcopy(ctx.obj["config"])
if options:
config.update(parse_options(options)[1])
config['priority'] = priority
config['policy'] = 'oneshot'
config["priority"] = priority
config["policy"] = "oneshot"
get_lister(lister, **config).run()
if __name__ == '__main__':
if __name__ == "__main__":
lister()
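Given the swh.cli.subcommands entry point declared in setup.py, the run command above is invoked along these lines (an assumed example; the URL and any extra key=value options are merged into the lister configuration by parse_options):

    swh lister run --lister cgit --priority high url=https://git.savannah.gnu.org/cgit/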


@@ -20,8 +20,9 @@ class AbstractAttribute:
AbstractAttribute('docstring for foo')
"""
__isabstractmethod__ = True
def __init__(self, docstring=None):
if docstring is not None:
self.__doc__ = 'AbstractAttribute: ' + docstring
self.__doc__ = "AbstractAttribute: " + docstring


@@ -49,18 +49,19 @@ class IndexingLister(ListerBase):
def get_next_target_from_response
"""
flush_packet_db = 20
"""Number of iterations in-between write flushes of lister repositories to
db (see fn:`run`).
"""
default_min_bound = ''
default_min_bound = ""
"""Default initialization value for the minimum boundary index to use when
undefined (see fn:`run`).
"""
@abc.abstractmethod
def get_next_target_from_response(
self, response: Response
self, response: Response
) -> Union[Optional[datetime], Optional[str], Optional[int]]:
"""Find the next server endpoint identifier given the entire response.
@@ -78,14 +79,16 @@ class IndexingLister(ListerBase):
# You probably don't need to override anything below this line.
def filter_before_inject(
self, models_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
self, models_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Overrides ListerBase.filter_before_inject
Bounds query results by this Lister's set max_index.
"""
models_list = [
m for m in models_list
if self.is_within_bounds(m['indexable'], None, self.max_index)
m
for m in models_list
if self.is_within_bounds(m["indexable"], None, self.max_index)
]
return models_list
@@ -108,7 +111,7 @@ class IndexingLister(ListerBase):
return retlist
def db_partition_indices(
self, partition_size: int
self, partition_size: int
) -> List[Tuple[Optional[int], Optional[int]]]:
"""Describe an index-space compartmentalization of the db table
in equal sized chunks. This is used to describe min&max bounds for
@@ -135,14 +138,19 @@ class IndexingLister(ListerBase):
return []
if isinstance(min_index, str):
def format_bound(bound):
return bound.isoformat()
min_index = dateutil.parser.parse(min_index)
max_index = dateutil.parser.parse(max_index)
elif isinstance(max_index - min_index, int):
def format_bound(bound):
return int(bound)
else:
def format_bound(bound):
return bound
@@ -156,9 +164,7 @@ class IndexingLister(ListerBase):
# Trim duplicate bounds
bounds.append(None)
bounds = [cur
for cur, next in zip(bounds[:-1], bounds[1:])
if cur != next]
bounds = [cur for cur, next in zip(bounds[:-1], bounds[1:]) if cur != next]
# Remove bounds for lowest and highest partition
bounds[0] = bounds[-1] = None
@@ -204,8 +210,9 @@ class IndexingLister(ListerBase):
deleted_repos = self.winnow_models(
self.db_query_range(start, end), self.MODEL.uid, keep_these
)
tasks_to_disable = [repo.task_id for repo in deleted_repos
if repo.task_id is not None]
tasks_to_disable = [
repo.task_id for repo in deleted_repos if repo.task_id is not None
]
if tasks_to_disable:
self.scheduler.disable_tasks(tasks_to_disable)
for repo in deleted_repos:
@@ -224,7 +231,7 @@
Returns:
nothing
"""
status = 'uneventful'
status = "uneventful"
self.min_index = min_bound
self.max_index = max_bound
@@ -233,7 +240,7 @@
for i in count(1):
response, injected_repos = self.ingest_data(index)
if not response and not injected_repos:
logger.info('No response from api server, stopping')
logger.info("No response from api server, stopping")
return
next_index = self.get_next_target_from_response(response)
@@ -243,23 +250,22 @@
# termination condition
if next_index is None or next_index == index:
logger.info('stopping after index %s, no next link found',
index)
logger.info("stopping after index %s, no next link found", index)
return
index = next_index
logger.debug('Index: %s', index)
logger.debug("Index: %s", index)
yield i
for i in ingest_indexes():
if (i % self.flush_packet_db) == 0:
logger.debug('Flushing updates at index %s', i)
logger.debug("Flushing updates at index %s", i)
self.db_session.commit()
self.db_session = self.mk_session()
status = 'eventful'
status = "eventful"
self.db_session.commit()
self.db_session = self.mk_session()
return {'status': status}
return {"status": status}
class IndexingHttpLister(ListerHttpTransport, IndexingLister):


@@ -68,11 +68,12 @@ class ListerBase(abc.ABC, config.SWHConfig):
"""
MODEL = AbstractAttribute(
'Subclass type (not instance) of swh.lister.core.models.ModelBase '
'customized for a specific service.'
"Subclass type (not instance) of swh.lister.core.models.ModelBase "
"customized for a specific service."
) # type: Union[AbstractAttribute, Type[Any]]
LISTER_NAME = AbstractAttribute(
"Lister's name") # type: Union[AbstractAttribute, str]
"Lister's name"
) # type: Union[AbstractAttribute, str]
def transport_request(self, identifier):
"""Given a target endpoint identifier to query, try once to request it.
@@ -138,8 +139,7 @@
"""
pass
def filter_before_inject(
self, models_list: List[Dict]) -> List[Dict]:
def filter_before_inject(self, models_list: List[Dict]) -> List[Dict]:
"""Filter models_list entries prior to injection in the db.
This is run directly after `transport_response_simplified`.
@@ -154,8 +154,7 @@
"""
return models_list
def do_additional_checks(
self, models_list: List[Dict]) -> List[Dict]:
def do_additional_checks(self, models_list: List[Dict]) -> List[Dict]:
"""Execute some additional checks on the model list (after the
filtering).
@@ -173,8 +172,8 @@
return models_list
def is_within_bounds(
self, inner: int,
lower: Optional[int] = None, upper: Optional[int] = None) -> bool:
self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None
) -> bool:
"""See if a sortable value is inside the range [lower,upper].
MAY BE OVERRIDDEN, for example if the server indexable* key is
@@ -201,11 +200,15 @@
self.string_pattern_check(inner, lower, upper)
except Exception as e:
logger.error(str(e) + ': %s, %s, %s' %
(('inner=%s%s' % (type(inner), inner)),
('lower=%s%s' % (type(lower), lower)),
('upper=%s%s' % (type(upper), upper)))
)
logger.error(
str(e)
+ ": %s, %s, %s"
% (
("inner=%s%s" % (type(inner), inner)),
("lower=%s%s" % (type(lower), lower)),
("upper=%s%s" % (type(upper), upper)),
)
)
raise
return ret
@@ -213,30 +216,23 @@
# You probably don't need to override anything below this line.
DEFAULT_CONFIG = {
'scheduler': ('dict', {
'cls': 'remote',
'args': {
'url': 'http://localhost:5008/'
},
}),
'lister': ('dict', {
'cls': 'local',
'args': {
'db': 'postgresql:///lister',
},
}),
"scheduler": (
"dict",
{"cls": "remote", "args": {"url": "http://localhost:5008/"},},
),
"lister": ("dict", {"cls": "local", "args": {"db": "postgresql:///lister",},}),
}
@property
def CONFIG_BASE_FILENAME(self): # noqa: N802
return 'lister_%s' % self.LISTER_NAME
return "lister_%s" % self.LISTER_NAME
@property
def ADDITIONAL_CONFIG(self): # noqa: N802
return {
'credentials': ('dict', {}),
'cache_responses': ('bool', False),
'cache_dir': ('str', '~/.cache/swh/lister/%s' % self.LISTER_NAME),
"credentials": ("dict", {}),
"cache_responses": ("bool", False),
"cache_dir": ("str", "~/.cache/swh/lister/%s" % self.LISTER_NAME),
}
INITIAL_BACKOFF = 10
@@ -245,21 +241,21 @@
def __init__(self, override_config=None):
self.backoff = self.INITIAL_BACKOFF
logger.debug('Loading config from %s' % self.CONFIG_BASE_FILENAME)
logger.debug("Loading config from %s" % self.CONFIG_BASE_FILENAME)
self.config = self.parse_config_file(
base_filename=self.CONFIG_BASE_FILENAME,
additional_configs=[self.ADDITIONAL_CONFIG]
additional_configs=[self.ADDITIONAL_CONFIG],
)
self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
if self.config['cache_responses']:
config.prepare_folders(self.config, 'cache_dir')
self.config["cache_dir"] = os.path.expanduser(self.config["cache_dir"])
if self.config["cache_responses"]:
config.prepare_folders(self.config, "cache_dir")
if override_config:
self.config.update(override_config)
logger.debug('%s CONFIG=%s' % (self, self.config))
self.scheduler = get_scheduler(**self.config['scheduler'])
self.db_engine = create_engine(self.config['lister']['args']['db'])
logger.debug("%s CONFIG=%s" % (self, self.config))
self.scheduler = get_scheduler(**self.config["scheduler"])
self.db_engine = create_engine(self.config["lister"]["args"]["db"])
self.mk_session = sessionmaker(bind=self.db_engine)
self.db_session = self.mk_session()
@@ -285,7 +281,7 @@
server response
"""
retries_left = self.MAX_RETRIES
do_cache = self.config['cache_responses']
do_cache = self.config["cache_responses"]
r = None
while retries_left > 0:
try:
@@ -293,8 +289,9 @@
except FetchError:
# network-level connection error, try again
logger.warning(
'connection error on %s: sleep for %d seconds' %
(identifier, self.CONN_SLEEP))
"connection error on %s: sleep for %d seconds"
% (identifier, self.CONN_SLEEP)
)
time.sleep(self.CONN_SLEEP)
retries_left -= 1
continue
@@ -306,8 +303,8 @@
must_retry, delay = self.transport_quota_check(r)
if must_retry:
logger.warning(
'rate limited on %s: sleep for %f seconds' %
(identifier, delay))
"rate limited on %s: sleep for %f seconds" % (identifier, delay)
)
time.sleep(delay)
else: # request ok
break
@@ -315,8 +312,7 @@
retries_left -= 1
if not retries_left:
logger.warning(
'giving up on %s: max retries exceeded' % identifier)
logger.warning("giving up on %s: max retries exceeded" % identifier)
return r
@@ -332,8 +328,7 @@
"""
if isinstance(key, str):
key = self.MODEL.__dict__[key]
return self.db_session.query(self.MODEL) \
.filter(key == value).first()
return self.db_session.query(self.MODEL).filter(key == value).first()
def winnow_models(self, mlist, key, to_remove):
"""Given a list of models, remove any with <key> matching
@@ -358,8 +353,7 @@
def db_num_entries(self):
"""Return the known number of entries in the lister db"""
return self.db_session.query(func.count('*')).select_from(self.MODEL) \
.scalar()
return self.db_session.query(func.count("*")).select_from(self.MODEL).scalar()
def db_inject_repo(self, model_dict):
"""Add/update a new repo to the db and mark it last_seen now.
@@ -372,7 +366,7 @@
object associated with the injection
"""
sql_repo = self.db_query_equal('uid', model_dict['uid'])
sql_repo = self.db_query_equal("uid", model_dict["uid"])
if not sql_repo:
sql_repo = self.MODEL(**model_dict)
@@ -384,8 +378,7 @@
return sql_repo
def task_dict(self, origin_type: str,
origin_url: str, **kwargs) -> Dict[str, Any]:
def task_dict(self, origin_type: str, origin_url: str, **kwargs) -> Dict[str, Any]:
"""Return special dict format for the tasks list
Args:
@@ -394,11 +387,11 @@
Returns:
the same information in a different form
"""
logger.debug('origin-url: %s, type: %s', origin_url, origin_type)
_type = 'load-%s' % origin_type
_policy = kwargs.get('policy', 'recurring')
priority = kwargs.get('priority')
kw = {'priority': priority} if priority else {}
logger.debug("origin-url: %s, type: %s", origin_url, origin_type)
_type = "load-%s" % origin_type
_policy = kwargs.get("policy", "recurring")
priority = kwargs.get("priority")
kw = {"priority": priority} if priority else {}
return utils.create_task_dict(_type, _policy, url=origin_url, **kw)
def string_pattern_check(self, a, b, c=None):
@@ -420,14 +413,15 @@
pattern.
"""
if isinstance(a, str):
a_pattern = re.sub('[a-zA-Z0-9]',
'[a-zA-Z0-9]',
re.escape(a))
if (isinstance(b, str) and (re.match(a_pattern, b) is None)
or isinstance(c, str) and
(re.match(a_pattern, c) is None)):
a_pattern = re.sub("[a-zA-Z0-9]", "[a-zA-Z0-9]", re.escape(a))
if (
isinstance(b, str)
and (re.match(a_pattern, b) is None)
or isinstance(c, str)
and (re.match(a_pattern, c) is None)
):
logger.debug(a_pattern)
raise TypeError('incomparable string patterns detected')
raise TypeError("incomparable string patterns detected")
def inject_repo_data_into_db(self, models_list: List[Dict]) -> Dict:
"""Inject data into the db.
@@ -441,11 +435,12 @@
"""
injected_repos = {}
for m in models_list:
injected_repos[m['uid']] = self.db_inject_repo(m)
injected_repos[m["uid"]] = self.db_inject_repo(m)
return injected_repos
def schedule_missing_tasks(
self, models_list: List[Dict], injected_repos: Dict) -> None:
self, models_list: List[Dict], injected_repos: Dict
) -> None:
"""Schedule any newly created db entries that do not have been
scheduled yet.
@@ -463,20 +458,17 @@
tasks = {}
def _task_key(m):
return '%s-%s' % (
m['type'],
json.dumps(m['arguments'], sort_keys=True)
)
return "%s-%s" % (m["type"], json.dumps(m["arguments"], sort_keys=True))
for m in models_list:
ir = injected_repos[m['uid']]
ir = injected_repos[m["uid"]]
if not ir.task_id:
# Patching the model instance to add the policy/priority task
# scheduling
if 'policy' in self.config:
m['policy'] = self.config['policy']
if 'priority' in self.config:
m['priority'] = self.config['priority']
if "policy" in self.config:
m["policy"] = self.config["policy"]
if "priority" in self.config:
m["priority"] = self.config["priority"]
task_dict = self.task_dict(**m)
tasks[_task_key(task_dict)] = (ir, m, task_dict)
@@ -485,7 +477,7 @@
new_tasks = self.scheduler.create_tasks(list(grouped_tasks))
for task in new_tasks:
ir, m, _ = tasks[_task_key(task)]
ir.task_id = task['id']
ir.task_id = task["id"]
def ingest_data(self, identifier: int, checks: bool = False):
"""The core data fetch sequence. Request server endpoint. Simplify and
@@ -523,13 +515,7 @@
"""
datepath = utcnow().isoformat()
fname = os.path.join(
self.config['cache_dir'],
datepath + '.gz',
)
fname = os.path.join(self.config["cache_dir"], datepath + ".gz",)
with gzip.open(fname, 'w') as f:
f.write(bytes(
self.transport_response_to_string(response),
'UTF-8'
))
with gzip.open(fname, "w") as f:
f.write(bytes(self.transport_response_to_string(response), "UTF-8"))
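For reference, a configuration file matching the DEFAULT_CONFIG structure above (loaded via CONFIG_BASE_FILENAME, e.g. lister_bitbucket.yml; the values shown are just the defaults from the code) would look like:

    scheduler:
      cls: remote
      args:
        url: http://localhost:5008/
    lister:
      cls: local
      args:
        db: postgresql:///lister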


@@ -29,14 +29,14 @@ class ListerHttpTransport(abc.ABC):
To be used in conjunction with ListerBase or a subclass of it.
"""
DEFAULT_URL = None # type: Optional[str]
PATH_TEMPLATE = \
AbstractAttribute(
'string containing a python string format pattern that produces'
' the API endpoint path for listing stored repositories when given'
' an index, e.g., "/repositories?after=%s". To be implemented in'
' the API-specific class inheriting this.'
) # type: Union[AbstractAttribute, Optional[str]]
PATH_TEMPLATE = AbstractAttribute(
"string containing a python string format pattern that produces"
" the API endpoint path for listing stored repositories when given"
' an index, e.g., "/repositories?after=%s". To be implemented in'
" the API-specific class inheriting this."
) # type: Union[AbstractAttribute, Optional[str]]
EXPECTED_STATUS_CODES = (200, 429, 403, 404)
@@ -45,9 +45,7 @@
MAY BE OVERRIDDEN if request headers are needed.
"""
return {
'User-Agent': USER_AGENT_TEMPLATE % self.lister_version
}
return {"User-Agent": USER_AGENT_TEMPLATE % self.lister_version}
def request_instance_credentials(self) -> List[Dict[str, Any]]:
"""Returns dictionary of any credentials configuration needed by the
@@ -82,7 +80,7 @@
list of credential dicts for the current lister.
"""
all_creds = self.config.get('credentials') # type: ignore
all_creds = self.config.get("credentials") # type: ignore
if not all_creds:
return []
lister_creds = all_creds.get(self.LISTER_NAME, {}) # type: ignore
@@ -110,14 +108,16 @@
"""
params = {}
params['headers'] = self.request_headers() or {}
params["headers"] = self.request_headers() or {}
creds = self.request_instance_credentials()
if not creds:
return params
auth = random.choice(creds) if creds else None
if auth:
params['auth'] = (auth['username'], # type: ignore
auth['password'])
params["auth"] = (
auth["username"], # type: ignore
auth["password"],
)
return params
def transport_quota_check(self, response):
@@ -130,7 +130,7 @@
"""
if response.status_code == 429: # HTTP too many requests
retry_after = response.headers.get('Retry-After', self.back_off())
retry_after = response.headers.get("Retry-After", self.back_off())
try:
# might be seconds
return True, float(retry_after)
@@ -145,17 +145,16 @@
def __init__(self, url=None):
if not url:
url = self.config.get('url')
url = self.config.get("url")
if not url:
url = self.DEFAULT_URL
if not url:
raise NameError('HTTP Lister Transport requires an url.')
raise NameError("HTTP Lister Transport requires an url.")
self.url = url # eg. 'https://api.github.com'
self.session = requests.Session()
self.lister_version = __version__
def _transport_action(
self, identifier: str, method: str = 'get') -> Response:
def _transport_action(self, identifier: str, method: str = "get") -> Response:
"""Permit to ask information to the api prior to actually executing
query.
@@ -163,16 +162,16 @@
path = self.request_uri(identifier)
params = self.request_params(identifier)
logger.debug('path: %s', path)
logger.debug('params: %s', params)
logger.debug('method: %s', method)
logger.debug("path: %s", path)
logger.debug("params: %s", params)
logger.debug("method: %s", method)
try:
if method == 'head':
if method == "head":
response = self.session.head(path, **params)
else:
response = self.session.get(path, **params)
except requests.exceptions.ConnectionError as e:
logger.warning('Failed to fetch %s: %s', path, e)
logger.warning("Failed to fetch %s: %s", path, e)
raise FetchError(e)
else:
if response.status_code not in self.EXPECTED_STATUS_CODES:
@@ -183,7 +182,7 @@
"""Retrieve head information on api.
"""
return self._transport_action(identifier, method='head')
return self._transport_action(identifier, method="head")
def transport_request(self, identifier: str) -> Response:
"""Implements ListerBase.transport_request for HTTP using Requests.
@@ -198,10 +197,10 @@
Requests responses.
"""
s = pformat(response.request.path_url)
s += '\n#\n' + pformat(response.request.headers)
s += '\n#\n' + pformat(response.status_code)
s += '\n#\n' + pformat(response.headers)
s += '\n#\n'
s += "\n#\n" + pformat(response.request.headers)
s += "\n#\n" + pformat(response.status_code)
s += "\n#\n" + pformat(response.headers)
s += "\n#\n"
try: # json?
s += pformat(response.json())
except Exception: # not json
@@ -219,9 +218,10 @@ class ListerOnePageApiTransport(ListerHttpTransport):
To be used in conjunction with ListerBase or a subclass of it.
"""
PAGE = AbstractAttribute(
"URL of the API's unique page to retrieve and parse "
"for information") # type: Union[AbstractAttribute, str]
"URL of the API's unique page to retrieve and parse " "for information"
) # type: Union[AbstractAttribute, str]
PATH_TEMPLATE = None # we do not use it
def __init__(self, url=None):


@@ -25,12 +25,12 @@ class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta):
class ModelBase(SQLBase, metaclass=ABCSQLMeta):
"""a common repository"""
__abstract__ = True
__tablename__ = \
AbstractAttribute # type: Union[Type[AbstractAttribute], str]
__tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str]
uid = AbstractAttribute(
'Column(<uid_type>, primary_key=True)'
"Column(<uid_type>, primary_key=True)"
) # type: Union[AbstractAttribute, Column]
name = Column(String, index=True)
@@ -44,19 +44,18 @@ class ModelBase(SQLBase, metaclass=ABCSQLMeta):
task_id = Column(Integer)
def __init__(self, **kw):
kw['last_seen'] = datetime.now()
kw["last_seen"] = datetime.now()
super().__init__(**kw)
class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta):
__abstract__ = True
__tablename__ = \
AbstractAttribute # type: Union[Type[AbstractAttribute], str]
__tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str]
# The value used for sorting, segmenting, or api query paging,
# because uids aren't always sequential.
indexable = AbstractAttribute(
'Column(<indexable_type>, index=True)'
"Column(<indexable_type>, index=True)"
) # type: Union[AbstractAttribute, Column]
@@ -72,8 +71,8 @@ def initialize(db_engine, drop_tables=False, **kwargs):
(re)creating them.
"""
if drop_tables:
logger.info('Dropping tables')
logger.info("Dropping tables")
SQLBase.metadata.drop_all(db_engine, checkfirst=True)
logger.info('Creating tables')
logger.info("Creating tables")
SQLBase.metadata.create_all(db_engine, checkfirst=True)


@@ -37,6 +37,7 @@ class PageByPageLister(ListerBase):
def get_next_target_from_response
"""
@abc.abstractmethod
def get_next_target_from_response(self, response):
"""Find the next server endpoint page given the entire response.
@@ -87,7 +88,7 @@
"""
for m in models_list:
sql_repo = self.db_query_equal('uid', m['uid'])
sql_repo = self.db_query_equal("uid", m["uid"])
if sql_repo:
return False
return models_list
@@ -110,7 +111,7 @@
nothing
"""
status = 'uneventful'
status = "uneventful"
page = min_bound or 0
loop_count = 0
@@ -118,32 +119,30 @@
self.max_page = max_bound
while self.is_within_bounds(page, self.min_page, self.max_page):
logging.info('listing repos starting at %s' % page)
logging.info("listing repos starting at %s" % page)
response, injected_repos = self.ingest_data(page,
checks=check_existence)
response, injected_repos = self.ingest_data(page, checks=check_existence)
if not response and not injected_repos:
logging.info('No response from api server, stopping')
logging.info("No response from api server, stopping")
break
elif not injected_repos:
logging.info('Repositories already seen, stopping')
logging.info("Repositories already seen, stopping")
break
status = 'eventful'
status = "eventful"
next_page = self.get_next_target_from_response(response)
# termination condition
if (next_page is None) or (next_page == page):
logging.info('stopping after page %s, no next link found' %
page)
logging.info("stopping after page %s, no next link found" % page)
break
else:
page = next_page
loop_count += 1
if loop_count == 20:
logging.info('flushing updates')
logging.info("flushing updates")
loop_count = 0
self.db_session.commit()
self.db_session = self.mk_session()
@@ -151,7 +150,7 @@
self.db_session.commit()
self.db_session = self.mk_session()
return {'status': status}
return {"status": status}
class PageByPageHttpLister(ListerHttpTransport, PageByPageLister):
@@ -159,6 +158,7 @@ class PageByPageHttpLister(ListerHttpTransport, PageByPageLister):
combining PageByPageLister and ListerHttpTransport.
"""
def __init__(self, url=None, override_config=None):
PageByPageLister.__init__(self, override_config=override_config)
ListerHttpTransport.__init__(self, url=url)


@@ -24,6 +24,7 @@ class SimpleLister(ListerBase):
information and stores those in db
"""
flush_packet_db = 2
"""Number of iterations in-between write flushes of lister repositories to
db (see fn:`ingest_data`).
@@ -57,14 +58,14 @@
all_injected = []
for i, models in enumerate(utils.grouper(models_list, n=100), start=1):
models = list(models)
logging.debug('models: %s' % len(models))
logging.debug("models: %s" % len(models))
# inject into local db
injected = self.inject_repo_data_into_db(models)
# queue workers
self.schedule_missing_tasks(models, injected)
all_injected.append(injected)
if (i % self.flush_packet_db) == 0:
logger.debug('Flushing updates at index %s', i)
logger.debug("Flushing updates at index %s", i)
self.db_session.commit()
self.db_session = self.mk_session()
@@ -88,9 +89,9 @@
dump_not_used_identifier = 0
response, injected_repos = self.ingest_data(dump_not_used_identifier)
if not response and not injected_repos:
logging.info('No response from api server, stopping')
status = 'uneventful'
logging.info("No response from api server, stopping")
status = "uneventful"
else:
status = 'eventful'
status = "eventful"
return {'status': status}
return {"status": status}


@@ -19,13 +19,14 @@ logger = logging.getLogger(__name__)
@pytest.fixture
def swh_listers(request, postgresql_proc, postgresql, swh_scheduler):
db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format(
host=postgresql_proc.host,
port=postgresql_proc.port,
user='postgres',
dbname='tests')
db_url = "postgresql://{user}@{host}:{port}/{dbname}".format(
host=postgresql_proc.host,
port=postgresql_proc.port,
user="postgres",
dbname="tests",
)
logger.debug('lister db_url: %s', db_url)
logger.debug("lister db_url: %s", db_url)
listers = {}
@@ -37,11 +38,13 @@ def swh_listers(request, postgresql_proc, postgresql, swh_scheduler):
initialize(create_engine(db_url), drop_tables=True)
# Add the load-archive-files expected by some listers (gnu, cran, ...)
swh_scheduler.create_task_type({
'type': 'load-archive-files',
'description': 'Load archive files.',
'backend_name': 'swh.loader.package.tasks.LoadArchive',
'default_interval': '1 day',
})
swh_scheduler.create_task_type(
{
"type": "load-archive-files",
"description": "Load archive files.",
"backend_name": "swh.loader.package.tasks.LoadArchive",
"default_interval": "1 day",
}
)
return listers


@@ -13,8 +13,8 @@ from swh.lister.core.abstractattribute import AbstractAttribute
class BaseClass(abc.ABC):
v1 = AbstractAttribute # type: Any
v2 = AbstractAttribute() # type: Any
v3 = AbstractAttribute('changed docstring') # type: Any
v4 = 'qux'
v3 = AbstractAttribute("changed docstring") # type: Any
v4 = "qux"
class BadSubclass1(BaseClass):
@@ -22,19 +22,19 @@ class BadSubclass1(BaseClass):
class BadSubclass2(BaseClass):
v1 = 'foo'
v2 = 'bar'
v1 = "foo"
v2 = "bar"
class BadSubclass3(BaseClass):
v2 = 'bar'
v3 = 'baz'
v2 = "bar"
v3 = "baz"
class GoodSubclass(BaseClass):
v1 = 'foo'
v2 = 'bar'
v3 = 'baz'
v1 = "foo"
v2 = "bar"
v3 = "baz"
class TestAbstractAttributes(unittest.TestCase):
@@ -54,13 +54,12 @@ class TestAbstractAttributes(unittest.TestCase):
self.assertIsInstance(GoodSubclass(), GoodSubclass)
gsc = GoodSubclass()
self.assertEqual(gsc.v1, 'foo')
self.assertEqual(gsc.v2, 'bar')
self.assertEqual(gsc.v3, 'baz')
self.assertEqual(gsc.v4, 'qux')
self.assertEqual(gsc.v1, "foo")
self.assertEqual(gsc.v2, "bar")
self.assertEqual(gsc.v3, "baz")
self.assertEqual(gsc.v4, "qux")
def test_aa_docstrings(self):
self.assertEqual(BaseClass.v1.__doc__, AbstractAttribute.__doc__)
self.assertEqual(BaseClass.v2.__doc__, AbstractAttribute.__doc__)
self.assertEqual(BaseClass.v3.__doc__,
'AbstractAttribute: changed docstring')
self.assertEqual(BaseClass.v3.__doc__, "AbstractAttribute: changed docstring")

View file

@@ -9,7 +9,7 @@ from swh.lister.core.indexing_lister import IndexingLister
class MockedIndexingListerDbPartitionIndices(IndexingLister):
# Abstract Attribute boilerplate
LISTER_NAME = 'DbPartitionIndices'
LISTER_NAME = "DbPartitionIndices"
MODEL = type(None)
# ABC boilerplate
@@ -33,9 +33,7 @@ class MockedIndexingListerDbPartitionIndices(IndexingLister):
def test_db_partition_indices():
m = MockedIndexingListerDbPartitionIndices(
num_entries=1000,
first_index=1,
last_index=10001,
num_entries=1000, first_index=1, last_index=10001,
)
assert m
@@ -49,9 +47,7 @@ def test_db_partition_indices():
def test_db_partition_indices_zero_first():
m = MockedIndexingListerDbPartitionIndices(
num_entries=1000,
first_index=0,
last_index=10000,
num_entries=1000, first_index=0, last_index=10000,
)
assert m
@@ -65,9 +61,7 @@ def test_db_partition_indices_zero_first():
def test_db_partition_indices_small_index_range():
m = MockedIndexingListerDbPartitionIndices(
num_entries=5000,
first_index=0,
last_index=5,
num_entries=5000, first_index=0, last_index=5,
)
assert m
@@ -78,8 +72,8 @@ def test_db_partition_indices_small_index_range():
def test_db_partition_indices_date_indices():
# 24 hour delta
first = datetime.datetime.fromisoformat('2019-11-01T00:00:00+00:00')
last = datetime.datetime.fromisoformat('2019-11-02T00:00:00+00:00')
first = datetime.datetime.fromisoformat("2019-11-01T00:00:00+00:00")
last = datetime.datetime.fromisoformat("2019-11-02T00:00:00+00:00")
m = MockedIndexingListerDbPartitionIndices(
# one entry per second
@@ -102,9 +96,7 @@ def test_db_partition_indices_date_indices():
def test_db_partition_indices_float_index_range():
m = MockedIndexingListerDbPartitionIndices(
num_entries=10000,
first_index=0.0,
last_index=1.0,
num_entries=10000, first_index=0.0, last_index=1.0,
)
assert m
@@ -120,9 +112,7 @@ def test_db_partition_indices_float_index_range():
def test_db_partition_indices_uneven_int_index_range():
m = MockedIndexingListerDbPartitionIndices(
num_entries=5641,
first_index=0,
last_index=10000,
num_entries=5641, first_index=0, last_index=10000,
)
assert m

View file

@@ -22,8 +22,9 @@ def noop(*args, **kwargs):
def test_version_generation():
assert swh.lister.__version__ != 'devel', \
"Make sure swh.lister is installed (e.g. pip install -e .)"
assert (
swh.lister.__version__ != "devel"
), "Make sure swh.lister is installed (e.g. pip install -e .)"
class HttpListerTesterBase(abc.ABC):
@@ -35,13 +36,17 @@ class HttpListerTesterBase(abc.ABC):
to customize for a specific listing service.
"""
Lister = AbstractAttribute(
'Lister class to test') # type: Union[AbstractAttribute, Type[Any]]
"Lister class to test"
) # type: Union[AbstractAttribute, Type[Any]]
lister_subdir = AbstractAttribute(
'bitbucket, github, etc.') # type: Union[AbstractAttribute, str]
"bitbucket, github, etc."
) # type: Union[AbstractAttribute, str]
good_api_response_file = AbstractAttribute(
'Example good response body') # type: Union[AbstractAttribute, str]
LISTER_NAME = 'fake-lister'
"Example good response body"
) # type: Union[AbstractAttribute, str]
LISTER_NAME = "fake-lister"
# May need to override this if the headers are used for something
def response_headers(self, request):
@@ -53,7 +58,7 @@ class HttpListerTesterBase(abc.ABC):
def mock_rate_quota(self, n, request, context):
self.rate_limit += 1
context.status_code = 429
context.headers['Retry-After'] = '1'
context.headers["Retry-After"] = "1"
return '{"error":"dummy"}'
def __init__(self, *args, **kwargs):
@@ -89,8 +94,9 @@ class HttpListerTesterBase(abc.ABC):
"""
if override_config or self.fl is None:
self.fl = self.Lister(url='https://fakeurl',
override_config=override_config)
self.fl = self.Lister(
url="https://fakeurl", override_config=override_config
)
self.fl.INITIAL_BACKOFF = 1
self.fl.reset_backoff()
@@ -105,23 +111,25 @@ class HttpListerTesterBase(abc.ABC):
task_id = 0
current_nb_tasks = len(self.scheduler_tasks)
if current_nb_tasks > 0:
task_id = self.scheduler_tasks[-1]['id'] + 1
task_id = self.scheduler_tasks[-1]["id"] + 1
for task in tasks:
scheduler_task = dict(task)
scheduler_task.update({
'status': 'next_run_not_scheduled',
'retries_left': 0,
'priority': None,
'id': task_id,
'current_interval': datetime.timedelta(days=64)
})
scheduler_task.update(
{
"status": "next_run_not_scheduled",
"retries_left": 0,
"priority": None,
"id": task_id,
"current_interval": datetime.timedelta(days=64),
}
)
self.scheduler_tasks.append(scheduler_task)
task_id = task_id + 1
return self.scheduler_tasks[current_nb_tasks:]
def _disable_tasks(task_ids):
for task_id in task_ids:
self.scheduler_tasks[task_id]['status'] = 'disabled'
self.scheduler_tasks[task_id]["status"] = "disabled"
fl.scheduler.create_tasks = Mock(wraps=_create_tasks)
fl.scheduler.disable_tasks = Mock(wraps=_disable_tasks)
@@ -167,26 +175,29 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
to customize for a specific listing service.
"""
last_index = AbstractAttribute(
'Last index '
'in good_api_response') # type: Union[AbstractAttribute, int]
"Last index " "in good_api_response"
) # type: Union[AbstractAttribute, int]
first_index = AbstractAttribute(
'First index in '
' good_api_response') # type: Union[AbstractAttribute, Optional[int]]
"First index in " " good_api_response"
) # type: Union[AbstractAttribute, Optional[int]]
bad_api_response_file = AbstractAttribute(
'Example bad response body') # type: Union[AbstractAttribute, str]
"Example bad response body"
) # type: Union[AbstractAttribute, str]
entries_per_page = AbstractAttribute(
'Number of results in '
'good response') # type: Union[AbstractAttribute, int]
"Number of results in " "good response"
) # type: Union[AbstractAttribute, int]
test_re = AbstractAttribute(
'Compiled regex matching the server url. Must capture the '
'index value.') # type: Union[AbstractAttribute, Pattern]
"Compiled regex matching the server url. Must capture the " "index value."
) # type: Union[AbstractAttribute, Pattern]
convert_type = str # type: Callable[..., Any]
"""static method used to convert the "request_index" to its right type (for
indexing listers for example, this is in accordance with the model's
"indexable" column).
"""
def mock_response(self, request, context):
self.fl.reset_backoff()
self.rate_limit = 1
@@ -200,9 +211,11 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
else:
response_file = self.bad_api_response_file
with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
response_file),
'r', encoding='utf-8') as r:
with open(
"swh/lister/%s/tests/%s" % (self.lister_subdir, response_file),
"r",
encoding="utf-8",
) as r:
return r.read()
def request_index(self, request):
@@ -214,12 +227,9 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
http_mocker.get(self.test_re, text=self.mock_response)
db = init_db()
fl = self.get_fl(override_config={
'lister': {
'cls': 'local',
'args': {'db': db.url()}
}
})
fl = self.get_fl(
override_config={"lister": {"cls": "local", "args": {"db": db.url()}}}
)
fl.db = db
self.init_db(db, fl.MODEL)
@@ -233,8 +243,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
fl.run()
self.assertEqual(fl.db_last_index(), self.last_index)
ingested_repos = list(fl.db_query_range(self.first_index,
self.last_index))
ingested_repos = list(fl.db_query_range(self.first_index, self.last_index))
self.assertEqual(len(ingested_repos), self.entries_per_page)
@requests_mock.Mocker()
@@ -307,13 +316,12 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
"""
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
li = fl.transport_response_simplified(
self.get_api_response(self.first_index))
li = fl.transport_response_simplified(self.get_api_response(self.first_index))
di = li[0]
self.assertIsInstance(di, dict)
pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')]
pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")]
for k in pubs:
if k not in ['last_seen', 'task_id', 'id']:
if k not in ["last_seen", "task_id", "id"]:
self.assertIn(k, di)
@requests_mock.Mocker()
@@ -322,7 +330,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
"""
http_mocker.get(self.test_re, text=self.mock_limit_twice_response)
with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock:
with patch.object(time, "sleep", wraps=time.sleep) as sleepmock:
self.get_api_response(self.first_index)
self.assertEqual(sleepmock.call_count, 2)
@@ -332,13 +340,14 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
fl.run()
self.assertNotEqual(len(http_mocker.request_history), 0)
for request in http_mocker.request_history:
assert 'User-Agent' in request.headers
user_agent = request.headers['User-Agent']
assert 'Software Heritage Lister' in user_agent
assert "User-Agent" in request.headers
user_agent = request.headers["User-Agent"]
assert "Software Heritage Lister" in user_agent
assert swh.lister.__version__ in user_agent
def scheduled_tasks_test(self, next_api_response_file, next_last_index,
http_mocker):
def scheduled_tasks_test(
self, next_api_response_file, next_last_index, http_mocker
):
"""Check that no loading tasks get disabled when processing a new
page of repositories returned by a forge API
"""
@@ -361,7 +370,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
# check tasks are not disabled
for task in self.scheduler_tasks:
self.assertTrue(task['status'] != 'disabled')
self.assertTrue(task["status"] != "disabled")
class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
@@ -372,20 +381,20 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
to customize for a specific listing service.
"""
entries = AbstractAttribute(
'Number of results '
'in good response') # type: Union[AbstractAttribute, int]
"Number of results " "in good response"
) # type: Union[AbstractAttribute, int]
PAGE = AbstractAttribute(
"URL of the server api's unique page to retrieve and "
"parse for information") # type: Union[AbstractAttribute, str]
"URL of the server api's unique page to retrieve and " "parse for information"
) # type: Union[AbstractAttribute, str]
def get_fl(self, override_config=None):
"""Retrieve an instance of fake lister (fl).
"""
if override_config or self.fl is None:
self.fl = self.Lister(
override_config=override_config)
self.fl = self.Lister(override_config=override_config)
self.fl.INITIAL_BACKOFF = 1
self.fl.reset_backoff()
@@ -399,9 +408,11 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
context.headers.update(custom_headers)
response_file = self.good_api_response_file
with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
response_file),
'r', encoding='utf-8') as r:
with open(
"swh/lister/%s/tests/%s" % (self.lister_subdir, response_file),
"r",
encoding="utf-8",
) as r:
return r.read()
@requests_mock.Mocker()
@@ -410,7 +421,7 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
"""
http_mocker.get(self.PAGE, text=self.mock_limit_twice_response)
with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock:
with patch.object(time, "sleep", wraps=time.sleep) as sleepmock:
self.get_api_response(0)
self.assertEqual(sleepmock.call_count, 2)
@@ -426,9 +437,9 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
li = fl.transport_response_simplified(li)
di = li[0]
self.assertIsInstance(di, dict)
pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')]
pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")]
for k in pubs:
if k not in ['last_seen', 'task_id', 'id']:
if k not in ["last_seen", "task_id", "id"]:
self.assertIn(k, di)
@requests_mock.Mocker()
@@ -437,8 +448,6 @@ class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
"""
http_mocker.get(self.PAGE, text=self.mock_response)
li = self.get_fl().list_packages(
self.get_api_response(0)
)
li = self.get_fl().list_packages(self.get_api_response(0))
self.assertIsInstance(li, list)
self.assertEqual(len(li), self.entries)

View file

@@ -16,7 +16,7 @@ class BadSubclass1(ModelBase):
class BadSubclass2(ModelBase):
__abstract__ = True
__tablename__ = 'foo'
__tablename__ = "foo"
class BadSubclass3(BadSubclass2):
@@ -36,7 +36,7 @@ class IndexingBadSubclass(IndexingModelBase):
class IndexingBadSubclass2(IndexingModelBase):
__abstract__ = True
__tablename__ = 'foo'
__tablename__ = "foo"
class IndexingBadSubclass3(IndexingBadSubclass2):
@@ -47,7 +47,7 @@ class IndexingBadSubclass3(IndexingBadSubclass2):
class IndexingGoodSubclass(IndexingModelBase):
uid = Column(Integer, primary_key=True)
indexable = Column(Integer, index=True)
__tablename__ = 'bar'
__tablename__ = "bar"
class TestModel(unittest.TestCase):
@@ -65,10 +65,10 @@ class TestModel(unittest.TestCase):
BadSubclass3()
self.assertIsInstance(GoodSubclass(), GoodSubclass)
gsc = GoodSubclass(uid='uid')
gsc = GoodSubclass(uid="uid")
self.assertEqual(gsc.__tablename__, 'foo')
self.assertEqual(gsc.uid, 'uid')
self.assertEqual(gsc.__tablename__, "foo")
self.assertEqual(gsc.uid, "uid")
def test_indexing_model_instancing(self):
with self.assertRaises(TypeError):
@@ -84,8 +84,8 @@ class TestModel(unittest.TestCase):
IndexingBadSubclass3()
self.assertIsInstance(IndexingGoodSubclass(), IndexingGoodSubclass)
gsc = IndexingGoodSubclass(uid='uid', indexable='indexable')
gsc = IndexingGoodSubclass(uid="uid", indexable="indexable")
self.assertEqual(gsc.__tablename__, 'bar')
self.assertEqual(gsc.uid, 'uid')
self.assertEqual(gsc.indexable, 'indexable')
self.assertEqual(gsc.__tablename__, "bar")
self.assertEqual(gsc.uid, "uid")
self.assertEqual(gsc.indexable, "indexable")

View file

@@ -7,7 +7,8 @@ def register():
from .models import CRANModel
from .lister import CRANLister
return {'models': [CRANModel],
'lister': CRANLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [CRANModel],
"lister": CRANLister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@@ -19,16 +19,23 @@ from swh.scheduler.utils import create_task_dict
logger = logging.getLogger(__name__)
CRAN_MIRROR = 'https://cran.r-project.org'
CRAN_MIRROR = "https://cran.r-project.org"
class CRANLister(SimpleLister):
MODEL = CRANModel
LISTER_NAME = 'cran'
instance = 'cran'
LISTER_NAME = "cran"
instance = "cran"
def task_dict(self, origin_type, origin_url, version=None, html_url=None,
policy=None, **kwargs):
def task_dict(
self,
origin_type,
origin_url,
version=None,
html_url=None,
policy=None,
**kwargs,
):
"""Return task format dict. This creates tasks with args and kwargs
set, for example::
@@ -43,15 +50,15 @@ class CRANLister(SimpleLister):
"""
if not policy:
policy = 'oneshot'
policy = "oneshot"
artifact_url = html_url
assert origin_type == 'tar'
assert origin_type == "tar"
return create_task_dict(
'load-cran', policy,
url=origin_url, artifacts=[{
'url': artifact_url,
'version': version
}], retries_left=3
"load-cran",
policy,
url=origin_url,
artifacts=[{"url": artifact_url, "version": version}],
retries_left=3,
)
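The resulting scheduler task, sketched per the assertions in the CRAN tests below (URLs and version are illustrative):

    # {"type": "load-cran", "policy": "oneshot", "retries_left": 3,
    #  "arguments": {"args": [],
    #                "kwargs": {"url": ".../package=something",
    #                           "artifacts": [{"url": "...tar.gz",
    #                                          "version": "0.0.1"}]}}}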
def safely_issue_request(self, identifier):
@@ -91,23 +98,22 @@ class CRANLister(SimpleLister):
"""
return read_cran_data()
def get_model_from_repo(
self, repo: Mapping[str, str]) -> Mapping[str, str]:
def get_model_from_repo(self, repo: Mapping[str, str]) -> Mapping[str, str]:
"""Transform from repository representation to model
"""
logger.debug('repo: %s', repo)
logger.debug("repo: %s", repo)
origin_url, artifact_url = compute_origin_urls(repo)
package = repo['Package']
version = repo['Version']
package = repo["Package"]
version = repo["Version"]
return {
'uid': f'{package}-{version}',
'name': package,
'full_name': repo['Title'],
'version': version,
'html_url': artifact_url,
'origin_url': origin_url,
'origin_type': 'tar',
"uid": f"{package}-{version}",
"name": package,
"full_name": repo["Title"],
"version": version,
"html_url": artifact_url,
"origin_url": origin_url,
"origin_type": "tar",
}
@@ -115,11 +121,10 @@ def read_cran_data() -> List[Mapping[str, str]]:
"""Execute r script to read cran listing.
"""
filepath = pkg_resources.resource_filename('swh.lister.cran',
'list_all_packages.R')
logger.debug('script list-all-packages.R path: %s', filepath)
filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R")
logger.debug("script list-all-packages.R path: %s", filepath)
response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False)
return json.loads(response.stdout.decode('utf-8'))
return json.loads(response.stdout.decode("utf-8"))
def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]:
@@ -132,8 +137,8 @@ def compute_origin_urls(repo: Mapping[str, str]) -> Tuple[str, str]:
the tuple project url, artifact url
"""
package = repo['Package']
version = repo['Version']
origin_url = f'{CRAN_MIRROR}/package={package}'
artifact_url = f'{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz'
package = repo["Package"]
version = repo["Version"]
origin_url = f"{CRAN_MIRROR}/package={package}"
artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz"
return origin_url, artifact_url
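A quick usage sketch, matching the test further down:

    origin_url, artifact_url = compute_origin_urls(
        {"Package": "something", "Version": "0.0.1"}
    )
    assert origin_url == "https://cran.r-project.org/package=something"
    assert artifact_url == "https://cran.r-project.org/src/contrib/something_0.0.1.tar.gz"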

View file

@@ -11,7 +11,8 @@ class CRANModel(ModelBase):
"""a CRAN repository representation
"""
__tablename__ = 'cran_repo'
__tablename__ = "cran_repo"
uid = Column(String, primary_key=True)
version = Column(String)

View file

@@ -7,12 +7,12 @@ from celery import shared_task
from swh.lister.cran.lister import CRANLister
@shared_task(name=__name__ + '.CRANListerTask')
@shared_task(name=__name__ + ".CRANListerTask")
def list_cran(**lister_args):
'''Lister task for the CRAN registry'''
"""Lister task for the CRAN registry"""
return CRANLister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_cran(swh_listers):
lister = swh_listers['cran']
lister = swh_listers["cran"]
# Add the load-deb-package in the scheduler backend
lister.scheduler.create_task_type({
'type': 'load-cran',
'description': 'Load a CRAN package',
'backend_name': 'swh.loader.package.cran.tasks.LoaderCRAN',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-cran",
"description": "Load a CRAN package",
"backend_name": "swh.loader.package.cran.tasks.LoaderCRAN",
"default_interval": "1 day",
}
)
return lister

View file

@@ -13,28 +13,25 @@ from swh.lister.cran.lister import compute_origin_urls, CRAN_MIRROR
def test_cran_compute_origin_urls():
pack = 'something'
vers = '0.0.1'
origin_url, artifact_url = compute_origin_urls({
'Package': pack,
'Version': vers,
})
pack = "something"
vers = "0.0.1"
origin_url, artifact_url = compute_origin_urls({"Package": pack, "Version": vers,})
assert origin_url == f'{CRAN_MIRROR}/package={pack}'
assert artifact_url == f'{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz'
assert origin_url == f"{CRAN_MIRROR}/package={pack}"
assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz"
def test_cran_compute_origin_urls_failure():
for incomplete_repo in [{'Version': '0.0.1'}, {'Package': 'package'}, {}]:
for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]:
with pytest.raises(KeyError):
compute_origin_urls(incomplete_repo)
@patch('swh.lister.cran.lister.read_cran_data')
@patch("swh.lister.cran.lister.read_cran_data")
def test_cran_lister_cran(mock_cran, datadir, lister_cran):
lister = lister_cran
with open(path.join(datadir, 'list-r-packages.json')) as f:
with open(path.join(datadir, "list-r-packages.json")) as f:
data = json.loads(f.read())
mock_cran.return_value = data
@@ -42,31 +39,33 @@ def test_cran_lister_cran(mock_cran, datadir, lister_cran):
lister.run()
r = lister.scheduler.search_tasks(task_type='load-cran')
r = lister.scheduler.search_tasks(task_type="load-cran")
assert len(r) == 6
for row in r:
assert row['type'] == 'load-cran'
assert row["type"] == "load-cran"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 2
assert set(kwargs.keys()) == {'url', 'artifacts'}
assert set(kwargs.keys()) == {"url", "artifacts"}
artifacts = kwargs['artifacts']
artifacts = kwargs["artifacts"]
assert len(artifacts) == 1
assert set(artifacts[0].keys()) == {'url', 'version'}
assert set(artifacts[0].keys()) == {"url", "version"}
assert row['policy'] == 'oneshot'
assert row['retries_left'] == 3
assert row["policy"] == "oneshot"
assert row["retries_left"] == 3
origin_url = kwargs['url']
record = lister.db_session \
.query(lister.MODEL) \
.filter(origin_url == origin_url).first()
origin_url = kwargs["url"]
record = (
lister.db_session.query(lister.MODEL)
.filter(origin_url == origin_url)
.first()
)
assert record
assert record.uid == f'{record.name}-{record.version}'
assert record.uid == f"{record.name}-{record.version}"

View file

@@ -2,22 +2,20 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.cran.tasks.ping')
res = swh_app.send_task("swh.lister.cran.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.cran.tasks.CRANLister')
@patch("swh.lister.cran.tasks.CRANLister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked CRANLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.cran.tasks.CRANListerTask')
res = swh_app.send_task("swh.lister.cran.tasks.CRANListerTask")
assert res
res.wait()
assert res.successful()

View file

@@ -11,11 +11,13 @@ from typing import Any, List, Mapping
logger = logging.getLogger(__name__)
def debian_init(db_engine,
override_conf: Mapping[str, Any] = {},
distribution_name: str = 'Debian',
suites: List[str] = ['stretch', 'buster', 'bullseye'],
components: List[str] = ['main', 'contrib', 'non-free']):
def debian_init(
db_engine,
override_conf: Mapping[str, Any] = {},
distribution_name: str = "Debian",
suites: List[str] = ["stretch", "buster", "bullseye"],
components: List[str] = ["main", "contrib", "non-free"],
):
"""Initialize the debian data model.
Args:
@@ -28,30 +30,32 @@ def debian_init(db_engine,
"""
from swh.lister.debian.models import Distribution, Area
from sqlalchemy.orm import sessionmaker
db_session = sessionmaker(bind=db_engine)()
distrib = db_session.query(Distribution) \
.filter(Distribution.name == distribution_name) \
distrib = (
db_session.query(Distribution)
.filter(Distribution.name == distribution_name)
.one_or_none()
)
if distrib is None:
distrib = Distribution(
name=distribution_name, type='deb',
mirror_uri='http://deb.debian.org/debian/'
name=distribution_name,
type="deb",
mirror_uri="http://deb.debian.org/debian/",
)
db_session.add(distrib)
# Check the existing
existing_area = db_session.query(Area) \
.filter(Area.distribution == distrib) \
.all()
existing_area = db_session.query(Area).filter(Area.distribution == distrib).all()
existing_area = set([a.name for a in existing_area])
logger.debug('Area already known: %s', ', '.join(existing_area))
logger.debug("Area already known: %s", ", ".join(existing_area))
# Create only the new ones
for suite in suites:
for component in components:
area_name = f'{suite}/{component}'
area_name = f"{suite}/{component}"
if area_name in existing_area:
logger.debug("Area '%s' already set, skipping", area_name)
continue
@@ -64,7 +68,10 @@ def debian_init(db_engine,
def register() -> Mapping[str, Any]:
from .lister import DebianLister
return {'models': [DebianLister.MODEL],
'lister': DebianLister,
'task_modules': ['%s.tasks' % __name__],
'init': debian_init}
return {
"models": [DebianLister.MODEL],
"lister": DebianLister,
"task_modules": ["%s.tasks" % __name__],
"init": debian_init,
}
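A usage sketch of `debian_init`, mirroring the test fixtures below; the DSN is hypothetical:

    from sqlalchemy import create_engine

    engine = create_engine("postgresql://postgres@localhost/tests")  # hypothetical DSN
    debian_init(engine, distribution_name="Debian",
                suites=["stretch"], components=["main", "contrib"])
    # Re-running with the same arguments is idempotent: known areas are skipped.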

View file

@@ -17,7 +17,10 @@ from typing import Mapping, Optional, Dict, Any
from requests import Response
from swh.lister.debian.models import (
AreaSnapshot, Distribution, DistributionSnapshot, Package,
AreaSnapshot,
Distribution,
DistributionSnapshot,
Package,
TempPackage,
)
@@ -25,9 +28,9 @@ from swh.lister.core.lister_base import ListerBase, FetchError
from swh.lister.core.lister_transports import ListerHttpTransport
decompressors = {
'gz': lambda f: gzip.GzipFile(fileobj=f),
'bz2': bz2.BZ2File,
'xz': lzma.LZMAFile,
"gz": lambda f: gzip.GzipFile(fileobj=f),
"bz2": bz2.BZ2File,
"xz": lzma.LZMAFile,
}
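A sketch of how this mapping gets used, assuming a streamed `response` (see `request_params` below, which sets `stream=True`) and the `compression` value that `transport_request` succeeded with:

    decompressor = decompressors.get(compression)
    body = response.raw
    if decompressor:
        body = decompressor(body)  # e.g. gzip.GzipFile(fileobj=body) for "gz"
    # `body` now reads as the decompressed Sources index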
@@ -37,12 +40,15 @@ logger = logging.getLogger(__name__)
class DebianLister(ListerHttpTransport, ListerBase):
MODEL = Package
PATH_TEMPLATE = None
LISTER_NAME = 'debian'
instance = 'debian'
LISTER_NAME = "debian"
instance = "debian"
def __init__(self, distribution: str = 'Debian',
date: Optional[datetime.datetime] = None,
override_config: Mapping = {}):
def __init__(
self,
distribution: str = "Debian",
date: Optional[datetime.datetime] = None,
override_config: Mapping = {},
):
"""Initialize the debian lister for a given distribution at a given
date.
@@ -55,9 +61,10 @@ class DebianLister(ListerHttpTransport, ListerBase):
"""
ListerHttpTransport.__init__(self, url="notused")
ListerBase.__init__(self, override_config=override_config)
self.distribution = override_config.get('distribution', distribution)
self.date = override_config.get('date', date) or datetime.datetime.now(
tz=datetime.timezone.utc)
self.distribution = override_config.get("distribution", distribution)
self.date = override_config.get("date", date) or datetime.datetime.now(
tz=datetime.timezone.utc
)
def transport_request(self, identifier) -> Response:
"""Subvert ListerHttpTransport.transport_request, to try several
@@ -83,9 +90,7 @@ class DebianLister(ListerHttpTransport, ListerBase):
if response.status_code == 200:
break
else:
raise FetchError(
"Could not retrieve index for %s" % self.area
)
raise FetchError("Could not retrieve index for %s" % self.area)
self.decompressor = decompressors.get(compression)
return response
@@ -99,7 +104,7 @@ class DebianLister(ListerHttpTransport, ListerBase):
# Enable streaming to allow wrapping the response in the decompressor
# in transport_response_simplified.
params = super().request_params(identifier)
params['stream'] = True
params["stream"] = True
return params
def transport_response_simplified(self, response):
@@ -118,22 +123,22 @@ class DebianLister(ListerHttpTransport, ListerBase):
files = defaultdict(dict)
for field in src_pkg._multivalued_fields:
if field.startswith('checksums-'):
sum_name = field[len('checksums-'):]
if field.startswith("checksums-"):
sum_name = field[len("checksums-") :]
else:
sum_name = 'md5sum'
sum_name = "md5sum"
if field in src_pkg:
for entry in src_pkg[field]:
name = entry['name']
files[name]['name'] = entry['name']
files[name]['size'] = int(entry['size'], 10)
name = entry["name"]
files[name]["name"] = entry["name"]
files[name]["size"] = int(entry["size"], 10)
files[name][sum_name] = entry[sum_name]
yield {
'name': src_pkg['Package'],
'version': src_pkg['Version'],
'directory': src_pkg['Directory'],
'files': files,
"name": src_pkg["Package"],
"version": src_pkg["Version"],
"directory": src_pkg["Directory"],
"files": files,
}
def inject_repo_data_into_db(self, models_list):
@@ -149,13 +154,11 @@ class DebianLister(ListerHttpTransport, ListerBase):
area_id = self.area.id
for model in models_list:
name = model['name']
version = model['version']
temp_packages.append({
'area_id': area_id,
'name': name,
'version': version,
})
name = model["name"]
version = model["version"]
temp_packages.append(
{"area_id": area_id, "name": name, "version": version,}
)
by_name_version[name, version] = model
# Add all the listed packages to a temporary table
@@ -172,15 +175,18 @@ class DebianLister(ListerHttpTransport, ListerBase):
)
# Filter out the packages that already exist in the main Package table
new_packages = self.db_session\
.query(TempPackage)\
.options(load_only('name', 'version'))\
.filter(~exists_tmp_pkg(self.db_session, Package))\
.all()
new_packages = (
self.db_session.query(TempPackage)
.options(load_only("name", "version"))
.filter(~exists_tmp_pkg(self.db_session, Package))
.all()
)
self.old_area_packages = self.db_session.query(Package).filter(
exists_tmp_pkg(self.db_session, TempPackage)
).all()
self.old_area_packages = (
self.db_session.query(Package)
.filter(exists_tmp_pkg(self.db_session, TempPackage))
.all()
)
self.db_session.execute(DropTable(TempPackage.__table__))
@@ -188,8 +194,7 @@ class DebianLister(ListerHttpTransport, ListerBase):
for package in new_packages:
model = by_name_version[package.name, package.version]
added_packages.append(Package(area=self.area,
**model))
added_packages.append(Package(area=self.area, **model))
self.db_session.add_all(added_packages)
return added_packages
@@ -210,26 +215,26 @@ class DebianLister(ListerHttpTransport, ListerBase):
"""Run the lister for a given (distribution, area) tuple.
"""
distribution = self.db_session\
.query(Distribution)\
.options(joinedload(Distribution.areas))\
.filter(Distribution.name == self.distribution)\
.one_or_none()
distribution = (
self.db_session.query(Distribution)
.options(joinedload(Distribution.areas))
.filter(Distribution.name == self.distribution)
.one_or_none()
)
if not distribution:
logger.error("Distribution %s is not registered" %
self.distribution)
return {'status': 'failed'}
logger.error("Distribution %s is not registered" % self.distribution)
return {"status": "failed"}
if not distribution.type == 'deb':
logger.error("Distribution %s is not a Debian derivative" %
distribution)
return {'status': 'failed'}
if not distribution.type == "deb":
logger.error("Distribution %s is not a Debian derivative" % distribution)
return {"status": "failed"}
date = self.date
logger.debug('Creating snapshot for distribution %s on date %s' %
(distribution, date))
logger.debug(
"Creating snapshot for distribution %s on date %s" % (distribution, date)
)
snapshot = DistributionSnapshot(date=date, distribution=distribution)
@@ -241,7 +246,7 @@ class DebianLister(ListerHttpTransport, ListerBase):
self.area = area
logger.debug('Processing area %s' % area)
logger.debug("Processing area %s" % area)
_, new_area_packages = self.ingest_data(None)
area_snapshot = AreaSnapshot(snapshot=snapshot, area=area)
@@ -253,4 +258,4 @@ class DebianLister(ListerHttpTransport, ListerBase):
self.db_session.commit()
return {'status': 'eventful'}
return {"status": "eventful"}

View file

@@ -34,78 +34,66 @@ from swh.lister.core.models import SQLBase
class Distribution(SQLBase):
"""A distribution (e.g. Debian, Ubuntu, Fedora, ...)"""
__tablename__ = 'distribution'
__tablename__ = "distribution"
id = Column(Integer, primary_key=True)
name = Column(String, unique=True, nullable=False)
type = Column(Enum('deb', 'rpm', name='distribution_types'),
nullable=False)
type = Column(Enum("deb", "rpm", name="distribution_types"), nullable=False)
mirror_uri = Column(String, nullable=False)
areas = relationship('Area', back_populates='distribution')
areas = relationship("Area", back_populates="distribution")
def origin_for_package(self, package_name: str) -> str:
"""Return the origin url for the given package
"""
return '%s://%s/packages/%s' % (self.type, self.name, package_name)
return "%s://%s/packages/%s" % (self.type, self.name, package_name)
def __repr__(self):
return 'Distribution(%s (%s) on %s)' % (
self.name,
self.type,
self.mirror_uri,
)
return "Distribution(%s (%s) on %s)" % (self.name, self.type, self.mirror_uri,)
class Area(SQLBase):
__tablename__ = 'area'
__table_args__ = (
UniqueConstraint('distribution_id', 'name'),
)
__tablename__ = "area"
__table_args__ = (UniqueConstraint("distribution_id", "name"),)
id = Column(Integer, primary_key=True)
distribution_id = Column(Integer, ForeignKey('distribution.id'),
nullable=False)
distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False)
name = Column(String, nullable=False)
active = Column(Boolean, nullable=False, default=True)
distribution = relationship('Distribution', back_populates='areas')
distribution = relationship("Distribution", back_populates="areas")
def index_uris(self):
"""Get possible URIs for this component's package index"""
if self.distribution.type == 'deb':
compression_exts = ('xz', 'bz2', 'gz', None)
base_uri = '%s/dists/%s/source/Sources' % (
if self.distribution.type == "deb":
compression_exts = ("xz", "bz2", "gz", None)
base_uri = "%s/dists/%s/source/Sources" % (
self.distribution.mirror_uri,
self.name,
)
for ext in compression_exts:
if ext:
yield (base_uri + '.' + ext, ext)
yield (base_uri + "." + ext, ext)
else:
yield (base_uri, None)
else:
raise NotImplementedError(
'Do not know how to build index URI for Distribution type %s' %
self.distribution.type
"Do not know how to build index URI for Distribution type %s"
% self.distribution.type
)
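Concretely, for a "deb" distribution mirrored at http://deb.debian.org/debian with an "unstable/main" area, the generator yields (sketch):

    ("http://deb.debian.org/debian/dists/unstable/main/source/Sources.xz", "xz")
    ("http://deb.debian.org/debian/dists/unstable/main/source/Sources.bz2", "bz2")
    ("http://deb.debian.org/debian/dists/unstable/main/source/Sources.gz", "gz")
    ("http://deb.debian.org/debian/dists/unstable/main/source/Sources", None)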
def __repr__(self):
return 'Area(%s of %s)' % (
self.name,
self.distribution.name,
)
return "Area(%s of %s)" % (self.name, self.distribution.name,)
class Package(SQLBase):
__tablename__ = 'package'
__table_args__ = (
UniqueConstraint('area_id', 'name', 'version'),
)
__tablename__ = "package"
__table_args__ = (UniqueConstraint("area_id", "name", "version"),)
id = Column(Integer, primary_key=True)
area_id = Column(Integer, ForeignKey('area.id'), nullable=False)
area_id = Column(Integer, ForeignKey("area.id"), nullable=False)
name = Column(String, nullable=False)
version = Column(String, nullable=False)
directory = Column(String, nullable=False)
@@ -116,7 +104,7 @@ class Package(SQLBase):
revision_id = Column(LargeBinary(20))
area = relationship('Area')
area = relationship("Area")
@property
def distribution(self):
@@ -125,42 +113,38 @@ class Package(SQLBase):
def fetch_uri(self, filename):
"""Get the URI to fetch the `filename` file associated with the
package"""
if self.distribution.type == 'deb':
return '%s/%s/%s' % (
if self.distribution.type == "deb":
return "%s/%s/%s" % (
self.distribution.mirror_uri,
self.directory,
filename,
)
else:
raise NotImplementedError(
'Do not know how to build fetch URI for Distribution type %s' %
self.distribution.type
"Do not know how to build fetch URI for Distribution type %s"
% self.distribution.type
)
def loader_dict(self):
ret = {
'id': self.id,
'name': self.name,
'version': self.version,
"id": self.id,
"name": self.name,
"version": self.version,
}
if self.revision_id:
ret['revision_id'] = binascii.hexlify(self.revision_id).decode()
ret["revision_id"] = binascii.hexlify(self.revision_id).decode()
else:
files = {
name: checksums.copy()
for name, checksums in self.files.items()
}
files = {name: checksums.copy() for name, checksums in self.files.items()}
for name in files:
files[name]['uri'] = self.fetch_uri(name)
files[name]["uri"] = self.fetch_uri(name)
ret.update({
'revision_id': None,
'files': files,
})
ret.update(
{"revision_id": None, "files": files,}
)
return ret
def __repr__(self):
return 'Package(%s_%s of %s %s)' % (
return "Package(%s_%s of %s %s)" % (
self.name,
self.version,
self.distribution.name,
@@ -169,37 +153,36 @@ class Package(SQLBase):
class DistributionSnapshot(SQLBase):
__tablename__ = 'distribution_snapshot'
__tablename__ = "distribution_snapshot"
id = Column(Integer, primary_key=True)
date = Column(DateTime, nullable=False, index=True)
distribution_id = Column(Integer,
ForeignKey('distribution.id'),
nullable=False)
distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False)
distribution = relationship('Distribution')
areas = relationship('AreaSnapshot', back_populates='snapshot')
distribution = relationship("Distribution")
areas = relationship("AreaSnapshot", back_populates="snapshot")
def task_for_package(self, package_name: str,
package_versions: Mapping) -> Mapping[str, Any]:
def task_for_package(
self, package_name: str, package_versions: Mapping
) -> Mapping[str, Any]:
"""Return the task dictionary for the given list of package versions
"""
origin_url = self.distribution.origin_for_package(package_name)
return {
'policy': 'oneshot',
'type': 'load-%s-package' % self.distribution.type,
'next_run': datetime.datetime.now(tz=datetime.timezone.utc),
'arguments': {
'args': [],
'kwargs': {
'url': origin_url,
'date': self.date.isoformat(),
'packages': package_versions,
"policy": "oneshot",
"type": "load-%s-package" % self.distribution.type,
"next_run": datetime.datetime.now(tz=datetime.timezone.utc),
"arguments": {
"args": [],
"kwargs": {
"url": origin_url,
"date": self.date.isoformat(),
"packages": package_versions,
},
},
'retries_left': 3,
"retries_left": 3,
}
def get_packages(self):
@@ -207,41 +190,38 @@ class DistributionSnapshot(SQLBase):
for area_snapshot in self.areas:
area_name = area_snapshot.area.name
for package in area_snapshot.packages:
ref_name = '%s/%s' % (area_name, package.version)
ref_name = "%s/%s" % (area_name, package.version)
packages[package.name][ref_name] = package.loader_dict()
return packages
area_snapshot_package_assoc = Table(
'area_snapshot_package', SQLBase.metadata,
Column('area_snapshot_id', Integer, ForeignKey('area_snapshot.id'),
nullable=False),
Column('package_id', Integer, ForeignKey('package.id'),
nullable=False),
"area_snapshot_package",
SQLBase.metadata,
Column("area_snapshot_id", Integer, ForeignKey("area_snapshot.id"), nullable=False),
Column("package_id", Integer, ForeignKey("package.id"), nullable=False),
)
class AreaSnapshot(SQLBase):
__tablename__ = 'area_snapshot'
__tablename__ = "area_snapshot"
id = Column(Integer, primary_key=True)
snapshot_id = Column(Integer,
ForeignKey('distribution_snapshot.id'),
nullable=False)
area_id = Column(Integer,
ForeignKey('area.id'),
nullable=False)
snapshot_id = Column(
Integer, ForeignKey("distribution_snapshot.id"), nullable=False
)
area_id = Column(Integer, ForeignKey("area.id"), nullable=False)
snapshot = relationship('DistributionSnapshot', back_populates='areas')
area = relationship('Area')
packages = relationship('Package', secondary=area_snapshot_package_assoc)
snapshot = relationship("DistributionSnapshot", back_populates="areas")
area = relationship("Area")
packages = relationship("Package", secondary=area_snapshot_package_assoc)
class TempPackage(SQLBase):
__tablename__ = 'temp_package'
__tablename__ = "temp_package"
__table_args__ = {
'prefixes': ['TEMPORARY'],
"prefixes": ["TEMPORARY"],
}
id = Column(Integer, primary_key=True)

View file

@@ -7,12 +7,12 @@ from celery import shared_task
from .lister import DebianLister
@shared_task(name=__name__ + '.DebianListerTask')
@shared_task(name=__name__ + ".DebianListerTask")
def list_debian_distribution(distribution, **lister_args):
'''List a Debian distribution'''
"""List a Debian distribution"""
return DebianLister(distribution=distribution, **lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@@ -16,20 +16,20 @@ from swh.lister.debian import debian_init
@pytest.fixture
def lister_debian(swh_listers):
lister = swh_listers['debian']
lister = swh_listers["debian"]
# Initialize the debian data model
debian_init(
lister.db_engine, suites=['stretch'], components=['main', 'contrib']
)
debian_init(lister.db_engine, suites=["stretch"], components=["main", "contrib"])
# Add the load-deb-package in the scheduler backend
lister.scheduler.create_task_type({
'type': 'load-deb-package',
'description': 'Load a Debian package',
'backend_name': 'swh.loader.debian.tasks.LoaderDebianPackage',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-deb-package",
"description": "Load a Debian package",
"backend_name": "swh.loader.debian.tasks.LoaderDebianPackage",
"default_interval": "1 day",
}
)
return lister
@@ -40,12 +40,10 @@ def sqlalchemy_engine(postgresql_proc):
pg_port = postgresql_proc.port
pg_user = postgresql_proc.user
pg_db = 'sqlalchemy-tests'
pg_db = "sqlalchemy-tests"
url = f'postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_db}'
with DatabaseJanitor(
pg_user, pg_host, pg_port, pg_db, postgresql_proc.version
):
url = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_db}"
with DatabaseJanitor(pg_user, pg_host, pg_port, pg_db, postgresql_proc.version):
engine = create_engine(url)
yield engine
engine.dispose()

View file

@@ -17,29 +17,37 @@ def engine(session):
def test_debian_init_step(engine, session):
distribution_name = 'KaliLinux'
distribution_name = "KaliLinux"
distrib = session.query(Distribution) \
.filter(Distribution.name == distribution_name) \
distrib = (
session.query(Distribution)
.filter(Distribution.name == distribution_name)
.one_or_none()
)
assert distrib is None
all_area = session.query(Area).all()
assert all_area == []
suites = ['wheezy', 'jessie']
components = ['main', 'contrib']
suites = ["wheezy", "jessie"]
components = ["main", "contrib"]
debian_init(engine, distribution_name=distribution_name,
suites=suites, components=components)
distrib = session.query(Distribution) \
.filter(Distribution.name == distribution_name) \
debian_init(
engine,
distribution_name=distribution_name,
suites=suites,
components=components,
)
distrib = (
session.query(Distribution)
.filter(Distribution.name == distribution_name)
.one_or_none()
)
assert distrib is not None
assert distrib.name == distribution_name
assert distrib.type == 'deb'
assert distrib.mirror_uri == 'http://deb.debian.org/debian/'
assert distrib.type == "deb"
assert distrib.mirror_uri == "http://deb.debian.org/debian/"
all_area = session.query(Area).all()
assert len(all_area) == 2 * 2, "2 suites * 2 components per suite"
@@ -47,7 +55,7 @@ def test_debian_init_step(engine, session):
expected_area_names = []
for suite in suites:
for component in components:
expected_area_names.append(f'{suite}/{component}')
expected_area_names.append(f"{suite}/{component}")
for area in all_area:
area.id = None
@@ -56,12 +64,16 @@ def test_debian_init_step(engine, session):
# check idempotency (on exact same call)
debian_init(engine, distribution_name=distribution_name,
suites=suites, components=components)
debian_init(
engine,
distribution_name=distribution_name,
suites=suites,
components=components,
)
distribs = session.query(Distribution) \
.filter(Distribution.name == distribution_name) \
.all()
distribs = (
session.query(Distribution).filter(Distribution.name == distribution_name).all()
)
assert len(distribs) == 1
distrib = distribs[0]
@@ -70,8 +82,12 @@ def test_debian_init_step(engine, session):
assert len(all_area) == 2 * 2, "2 suites * 2 components per suite"
# Add a new suite
debian_init(engine, distribution_name=distribution_name,
suites=['lenny'], components=components)
debian_init(
engine,
distribution_name=distribution_name,
suites=["lenny"],
components=components,
)
all_area = [a.name for a in session.query(Area).all()]
assert len(all_area) == (2 + 1) * 2, "3 suites * 2 components per suite"

View file

@@ -16,21 +16,21 @@ def test_lister_debian(lister_debian, datadir, requests_mock_datadir):
# Run the lister
lister_debian.run()
r = lister_debian.scheduler.search_tasks(task_type='load-deb-package')
r = lister_debian.scheduler.search_tasks(task_type="load-deb-package")
assert len(r) == 151
for row in r:
assert row['type'] == 'load-deb-package'
assert row["type"] == "load-deb-package"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
assert set(kwargs.keys()) == {'url', 'date', 'packages'}
kwargs = row["arguments"]["kwargs"]
assert set(kwargs.keys()) == {"url", "date", "packages"}
logger.debug('kwargs: %s', kwargs)
assert isinstance(kwargs['url'], str)
logger.debug("kwargs: %s", kwargs)
assert isinstance(kwargs["url"], str)
assert row['policy'] == 'oneshot'
assert row['priority'] is None
assert row["policy"] == "oneshot"
assert row["priority"] is None

View file

@@ -10,13 +10,9 @@ from swh.lister.debian.models import Distribution, Area
def test_area_index_uris_deb(session):
d = Distribution(
name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian'
)
a = Area(
distribution=d,
name='unstable/main',
active=True,
name="Debian", type="deb", mirror_uri="http://deb.debian.org/debian"
)
a = Area(distribution=d, name="unstable/main", active=True,)
session.add_all([d, a])
session.commit()
@@ -26,14 +22,9 @@ def test_area_index_uris_rpm(session):
def test_area_index_uris_rpm(session):
d = Distribution(
name='CentOS', type='rpm',
mirror_uri='http://centos.mirrors.proxad.net/'
)
a = Area(
distribution=d,
name='8',
active=True,
name="CentOS", type="rpm", mirror_uri="http://centos.mirrors.proxad.net/"
)
a = Area(distribution=d, name="8", active=True,)
session.add_all([d, a])
session.commit()

View file

@@ -7,25 +7,23 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.debian.tasks.ping')
res = swh_app.send_task("swh.lister.debian.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.debian.tasks.DebianLister')
@patch("swh.lister.debian.tasks.DebianLister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked DebianLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.debian.tasks.DebianListerTask', ('stretch',))
res = swh_app.send_task("swh.lister.debian.tasks.DebianListerTask", ("stretch",))
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with(distribution='stretch')
lister.assert_called_once_with(distribution="stretch")
lister.run.assert_called_once_with()

View file

@@ -11,19 +11,18 @@ from swh.lister.debian.lister import DebianLister
@click.group()
@click.option('--verbose/--no-verbose', default=False)
@click.option("--verbose/--no-verbose", default=False)
@click.pass_context
def cli(ctx, verbose):
ctx.obj['lister'] = DebianLister()
ctx.obj["lister"] = DebianLister()
if verbose:
loglevel = logging.DEBUG
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
else:
loglevel = logging.INFO
logging.basicConfig(
format='%(asctime)s %(process)d %(levelname)s %(message)s',
level=loglevel,
format="%(asctime)s %(process)d %(levelname)s %(message)s", level=loglevel,
)
@@ -31,23 +30,24 @@ def cli(ctx, verbose):
@click.pass_context
def create_schema(ctx):
"""Create the schema from the models"""
SQLBase.metadata.create_all(ctx.obj['lister'].db_engine)
SQLBase.metadata.create_all(ctx.obj["lister"].db_engine)
@cli.command()
@click.option('--name', help='The name of the distribution')
@click.option('--type', help='The type of distribution')
@click.option('--mirror-uri', help='The URL to the mirror of the distribution')
@click.option('--area', help='The areas for the distribution',
multiple=True)
@click.option("--name", help="The name of the distribution")
@click.option("--type", help="The type of distribution")
@click.option("--mirror-uri", help="The URL to the mirror of the distribution")
@click.option("--area", help="The areas for the distribution", multiple=True)
@click.pass_context
def create_distribution(ctx, name, type, mirror_uri, area):
to_add = []
db_session = ctx.obj['lister'].db_session
d = db_session.query(Distribution)\
.filter(Distribution.name == name)\
.filter(Distribution.type == type)\
.one_or_none()
db_session = ctx.obj["lister"].db_session
d = (
db_session.query(Distribution)
.filter(Distribution.name == name)
.filter(Distribution.type == type)
.one_or_none()
)
if not d:
d = Distribution(name=name, type=type, mirror_uri=mirror_uri)
@@ -56,10 +56,12 @@ def create_distribution(ctx, name, type, mirror_uri, area):
for area_name in area:
a = None
if d.id:
a = db_session.query(Area)\
.filter(Area.distribution == d)\
.filter(Area.name == area_name)\
.one_or_none()
a = (
db_session.query(Area)
.filter(Area.distribution == d)
.filter(Area.name == area_name)
.one_or_none()
)
if not a:
a = Area(name=area_name, distribution=d)
@@ -70,12 +72,12 @@ def create_distribution(ctx, name, type, mirror_uri, area):
@cli.command()
@click.option('--name', help='The name of the distribution')
@click.option("--name", help="The name of the distribution")
@click.pass_context
def list_distribution(ctx, name):
"""List the distribution"""
ctx.obj['lister'].run(name)
ctx.obj["lister"].run(name)
if __name__ == '__main__':
if __name__ == "__main__":
cli(obj={})

View file

@@ -7,7 +7,8 @@ def register():
from .models import GitHubModel
from .lister import GitHubLister
return {'models': [GitHubModel],
'lister': GitHubLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [GitHubModel],
"lister": GitHubLister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@@ -14,60 +14,57 @@ from requests import Response
class GitHubLister(IndexingHttpLister):
PATH_TEMPLATE = '/repositories?since=%d'
PATH_TEMPLATE = "/repositories?since=%d"
MODEL = GitHubModel
DEFAULT_URL = 'https://api.github.com'
API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')
LISTER_NAME = 'github'
instance = 'github' # There is only 1 instance of such lister
DEFAULT_URL = "https://api.github.com"
API_URL_INDEX_RE = re.compile(r"^.*/repositories\?since=(\d+)")
LISTER_NAME = "github"
instance = "github" # There is only 1 instance of such lister
default_min_bound = 0 # type: Any
def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
return {
'uid': repo['id'],
'indexable': repo['id'],
'name': repo['name'],
'full_name': repo['full_name'],
'html_url': repo['html_url'],
'origin_url': repo['html_url'],
'origin_type': 'git',
'fork': repo['fork'],
"uid": repo["id"],
"indexable": repo["id"],
"name": repo["name"],
"full_name": repo["full_name"],
"html_url": repo["html_url"],
"origin_url": repo["html_url"],
"origin_type": "git",
"fork": repo["fork"],
}
def transport_quota_check(self, response: Response) -> Tuple[bool, int]:
x_rate_limit_remaining = response.headers.get('X-RateLimit-Remaining')
x_rate_limit_remaining = response.headers.get("X-RateLimit-Remaining")
if not x_rate_limit_remaining:
return False, 0
reqs_remaining = int(x_rate_limit_remaining)
if response.status_code == 403 and reqs_remaining == 0:
delay = int(response.headers['Retry-After'])
delay = int(response.headers["Retry-After"])
return True, delay
return False, 0
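Worked through with illustrative header values (not from the diff):

    # 403, X-RateLimit-Remaining: 0, Retry-After: 60  ->  (True, 60): sleep 60s
    # 200, X-RateLimit-Remaining: 42                  ->  (False, 0): keep listing
    # X-RateLimit-Remaining header absent             ->  (False, 0)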
def get_next_target_from_response(self,
response: Response) -> Optional[int]:
if 'next' in response.links:
next_url = response.links['next']['url']
return int(
self.API_URL_INDEX_RE.match(next_url).group(1)) # type: ignore
def get_next_target_from_response(self, response: Response) -> Optional[int]:
if "next" in response.links:
next_url = response.links["next"]["url"]
return int(self.API_URL_INDEX_RE.match(next_url).group(1)) # type: ignore
return None
def transport_response_simplified(self, response: Response
) -> List[Dict[str, Any]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]:
repos = response.json()
return [self.get_model_from_repo(repo)
for repo in repos if repo and 'id' in repo]
return [
self.get_model_from_repo(repo) for repo in repos if repo and "id" in repo
]
def request_headers(self) -> Dict[str, Any]:
"""(Override) Set requests headers to send when querying the GitHub API
"""
headers = super().request_headers()
headers['Accept'] = 'application/vnd.github.v3+json'
headers["Accept"] = "application/vnd.github.v3+json"
return headers
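So each API request carries, roughly (sketch; the User-Agent part is what the core tests above assert on):

    # Accept: application/vnd.github.v3+json
    # User-Agent: ... Software Heritage Lister ... <swh.lister.__version__> ...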
def disable_deleted_repo_tasks(self, index: int,
next_index: int, keep_these: int):
def disable_deleted_repo_tasks(self, index: int, next_index: int, keep_these: int):
""" (Overrides) Fix provided index value to avoid erroneously disabling
some scheduler tasks
"""
@@ -75,5 +72,4 @@ class GitHubLister(IndexingHttpLister):
# parameter, so increment the index to avoid disabling the latest
# created task when processing a new repositories page returned by
# the Github API
return super().disable_deleted_repo_tasks(index + 1, next_index,
keep_these)
return super().disable_deleted_repo_tasks(index + 1, next_index, keep_these)

View file

@@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase
class GitHubModel(IndexingModelBase):
"""a GitHub repository"""
__tablename__ = 'github_repo'
__tablename__ = "github_repo"
uid = Column(Integer, primary_key=True)
indexable = Column(Integer, index=True)

View file

@@ -11,20 +11,20 @@ from swh.lister.github.lister import GitHubLister
GROUP_SPLIT = 10000
@shared_task(name=__name__ + '.IncrementalGitHubLister')
@shared_task(name=__name__ + ".IncrementalGitHubLister")
def list_github_incremental(**lister_args):
'Incremental update of GitHub'
"Incremental update of GitHub"
lister = GitHubLister(**lister_args)
return lister.run(min_bound=lister.db_last_index(), max_bound=None)
@shared_task(name=__name__ + '.RangeGitHubLister')
@shared_task(name=__name__ + ".RangeGitHubLister")
def _range_github_lister(start, end, **lister_args):
lister = GitHubLister(**lister_args)
return lister.run(min_bound=start, max_bound=end)
@shared_task(name=__name__ + '.FullGitHubRelister', bind=True)
@shared_task(name=__name__ + ".FullGitHubRelister", bind=True)
def list_github_full(self, split=None, **lister_args):
"""Full update of GitHub
@@ -34,20 +34,21 @@ def list_github_full(self, split=None, **lister_args):
lister = GitHubLister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')
self.log.info("Nothing to list")
return
random.shuffle(ranges)
promise = group(_range_github_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()
self.log.debug('%s OK (spawned %s subtasks)' % (self.name, len(ranges)))
promise = group(
_range_github_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges
)()
self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(ranges)))
try:
promise.save() # so that we can restore the GroupResult in tests
except (NotImplementedError, AttributeError):
self.log.info('Unable to call save_group with current result backend.')
self.log.info("Unable to call save_group with current result backend.")
# FIXME: what to do in terms of return here?
return promise.id
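Sketched with the partition sizes used in the tests below: if `db_partition_indices` returns [(0, 9), (10, 19), ..., (40, 49)], the group spawns one `_range_github_lister.s(minv, maxv)` per pair, each ending up as `run(min_bound=minv, max_bound=maxv)` with an inclusive `max_bound`; the shuffle keeps concurrent workers from hammering adjacent index ranges.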
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@@ -14,65 +14,70 @@ from swh.lister.github.lister import GitHubLister
class GitHubListerTester(HttpListerTester, unittest.TestCase):
Lister = GitHubLister
test_re = re.compile(r'/repositories\?since=([^?&]+)')
lister_subdir = 'github'
good_api_response_file = 'data/https_api.github.com/first_response.json'
bad_api_response_file = 'data/https_api.github.com/empty_response.json'
test_re = re.compile(r"/repositories\?since=([^?&]+)")
lister_subdir = "github"
good_api_response_file = "data/https_api.github.com/first_response.json"
bad_api_response_file = "data/https_api.github.com/empty_response.json"
first_index = 0
last_index = 369
entries_per_page = 100
convert_type = int
def response_headers(self, request):
headers = {'X-RateLimit-Remaining': '1'}
headers = {"X-RateLimit-Remaining": "1"}
if self.request_index(request) == self.first_index:
headers.update({
'Link': '<https://api.github.com/repositories?since=%s>;'
' rel="next",'
'<https://api.github.com/repositories{?since}>;'
' rel="first"' % self.last_index
})
headers.update(
{
"Link": "<https://api.github.com/repositories?since=%s>;"
' rel="next",'
"<https://api.github.com/repositories{?since}>;"
' rel="first"' % self.last_index
}
)
else:
headers.update({
'Link': '<https://api.github.com/repositories{?since}>;'
' rel="first"'
})
headers.update(
{
"Link": "<https://api.github.com/repositories{?since}>;"
' rel="first"'
}
)
return headers
def mock_rate_quota(self, n, request, context):
self.rate_limit += 1
context.status_code = 403
context.headers['X-RateLimit-Remaining'] = '0'
context.headers['Retry-After'] = '1' # 1 second
context.headers["X-RateLimit-Remaining"] = "0"
context.headers["Retry-After"] = "1" # 1 second
return '{"error":"dummy"}'
@requests_mock.Mocker()
def test_scheduled_tasks(self, http_mocker):
self.scheduled_tasks_test(
'data/https_api.github.com/next_response.json', 876, http_mocker)
"data/https_api.github.com/next_response.json", 876, http_mocker
)
def test_lister_github(swh_listers, requests_mock_datadir):
"""Simple github listing should create scheduled tasks
"""
lister = swh_listers['github']
lister = swh_listers["github"]
lister.run()
r = lister.scheduler.search_tasks(task_type='load-git')
r = lister.scheduler.search_tasks(task_type="load-git")
assert len(r) == 100
for row in r:
assert row['type'] == 'load-git'
assert row["type"] == "load-git"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
url = kwargs['url']
assert url.startswith('https://github.com')
kwargs = row["arguments"]["kwargs"]
url = kwargs["url"]
assert url.startswith("https://github.com")
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None


@ -5,23 +5,21 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.github.tasks.ping')
res = swh_app.send_task("swh.lister.github.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.github.tasks.GitHubLister')
@patch("swh.lister.github.tasks.GitHubLister")
def test_incremental(lister, swh_app, celery_session_worker):
# setup the mocked GitHubLister
lister.return_value = lister
lister.db_last_index.return_value = 42
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.github.tasks.IncrementalGitHubLister')
res = swh_app.send_task("swh.lister.github.tasks.IncrementalGitHubLister")
assert res
res.wait()
assert res.successful()
@ -31,15 +29,15 @@ def test_incremental(lister, swh_app, celery_session_worker):
lister.run.assert_called_once_with(min_bound=42, max_bound=None)
@patch('swh.lister.github.tasks.GitHubLister')
@patch("swh.lister.github.tasks.GitHubLister")
def test_range(lister, swh_app, celery_session_worker):
# setup the mocked GitHubLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.github.tasks.RangeGitHubLister',
kwargs=dict(start=12, end=42))
"swh.lister.github.tasks.RangeGitHubLister", kwargs=dict(start=12, end=42)
)
assert res
res.wait()
assert res.successful()
@ -49,16 +47,14 @@ def test_range(lister, swh_app, celery_session_worker):
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@patch('swh.lister.github.tasks.GitHubLister')
@patch("swh.lister.github.tasks.GitHubLister")
def test_relister(lister, swh_app, celery_session_worker):
# setup the mocked GitHubLister
lister.return_value = lister
lister.run.return_value = None
lister.db_partition_indices.return_value = [
(i, i+9) for i in range(0, 50, 10)]
lister.db_partition_indices.return_value = [(i, i + 9) for i in range(0, 50, 10)]
res = swh_app.send_task(
'swh.lister.github.tasks.FullGitHubRelister')
res = swh_app.send_task("swh.lister.github.tasks.FullGitHubRelister")
assert res
res.wait()
@ -86,5 +82,6 @@ def test_relister(lister, swh_app, celery_session_worker):
# lister.run should have been called once per partition interval
for i in range(5):
# XXX inconsistent behavior: max_bound is INCLUDED here
assert (dict(min_bound=10*i, max_bound=10*i + 9),) \
in lister.run.call_args_list
assert (
dict(min_bound=10 * i, max_bound=10 * i + 9),
) in lister.run.call_args_list
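The XXX above is worth spelling out: with db_partition_indices mocked to tens, the GitHub relister calls run with inclusive upper bounds, while the GitLab relister exercised further down derives its ranges from utils.split_range and passes exclusive ones (plus a short tail). Read straight from the two tests:

# GitHub  (max_bound INCLUDED):  (0, 9), (10, 19), ..., (40, 49)
# GitLab  (max_bound EXCLUDED):  (0, 10), (10, 20), ..., (70, 80), (80, 85)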


@ -7,7 +7,8 @@ def register():
from .models import GitLabModel
from .lister import GitLabLister
return {'models': [GitLabModel],
'lister': GitLabLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [GitLabModel],
"lister": GitLabLister,
"task_modules": ["%s.tasks" % __name__],
}


@ -15,77 +15,83 @@ from requests import Response
class GitLabLister(PageByPageHttpLister):
# Template path expecting an integer that represents the page id
PATH_TEMPLATE = '/projects?page=%d&order_by=id'
DEFAULT_URL = 'https://gitlab.com/api/v4/'
PATH_TEMPLATE = "/projects?page=%d&order_by=id"
DEFAULT_URL = "https://gitlab.com/api/v4/"
MODEL = GitLabModel
LISTER_NAME = 'gitlab'
LISTER_NAME = "gitlab"
def __init__(self, url=None, instance=None,
override_config=None, sort='asc', per_page=20):
def __init__(
self, url=None, instance=None, override_config=None, sort="asc", per_page=20
):
super().__init__(url=url, override_config=override_config)
if instance is None:
instance = parse_url(self.url).host
self.instance = instance
self.PATH_TEMPLATE = '%s&sort=%s&per_page=%s' % (
self.PATH_TEMPLATE, sort, per_page)
self.PATH_TEMPLATE = "%s&sort=%s&per_page=%s" % (
self.PATH_TEMPLATE,
sort,
per_page,
)
def uid(self, repo: Dict[str, Any]) -> str:
return '%s/%s' % (self.instance, repo['path_with_namespace'])
return "%s/%s" % (self.instance, repo["path_with_namespace"])
def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
return {
'instance': self.instance,
'uid': self.uid(repo),
'name': repo['name'],
'full_name': repo['path_with_namespace'],
'html_url': repo['web_url'],
'origin_url': repo['http_url_to_repo'],
'origin_type': 'git',
"instance": self.instance,
"uid": self.uid(repo),
"name": repo["name"],
"full_name": repo["path_with_namespace"],
"html_url": repo["web_url"],
"origin_url": repo["http_url_to_repo"],
"origin_type": "git",
}
def transport_quota_check(self, response: Response
) -> Tuple[bool, Union[int, float]]:
def transport_quota_check(
self, response: Response
) -> Tuple[bool, Union[int, float]]:
"""Deal with rate limit if any.
"""
# not all gitlab instances have a rate limit
if 'RateLimit-Remaining' in response.headers:
reqs_remaining = int(response.headers['RateLimit-Remaining'])
if "RateLimit-Remaining" in response.headers:
reqs_remaining = int(response.headers["RateLimit-Remaining"])
if response.status_code == 403 and reqs_remaining == 0:
reset_at = int(response.headers['RateLimit-Reset'])
reset_at = int(response.headers["RateLimit-Reset"])
delay = min(reset_at - time.time(), 3600)
return True, delay
return False, 0
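The method above is GitLab's flavour of the usual backoff contract: return (must_wait, delay), waiting until the epoch given in RateLimit-Reset but never more than an hour. As a hedged, stand-alone rendering of the same decision (header names as in the code; this is not the actual swh.lister transport API):

import time

def rate_limit_delay(status_code, headers, cap=3600):
    """Return (must_wait, seconds) from GitLab-style rate-limit headers."""
    if "RateLimit-Remaining" in headers:
        if status_code == 403 and int(headers["RateLimit-Remaining"]) == 0:
            return True, min(int(headers["RateLimit-Reset"]) - time.time(), cap)
    return False, 0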
def _get_int(self, headers: MutableMapping[str, Any],
key: str) -> Optional[int]:
def _get_int(self, headers: MutableMapping[str, Any], key: str) -> Optional[int]:
_val = headers.get(key)
if _val:
return int(_val)
return None
def get_next_target_from_response(
self, response: Response) -> Optional[int]:
def get_next_target_from_response(self, response: Response) -> Optional[int]:
"""Determine the next page identifier.
"""
return self._get_int(response.headers, 'x-next-page')
return self._get_int(response.headers, "x-next-page")
def get_pages_information(self) -> Tuple[Optional[int],
Optional[int], Optional[int]]:
def get_pages_information(
self,
) -> Tuple[Optional[int], Optional[int], Optional[int]]:
"""Determine pages information.
"""
response = self.transport_head(identifier=1) # type: ignore
if not response.ok:
raise ValueError(
'Problem during information fetch: %s' % response.status_code)
"Problem during information fetch: %s" % response.status_code
)
h = response.headers
return (self._get_int(h, 'x-total'),
self._get_int(h, 'x-total-pages'),
self._get_int(h, 'x-per-page'))
return (
self._get_int(h, "x-total"),
self._get_int(h, "x-total-pages"),
self._get_int(h, "x-per-page"),
)
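Since GitLab reports listing totals in response headers rather than in the body, a single HEAD request on the first page is enough to size a full run. A sketch against the public API (large instances such as gitlab.com may omit x-total, which is why the return types above are Optional):

import requests

resp = requests.head("https://gitlab.com/api/v4/projects?page=1&order_by=id&per_page=20")
if resp.ok:
    total_pages = resp.headers.get("x-total-pages")  # a str, or None if omitted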
def transport_response_simplified(self, response: Response
) -> List[Dict[str, Any]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]:
repos = response.json()
return [self.get_model_from_repo(repo) for repo in repos]


@ -11,7 +11,8 @@ class GitLabModel(ModelBase):
"""a Gitlab repository from a gitlab instance
"""
__tablename__ = 'gitlab_repo'
__tablename__ = "gitlab_repo"
uid = Column(String, primary_key=True)
instance = Column(String, index=True)


@ -13,40 +13,41 @@ from .lister import GitLabLister
NBPAGES = 10
@shared_task(name=__name__ + '.IncrementalGitLabLister')
@shared_task(name=__name__ + ".IncrementalGitLabLister")
def list_gitlab_incremental(**lister_args):
"""Incremental update of a GitLab instance"""
lister_args['sort'] = 'desc'
lister_args["sort"] = "desc"
lister = GitLabLister(**lister_args)
total_pages = lister.get_pages_information()[1]
# stopping as soon as existing origins for that instance are detected
return lister.run(min_bound=1, max_bound=total_pages, check_existence=True)
@shared_task(name=__name__ + '.RangeGitLabLister')
@shared_task(name=__name__ + ".RangeGitLabLister")
def _range_gitlab_lister(start, end, **lister_args):
lister = GitLabLister(**lister_args)
return lister.run(min_bound=start, max_bound=end)
@shared_task(name=__name__ + '.FullGitLabRelister', bind=True)
@shared_task(name=__name__ + ".FullGitLabRelister", bind=True)
def list_gitlab_full(self, **lister_args):
"""Full update of a GitLab instance"""
lister = GitLabLister(**lister_args)
_, total_pages, _ = lister.get_pages_information()
ranges = list(utils.split_range(total_pages, NBPAGES))
random.shuffle(ranges)
promise = group(_range_gitlab_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()
self.log.debug('%s OK (spawned %s subtasks)' % (self.name, len(ranges)))
promise = group(
_range_gitlab_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges
)()
self.log.debug("%s OK (spawned %s subtasks)" % (self.name, len(ranges)))
try:
promise.save()
except (NotImplementedError, AttributeError):
self.log.info('Unable to call save_group with current result backend.')
self.log.info("Unable to call save_group with current result backend.")
# FIXME: what to do in terms of return here?
return promise.id
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@ -17,50 +17,50 @@ logger = logging.getLogger(__name__)
class GitLabListerTester(HttpListerTesterBase, unittest.TestCase):
Lister = GitLabLister
test_re = re.compile(r'^.*/projects.*page=(\d+).*')
lister_subdir = 'gitlab'
good_api_response_file = 'data/gitlab.com/api_response.json'
bad_api_response_file = 'data/gitlab.com/api_empty_response.json'
test_re = re.compile(r"^.*/projects.*page=(\d+).*")
lister_subdir = "gitlab"
good_api_response_file = "data/gitlab.com/api_response.json"
bad_api_response_file = "data/gitlab.com/api_empty_response.json"
first_index = 1
entries_per_page = 10
convert_type = int
def response_headers(self, request):
headers = {'RateLimit-Remaining': '1'}
headers = {"RateLimit-Remaining": "1"}
if self.request_index(request) == self.first_index:
headers.update({
'x-next-page': '3',
})
headers.update(
{"x-next-page": "3",}
)
return headers
def mock_rate_quota(self, n, request, context):
self.rate_limit += 1
context.status_code = 403
context.headers['RateLimit-Remaining'] = '0'
context.headers["RateLimit-Remaining"] = "0"
one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp())
context.headers['RateLimit-Reset'] = str(one_second)
context.headers["RateLimit-Reset"] = str(one_second)
return '{"error":"dummy"}'
def test_lister_gitlab(swh_listers, requests_mock_datadir):
lister = swh_listers['gitlab']
lister = swh_listers["gitlab"]
lister.run()
r = lister.scheduler.search_tasks(task_type='load-git')
r = lister.scheduler.search_tasks(task_type="load-git")
assert len(r) == 10
for row in r:
assert row['type'] == 'load-git'
assert row["type"] == "load-git"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
url = kwargs['url']
assert url.startswith('https://gitlab.com')
kwargs = row["arguments"]["kwargs"]
url = kwargs["url"]
assert url.startswith("https://gitlab.com")
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None


@ -5,43 +5,40 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.gitlab.tasks.ping')
res = swh_app.send_task("swh.lister.gitlab.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.gitlab.tasks.GitLabLister')
@patch("swh.lister.gitlab.tasks.GitLabLister")
def test_incremental(lister, swh_app, celery_session_worker):
# setup the mocked GitlabLister
lister.return_value = lister
lister.run.return_value = None
lister.get_pages_information.return_value = (None, 10, None)
res = swh_app.send_task(
'swh.lister.gitlab.tasks.IncrementalGitLabLister')
res = swh_app.send_task("swh.lister.gitlab.tasks.IncrementalGitLabLister")
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with(sort='desc')
lister.assert_called_once_with(sort="desc")
lister.db_last_index.assert_not_called()
lister.get_pages_information.assert_called_once_with()
lister.run.assert_called_once_with(
min_bound=1, max_bound=10, check_existence=True)
lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True)
@patch('swh.lister.gitlab.tasks.GitLabLister')
@patch("swh.lister.gitlab.tasks.GitLabLister")
def test_range(lister, swh_app, celery_session_worker):
# setup the mocked GitlabLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.gitlab.tasks.RangeGitLabLister',
kwargs=dict(start=12, end=42))
"swh.lister.gitlab.tasks.RangeGitLabLister", kwargs=dict(start=12, end=42)
)
assert res
res.wait()
assert res.successful()
@ -51,17 +48,17 @@ def test_range(lister, swh_app, celery_session_worker):
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@patch('swh.lister.gitlab.tasks.GitLabLister')
@patch("swh.lister.gitlab.tasks.GitLabLister")
def test_relister(lister, swh_app, celery_session_worker):
# setup the mocked GitlabLister
lister.return_value = lister
lister.run.return_value = None
lister.get_pages_information.return_value = (None, 85, None)
lister.db_partition_indices.return_value = [
(i, i+9) for i in range(0, 80, 10)] + [(80, 85)]
(i, i + 9) for i in range(0, 80, 10)
] + [(80, 85)]
res = swh_app.send_task(
'swh.lister.gitlab.tasks.FullGitLabRelister')
res = swh_app.send_task("swh.lister.gitlab.tasks.FullGitLabRelister")
assert res
res.wait()
@ -90,24 +87,26 @@ def test_relister(lister, swh_app, celery_session_worker):
# lister.run should have been called once per partition interval
for i in range(8):
# XXX inconsistent behavior: max_bound is EXCLUDED here
assert (dict(min_bound=10*i, max_bound=10*i + 10),) \
in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) \
in lister.run.call_args_list
assert (
dict(min_bound=10 * i, max_bound=10 * i + 10),
) in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list
@patch('swh.lister.gitlab.tasks.GitLabLister')
@patch("swh.lister.gitlab.tasks.GitLabLister")
def test_relister_instance(lister, swh_app, celery_session_worker):
# setup the mocked GitlabLister
lister.return_value = lister
lister.run.return_value = None
lister.get_pages_information.return_value = (None, 85, None)
lister.db_partition_indices.return_value = [
(i, i+9) for i in range(0, 80, 10)] + [(80, 85)]
(i, i + 9) for i in range(0, 80, 10)
] + [(80, 85)]
res = swh_app.send_task(
'swh.lister.gitlab.tasks.FullGitLabRelister',
kwargs=dict(url='https://0xacab.org/api/v4'))
"swh.lister.gitlab.tasks.FullGitLabRelister",
kwargs=dict(url="https://0xacab.org/api/v4"),
)
assert res
res.wait()
@ -123,7 +122,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker):
break
sleep(1)
lister.assert_called_with(url='https://0xacab.org/api/v4')
lister.assert_called_with(url="https://0xacab.org/api/v4")
# one by the FullGitlabRelister task
# + 9 for the RangeGitlabLister subtasks
@ -136,7 +135,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker):
# lister.run should have been called once per partition interval
for i in range(8):
# XXX inconsistent behavior: max_bound is EXCLUDED here
assert (dict(min_bound=10*i, max_bound=10*i + 10),) \
in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) \
in lister.run.call_args_list
assert (
dict(min_bound=10 * i, max_bound=10 * i + 10),
) in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list


@ -7,7 +7,8 @@ def register():
from .models import GNUModel
from .lister import GNULister
return {'models': [GNUModel],
'lister': GNULister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [GNUModel],
"lister": GNULister,
"task_modules": ["%s.tasks" % __name__],
}


@ -18,12 +18,12 @@ logger = logging.getLogger(__name__)
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = 'gnu'
instance = 'gnu'
LISTER_NAME = "gnu"
instance = "gnu"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz")
def task_dict(self, origin_type, origin_url, **kwargs):
"""Return task format dict
@ -51,10 +51,10 @@ class GNULister(SimpleLister):
"""
artifacts = self.gnu_tree.artifacts[origin_url]
assert origin_type == 'tar'
assert origin_type == "tar"
return utils.create_task_dict(
'load-archive-files',
kwargs.get('policy', 'oneshot'),
"load-archive-files",
kwargs.get("policy", "oneshot"),
url=origin_url,
artifacts=artifacts,
retries_left=3,
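For reference, the dict this produces lands in the scheduler with roughly the following shape (field layout per swh.scheduler's create_task_dict and the tests below; the origin URL and artifact values here are invented for illustration):

task = {
    "type": "load-archive-files",
    "policy": "oneshot",
    "retries_left": 3,
    "arguments": {
        "args": [],
        "kwargs": {
            "url": "https://ftp.gnu.org/gnu/8sync/",  # hypothetical origin
            "artifacts": [
                {
                    "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz",
                    "filename": "8sync-0.1.0.tar.gz",
                    "time": "2017-03-18T06:10:08+00:00",
                    "length": 295392,  # invented size
                    "version": "0.1.0",
                },
            ],
        },
    },
}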
@ -103,11 +103,11 @@ class GNULister(SimpleLister):
"""
return {
'uid': repo['url'],
'name': repo['name'],
'full_name': repo['name'],
'html_url': repo['url'],
'origin_url': repo['url'],
'time_last_updated': repo['time_modified'],
'origin_type': 'tar',
"uid": repo["url"],
"name": repo["name"],
"full_name": repo["name"],
"html_url": repo["url"],
"origin_url": repo["url"],
"time_last_updated": repo["time_modified"],
"origin_type": "tar",
}


@ -11,7 +11,8 @@ class GNUModel(ModelBase):
"""a GNU repository representation
"""
__tablename__ = 'gnu_repo'
__tablename__ = "gnu_repo"
uid = Column(String, primary_key=True)
time_last_updated = Column(DateTime)


@ -7,12 +7,12 @@ from celery import shared_task
from .lister import GNULister
@shared_task(name=__name__ + '.GNUListerTask')
@shared_task(name=__name__ + ".GNUListerTask")
def list_gnu_full(**lister_args):
"""List lister for the GNU source code archive"""
return GNULister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@ -10,43 +10,41 @@ logger = logging.getLogger(__name__)
def test_gnu_lister(swh_listers, requests_mock_datadir):
lister = swh_listers['gnu']
lister = swh_listers["gnu"]
lister.run()
r = lister.scheduler.search_tasks(task_type='load-archive-files')
r = lister.scheduler.search_tasks(task_type="load-archive-files")
assert len(r) == 383
for row in r:
assert row['type'] == 'load-archive-files'
assert row["type"] == "load-archive-files"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
assert set(kwargs.keys()) == {'url', 'artifacts'}
kwargs = row["arguments"]["kwargs"]
assert set(kwargs.keys()) == {"url", "artifacts"}
url = kwargs['url']
assert url.startswith('https://ftp.gnu.org')
url = kwargs["url"]
assert url.startswith("https://ftp.gnu.org")
url_suffix = url.split('https://ftp.gnu.org')[1]
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
url_suffix = url.split("https://ftp.gnu.org")[1]
assert "gnu" in url_suffix or "old-gnu" in url_suffix
artifacts = kwargs['artifacts']
artifacts = kwargs["artifacts"]
# check the artifact's structure
artifact = artifacts[0]
assert set(artifact.keys()) == {
'url', 'length', 'time', 'filename', 'version'
}
assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"}
for artifact in artifacts:
logger.debug(artifact)
# 'time' is an isoformat string now
for key in ['url', 'time', 'filename', 'version']:
for key in ["url", "time", "filename", "version"]:
assert isinstance(artifact[key], str)
assert isinstance(artifact['length'], int)
assert isinstance(artifact["length"], int)
assert row['policy'] == 'oneshot'
assert row['priority'] is None
assert row['retries_left'] == 3
assert row["policy"] == "oneshot"
assert row["priority"] is None
assert row["retries_left"] == 3


@ -2,22 +2,20 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.gnu.tasks.ping')
res = swh_app.send_task("swh.lister.gnu.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.gnu.tasks.GNULister')
@patch("swh.lister.gnu.tasks.GNULister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked GNULister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.gnu.tasks.GNUListerTask')
res = swh_app.send_task("swh.lister.gnu.tasks.GNUListerTask")
assert res
res.wait()
assert res.successful()


@ -9,26 +9,30 @@ import pytest
from os import path
from swh.lister.gnu.tree import (
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
get_version, format_date
GNUTree,
find_artifacts,
check_filename_is_archive,
load_raw_data,
get_version,
format_date,
)
def test_load_raw_data_from_query(requests_mock_datadir):
actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz')
actual_json = load_raw_data("https://ftp.gnu.org/tree.json.gz")
assert actual_json is not None
assert isinstance(actual_json, list)
assert len(actual_json) == 2
def test_load_raw_data_from_query_failure(requests_mock_datadir):
inexistant_url = 'https://ftp2.gnu.org/tree.unknown.gz'
with pytest.raises(ValueError, match='Error during query'):
inexistant_url = "https://ftp2.gnu.org/tree.unknown.gz"
with pytest.raises(ValueError, match="Error during query"):
load_raw_data(inexistant_url)
def test_load_raw_data_from_file(datadir):
filepath = path.join(datadir, 'https_ftp.gnu.org', 'tree.json.gz')
filepath = path.join(datadir, "https_ftp.gnu.org", "tree.json.gz")
actual_json = load_raw_data(filepath)
assert actual_json is not None
assert isinstance(actual_json, list)
@ -36,115 +40,115 @@ def test_load_raw_data_from_file(datadir):
def test_load_raw_data_from_file_failure(datadir):
unknown_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')
unknown_path = path.join(datadir, "ftp.gnu.org2", "tree.json.gz")
with pytest.raises(FileNotFoundError):
load_raw_data(unknown_path)
def test_tree_json(requests_mock_datadir):
tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz')
tree_json = GNUTree("https://ftp.gnu.org/tree.json.gz")
assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == {
'name': '8sync',
'time_modified': '2017-03-18T06:10:08+00:00',
'url': 'https://ftp.gnu.org/gnu/8sync/'
assert tree_json.projects["https://ftp.gnu.org/gnu/8sync/"] == {
"name": "8sync",
"time_modified": "2017-03-18T06:10:08+00:00",
"url": "https://ftp.gnu.org/gnu/8sync/",
}
assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == {
'name': '3dldf',
'time_modified': '2013-12-13T19:00:36+00:00',
'url': 'https://ftp.gnu.org/gnu/3dldf/'
assert tree_json.projects["https://ftp.gnu.org/gnu/3dldf/"] == {
"name": "3dldf",
"time_modified": "2013-12-13T19:00:36+00:00",
"url": "https://ftp.gnu.org/gnu/3dldf/",
}
assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == {
'name': 'a2ps',
'time_modified': '2007-12-29T03:55:05+00:00',
'url': 'https://ftp.gnu.org/gnu/a2ps/'
assert tree_json.projects["https://ftp.gnu.org/gnu/a2ps/"] == {
"name": "a2ps",
"time_modified": "2007-12-29T03:55:05+00:00",
"url": "https://ftp.gnu.org/gnu/a2ps/",
}
assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == {
'name': 'xshogi',
'time_modified': '2003-08-02T11:15:22+00:00',
'url': 'https://ftp.gnu.org/old-gnu/xshogi/'
assert tree_json.projects["https://ftp.gnu.org/old-gnu/xshogi/"] == {
"name": "xshogi",
"time_modified": "2003-08-02T11:15:22+00:00",
"url": "https://ftp.gnu.org/old-gnu/xshogi/",
}
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
assert tree_json.artifacts["https://ftp.gnu.org/old-gnu/zlibc/"] == [
{
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'length': 90106,
'time': '1997-03-10T08:00:00+00:00',
'filename': 'zlibc-0.9b.tar.gz',
'version': '0.9b',
"url": "https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz", # noqa
"length": 90106,
"time": "1997-03-10T08:00:00+00:00",
"filename": "zlibc-0.9b.tar.gz",
"version": "0.9b",
},
{
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'length': 89625,
'time': '1997-04-07T07:00:00+00:00',
'filename': 'zlibc-0.9e.tar.gz',
'version': '0.9e',
}
"url": "https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz", # noqa
"length": 89625,
"time": "1997-04-07T07:00:00+00:00",
"filename": "zlibc-0.9e.tar.gz",
"version": "0.9e",
},
]
def test_tree_json_failures(requests_mock_datadir):
url = 'https://unknown/tree.json.gz'
url = "https://unknown/tree.json.gz"
tree_json = GNUTree(url)
with pytest.raises(ValueError, match='Error during query to %s' % url):
tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/']
with pytest.raises(ValueError, match="Error during query to %s" % url):
tree_json.artifacts["https://ftp.gnu.org/gnu/3dldf/"]
with pytest.raises(ValueError, match='Error during query to %s' % url):
tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/']
with pytest.raises(ValueError, match="Error during query to %s" % url):
tree_json.projects["https://ftp.gnu.org/old-gnu/xshogi/"]
def test_find_artifacts_small_sample(datadir):
expected_artifacts = [
{
'url': '/root/artanis/artanis-0.2.1.tar.bz2',
'time': '2017-05-19T14:59:39+00:00',
'length': 424081,
'version': '0.2.1',
'filename': 'artanis-0.2.1.tar.bz2',
"url": "/root/artanis/artanis-0.2.1.tar.bz2",
"time": "2017-05-19T14:59:39+00:00",
"length": 424081,
"version": "0.2.1",
"filename": "artanis-0.2.1.tar.bz2",
},
{
'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'time': '1998-06-21T09:55:00+00:00',
'length': 1514448,
'version': '4_0_0-src',
'filename': 'winboard-4_0_0-src.zip',
"url": "/root/xboard/winboard/winboard-4_0_0-src.zip", # noqa
"time": "1998-06-21T09:55:00+00:00",
"length": 1514448,
"version": "4_0_0-src",
"filename": "winboard-4_0_0-src.zip",
},
{
'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'time': '1997-07-25T07:00:00+00:00',
'length': 450164,
'version': '3.6.2',
'filename': 'xboard-3.6.2.tar.gz',
"url": "/root/xboard/xboard-3.6.2.tar.gz", # noqa
"time": "1997-07-25T07:00:00+00:00",
"length": 450164,
"version": "3.6.2",
"filename": "xboard-3.6.2.tar.gz",
},
{
'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'time': '1998-06-21T09:55:00+00:00',
'length': 514951,
'version': '4.0.0',
'filename': 'xboard-4.0.0.tar.gz',
"url": "/root/xboard/xboard-4.0.0.tar.gz", # noqa
"time": "1998-06-21T09:55:00+00:00",
"length": 514951,
"version": "4.0.0",
"filename": "xboard-4.0.0.tar.gz",
},
]
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
actual_artifacts = find_artifacts(file_structure, '/root/')
file_structure = json.load(open(path.join(datadir, "tree.min.json")))
actual_artifacts = find_artifacts(file_structure, "/root/")
assert actual_artifacts == expected_artifacts
def test_find_artifacts(datadir):
file_structure = json.load(open(path.join(datadir, 'tree.json')))
actual_artifacts = find_artifacts(file_structure, '/root/')
file_structure = json.load(open(path.join(datadir, "tree.json")))
actual_artifacts = find_artifacts(file_structure, "/root/")
assert len(actual_artifacts) == 42 + 3 # tar + zip
def test_check_filename_is_archive():
for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']:
for ext in ["abc.xy.zip", "cvb.zip", "abc.tar.bz2", "something.tar"]:
assert check_filename_is_archive(ext) is True
for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
for ext in ["abc.tar.gz.sig", "abc", "something.zip2", "foo.tar."]:
assert check_filename_is_archive(ext) is False
@ -155,54 +159,62 @@ def test_get_version():
"""
for url, expected_branchname in [
('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
('https://ftp.org/gnu/aris-w32.zip', 'w32'),
('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
('https://ftp.org/gnu/crypto-build-demo.tar.gz',
'crypto-build-demo'),
('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
'clue+clio+xit.clisp'),
('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
'clue+clio.for-pcl'),
('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
'hppa2.0-hp-hpux10.20'),
('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
('clisp-powerpc-unknown-linuxlibc6.tar.gz',
'powerpc-unknown-linuxlibc6'),
('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
'sparc-sun-sunos4.1.3_U1'),
('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
'2.25.1-powerpc-apple-MacOSX'),
('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
'2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
'2.27-i686-unknown-Linux-2.2.19'),
('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
'2.28-i386-i386-freebsd-4.3-RELEASE'),
('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
'2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
'2.29-i386-i386-freebsd-4.6-STABLE'),
('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
'2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
'2.5.3-ansi-japi-xdr.20030701_mingw32'),
('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
('sather-logo_images.tar.gz', 'sather-logo_images'),
('sather-specification-000328.html.tar.gz', '000328.html'),
('something-10.1.0.7z', '10.1.0'),
("https://gnu.org/sthg/info-2.1.0.tar.gz", "2.1.0"),
("https://gnu.org/sthg/info-2.1.2.zip", "2.1.2"),
("https://sthg.org/gnu/sthg.tar.gz", "sthg"),
("https://sthg.org/gnu/DLDF-1.1.4.tar.gz", "1.1.4"),
("https://sthg.org/gnu/anubis-latest.tar.bz2", "latest"),
("https://ftp.org/gnu/aris-w32.zip", "w32"),
("https://ftp.org/gnu/aris-w32-2.2.zip", "w32-2.2"),
("https://ftp.org/gnu/autogen.info.tar.gz", "autogen.info"),
("https://ftp.org/gnu/crypto-build-demo.tar.gz", "crypto-build-demo"),
("https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz", "clue+clio+xit.clisp"),
("https://ftp.org/gnu/clue+clio.for-pcl.tar.gz", "clue+clio.for-pcl"),
(
"https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz",
"hppa2.0-hp-hpux10.20",
),
("clisp-i386-solaris2.6.tar.gz", "i386-solaris2.6"),
("clisp-mips-sgi-irix6.5.tar.gz", "mips-sgi-irix6.5"),
("clisp-powerpc-apple-macos.tar.gz", "powerpc-apple-macos"),
("clisp-powerpc-unknown-linuxlibc6.tar.gz", "powerpc-unknown-linuxlibc6"),
("clisp-rs6000-ibm-aix3.2.5.tar.gz", "rs6000-ibm-aix3.2.5"),
("clisp-sparc-redhat51-linux.tar.gz", "sparc-redhat51-linux"),
("clisp-sparc-sun-solaris2.4.tar.gz", "sparc-sun-solaris2.4"),
("clisp-sparc-sun-sunos4.1.3_U1.tar.gz", "sparc-sun-sunos4.1.3_U1"),
("clisp-2.25.1-powerpc-apple-MacOSX.tar.gz", "2.25.1-powerpc-apple-MacOSX"),
(
"clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz",
"2.27-PowerMacintosh-powerpc-Darwin-1.3.7",
),
(
"clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz",
"2.27-i686-unknown-Linux-2.2.19",
),
(
"clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz",
"2.28-i386-i386-freebsd-4.3-RELEASE",
),
(
"clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz",
"2.28-i686-unknown-cygwin_me-4.90-1.3.10",
),
(
"clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz",
"2.29-i386-i386-freebsd-4.6-STABLE",
),
(
"clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz",
"2.29-i686-unknown-cygwin_nt-5.0-1.3.12",
),
(
"gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip",
"2.5.3-ansi-japi-xdr.20030701_mingw32",
),
("gettext-runtime-0.13.1.bin.woe32.zip", "0.13.1.bin.woe32"),
("sather-logo_images.tar.gz", "sather-logo_images"),
("sather-specification-000328.html.tar.gz", "000328.html"),
("something-10.1.0.7z", "10.1.0"),
]:
actual_branchname = get_version(url)
@ -211,16 +223,16 @@ def test_get_version():
def test_format_date():
for timestamp, expected_isoformat_date in [
(1489817408, '2017-03-18T06:10:08+00:00'),
(1386961236, '2013-12-13T19:00:36+00:00'),
('1198900505', '2007-12-29T03:55:05+00:00'),
(1059822922, '2003-08-02T11:15:22+00:00'),
('1489817408', '2017-03-18T06:10:08+00:00'),
(1489817408, "2017-03-18T06:10:08+00:00"),
(1386961236, "2013-12-13T19:00:36+00:00"),
("1198900505", "2007-12-29T03:55:05+00:00"),
(1059822922, "2003-08-02T11:15:22+00:00"),
("1489817408", "2017-03-18T06:10:08+00:00"),
]:
actual_date = format_date(timestamp)
assert actual_date == expected_isoformat_date
with pytest.raises(ValueError):
format_date('')
format_date("")
with pytest.raises(TypeError):
format_date(None)
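Those assertions pin down format_date's contract: an int or numeric str in, a UTC ISO-8601 string out, ValueError on the empty string and TypeError on None. A minimal sketch satisfying exactly these cases (the real swh.lister.gnu.tree implementation may differ):

from datetime import datetime, timezone

def format_date_sketch(timestamp):
    # int("") raises ValueError and int(None) raises TypeError, as the tests expect
    return datetime.fromtimestamp(int(timestamp), tz=timezone.utc).isoformat()

format_date_sketch(1489817408)  # -> "2017-03-18T06:10:08+00:00"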


@ -24,12 +24,13 @@ class GNUTree:
"""Gnu Tree's representation
"""
def __init__(self, url: str):
self.url = url # filepath or uri
u = urlparse(url)
self.base_url = '%s://%s' % (u.scheme, u.netloc)
self.base_url = "%s://%s" % (u.scheme, u.netloc)
# Interesting top level directories
self.top_level_directories = ['gnu', 'old-gnu']
self.top_level_directories = ["gnu", "old-gnu"]
# internal state
self._artifacts = {} # type: Mapping[str, Any]
self._projects = {} # type: Mapping[str, Any]
@ -59,21 +60,23 @@ class GNUTree:
artifacts = {}
raw_data = load_raw_data(self.url)[0]
for directory in raw_data['contents']:
if directory['name'] not in self.top_level_directories:
for directory in raw_data["contents"]:
if directory["name"] not in self.top_level_directories:
continue
infos = directory['contents']
infos = directory["contents"]
for info in infos:
if info['type'] == 'directory':
package_url = '%s/%s/%s/' % (
self.base_url, directory['name'], info['name'])
package_artifacts = find_artifacts(
info['contents'], package_url)
if info["type"] == "directory":
package_url = "%s/%s/%s/" % (
self.base_url,
directory["name"],
info["name"],
)
package_artifacts = find_artifacts(info["contents"], package_url)
if package_artifacts != []:
repo_details = {
'name': info['name'],
'url': package_url,
'time_modified': format_date(info['time'])
"name": info["name"],
"url": package_url,
"time_modified": format_date(info["time"]),
}
artifacts[package_url] = package_artifacts
projects[package_url] = repo_details
@ -81,8 +84,9 @@ class GNUTree:
return projects, artifacts
def find_artifacts(filesystem: List[Mapping[str, Any]],
url: str) -> List[Mapping[str, Any]]:
def find_artifacts(
filesystem: List[Mapping[str, Any]], url: str
) -> List[Mapping[str, Any]]:
"""Recursively list artifacts present in the folder and subfolders for a
particular package url.
@ -127,23 +131,25 @@ def find_artifacts(filesystem: List[Mapping[str, Any]],
"""
artifacts = [] # type: List[Mapping[str, Any]]
for info_file in filesystem:
filetype = info_file['type']
filename = info_file['name']
if filetype == 'file':
filetype = info_file["type"]
filename = info_file["name"]
if filetype == "file":
if check_filename_is_archive(filename):
uri = url + filename
artifacts.append({
'url': uri,
'filename': filename,
'time': format_date(info_file['time']),
'length': int(info_file['size']),
'version': get_version(filename),
})
artifacts.append(
{
"url": uri,
"filename": filename,
"time": format_date(info_file["time"]),
"length": int(info_file["size"]),
"version": get_version(filename),
}
)
# It will recursively check for artifacts in all sub-folders
elif filetype == 'directory':
elif filetype == "directory":
tarballs_in_dir = find_artifacts(
info_file['contents'],
url + filename + '/')
info_file["contents"], url + filename + "/"
)
artifacts.extend(tarballs_in_dir)
return artifacts
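find_artifacts walks the same nested records as tree.json: files carry type/name/time/size, directories add a contents list that is recursed into. A toy invocation, with invented entries:

listing = [
    {"type": "file", "name": "demo-1.0.tar.gz", "time": 1489817408, "size": 1024},
    {
        "type": "directory",
        "name": "old",
        "contents": [
            {"type": "file", "name": "demo-0.9.zip", "time": 1386961236, "size": 512},
        ],
    },
]
find_artifacts(listing, "https://ftp.gnu.org/gnu/demo/")
# -> two artifact dicts (url/filename/time/length/version), the second under .../demo/old/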
@ -176,40 +182,67 @@ def check_filename_is_archive(filename: str) -> bool:
"""
file_suffixes = Path(filename).suffixes
if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
if len(file_suffixes) == 1 and file_suffixes[-1] in (".zip", ".tar"):
return True
elif len(file_suffixes) > 1:
if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
if file_suffixes[-1] == ".zip" or file_suffixes[-2] == ".tar":
return True
return False
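The check leans on pathlib: Path(...).suffixes returns every dotted extension, so compound archive names are caught by their second-to-last component. With filenames from the tests above:

from pathlib import Path

Path("cvb.zip").suffixes         # ['.zip']                -> archive
Path("abc.tar.bz2").suffixes     # ['.tar', '.bz2']        -> archive (suffixes[-2])
Path("abc.tar.gz.sig").suffixes  # ['.tar', '.gz', '.sig'] -> rejected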
# to recognize existing naming pattern
EXTENSIONS = [
'zip',
'tar',
'gz', 'tgz',
'bz2', 'bzip2',
'lzma', 'lz',
'xz',
'Z', '7z',
"zip",
"tar",
"gz",
"tgz",
"bz2",
"bzip2",
"lzma",
"lz",
"xz",
"Z",
"7z",
]
VERSION_KEYWORDS = [
'cygwin_me',
'w32', 'win32', 'nt', 'cygwin', 'mingw',
'latest', 'alpha', 'beta',
'release', 'stable',
'hppa',
'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
'aix', 'ibm', 'rs6000',
'i386', 'i686',
'linux', 'redhat', 'linuxlibc',
'mips',
'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
'unknown',
'netbsd', 'freebsd',
'sgi', 'irix',
"cygwin_me",
"w32",
"win32",
"nt",
"cygwin",
"mingw",
"latest",
"alpha",
"beta",
"release",
"stable",
"hppa",
"solaris",
"sunos",
"sun4u",
"sparc",
"sun",
"aix",
"ibm",
"rs6000",
"i386",
"i686",
"linux",
"redhat",
"linuxlibc",
"mips",
"powerpc",
"macos",
"apple",
"darwin",
"macosx",
"powermacintosh",
"unknown",
"netbsd",
"freebsd",
"sgi",
"irix",
]
# Match a filename into components.
@ -225,7 +258,7 @@ VERSION_KEYWORDS = [
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
PATTERN = r'''
PATTERN = r"""
^
(?:
# We have a software name and a release number, separated with a
@ -239,9 +272,9 @@ PATTERN = r'''
)
(?P<extension>(?:\.(?:{extensions}))+)
$
'''.format(
extensions='|'.join(EXTENSIONS),
vkeywords='|'.join('%s[-]?' % k for k in VERSION_KEYWORDS),
""".format(
extensions="|".join(EXTENSIONS),
vkeywords="|".join("%s[-]?" % k for k in VERSION_KEYWORDS),
)
@ -267,16 +300,15 @@ def get_version(uri: str) -> str:
"""
filename = path.split(uri)[-1]
m = re.match(PATTERN, filename,
flags=re.VERBOSE | re.IGNORECASE)
m = re.match(PATTERN, filename, flags=re.VERBOSE | re.IGNORECASE)
if m:
d = m.groupdict()
if d['software_name1'] and d['release_number']:
return d['release_number']
if d['software_name2']:
return d['software_name2']
if d["software_name1"] and d["release_number"]:
return d["release_number"]
if d["software_name2"]:
return d["software_name2"]
return ''
return ""
def load_raw_data(url: str) -> Sequence[Mapping]:
@ -289,15 +321,15 @@ def load_raw_data(url: str) -> Sequence[Mapping]:
The raw json list
"""
if url.startswith('http://') or url.startswith('https://'):
if url.startswith("http://") or url.startswith("https://"):
response = requests.get(url, allow_redirects=True)
if not response.ok:
raise ValueError('Error during query to %s' % url)
raise ValueError("Error during query to %s" % url)
raw = gzip.decompress(response.content)
else:
with gzip.open(url, 'r') as f:
with gzip.open(url, "r") as f:
raw = f.read()
raw_data = json.loads(raw.decode('utf-8'))
raw_data = json.loads(raw.decode("utf-8"))
return raw_data


@ -7,14 +7,15 @@ def register():
from .models import NpmVisitModel, NpmModel
from .lister import NpmLister
return {'models': [NpmVisitModel, NpmModel],
'lister': NpmLister,
'task_modules': ['%s.tasks' % __name__],
'task_types': {
'list-npm-full': {
'default_interval': '7 days',
'min_interval': '7 days',
'max_interval': '7 days',
},
},
}
return {
"models": [NpmVisitModel, NpmModel],
"lister": NpmLister,
"task_modules": ["%s.tasks" % __name__],
"task_types": {
"list-npm-full": {
"default_interval": "7 days",
"min_interval": "7 days",
"max_interval": "7 days",
},
},
}


@ -14,15 +14,17 @@ class NpmListerBase(IndexingHttpLister):
"""List packages available in the npm registry in a paginated way
"""
MODEL = NpmModel
LISTER_NAME = 'npm'
instance = 'npm'
def __init__(self, url='https://replicate.npmjs.com',
per_page=1000, override_config=None):
MODEL = NpmModel
LISTER_NAME = "npm"
instance = "npm"
def __init__(
self, url="https://replicate.npmjs.com", per_page=1000, override_config=None
):
super().__init__(url=url, override_config=override_config)
self.per_page = per_page + 1
self.PATH_TEMPLATE += '&limit=%s' % self.per_page
self.PATH_TEMPLATE += "&limit=%s" % self.per_page
@property
def ADDITIONAL_CONFIG(self) -> Dict[str, Any]:
@ -30,22 +32,22 @@ class NpmListerBase(IndexingHttpLister):
"""
default_config = super().ADDITIONAL_CONFIG
default_config['loading_task_policy'] = ('str', 'recurring')
default_config["loading_task_policy"] = ("str", "recurring")
return default_config
def get_model_from_repo(self, repo_name: str) -> Dict[str, str]:
"""(Override) Transform from npm package name to model
"""
package_url = 'https://www.npmjs.com/package/%s' % repo_name
package_url = "https://www.npmjs.com/package/%s" % repo_name
return {
'uid': repo_name,
'indexable': repo_name,
'name': repo_name,
'full_name': repo_name,
'html_url': package_url,
'origin_url': package_url,
'origin_type': 'npm',
"uid": repo_name,
"indexable": repo_name,
"name": repo_name,
"full_name": repo_name,
"html_url": package_url,
"origin_url": package_url,
"origin_type": "npm",
}
def task_dict(self, origin_type: str, origin_url: str, **kwargs):
@ -56,10 +58,9 @@ class NpmListerBase(IndexingHttpLister):
needed for the ingestion task creation.
"""
task_type = 'load-%s' % origin_type
task_policy = self.config['loading_task_policy']
return create_task_dict(task_type, task_policy,
url=origin_url)
task_type = "load-%s" % origin_type
task_policy = self.config["loading_task_policy"]
return create_task_dict(task_type, task_policy, url=origin_url)
def request_headers(self) -> Dict[str, Any]:
"""(Override) Set requests headers to send when querying the npm
@ -67,7 +68,7 @@ class NpmListerBase(IndexingHttpLister):
"""
headers = super().request_headers()
headers['Accept'] = 'application/json'
headers["Accept"] = "application/json"
return headers
def string_pattern_check(self, inner: int, lower: int, upper: int = None):
@ -83,25 +84,24 @@ class NpmLister(NpmListerBase):
"""List all packages available in the npm registry in a paginated way
"""
PATH_TEMPLATE = '/_all_docs?startkey="%s"'
def get_next_target_from_response(
self, response: Response) -> Optional[str]:
def get_next_target_from_response(self, response: Response) -> Optional[str]:
"""(Override) Get next npm package name to continue the listing
"""
repos = response.json()['rows']
return repos[-1]['id'] if len(repos) == self.per_page else None
repos = response.json()["rows"]
return repos[-1]["id"] if len(repos) == self.per_page else None
def transport_response_simplified(
self, response: Response) -> List[Dict[str, str]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
"""(Override) Transform npm registry response to list for model manipulation
"""
repos = response.json()['rows']
repos = response.json()["rows"]
if len(repos) == self.per_page:
repos = repos[:-1]
return [self.get_model_from_repo(repo['id']) for repo in repos]
return [self.get_model_from_repo(repo["id"]) for repo in repos]
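The per_page + 1 set up in NpmListerBase is a look-ahead sentinel: every request asks for one row more than a page. A full response proves another page exists, its last row seeds the next startkey (CouchDB's startkey is inclusive), and that row is trimmed here so no package is emitted twice. Schematically, with invented ids and a page size of 2:

# per_page = 3 (page size 2 + 1 sentinel)
# request 1: rows = ["a", "b", "c"]  -> emit ["a", "b"], next startkey = "c"
# request 2: rows = ["c", "d"]       -> short page: emit ["c", "d"], listing done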
class NpmIncrementalLister(NpmListerBase):
@ -109,30 +109,29 @@ class NpmIncrementalLister(NpmListerBase):
update_seq value of the underlying CouchDB database, in a paginated way.
"""
PATH_TEMPLATE = '/_changes?since=%s'
PATH_TEMPLATE = "/_changes?since=%s"
@property
def CONFIG_BASE_FILENAME(self): # noqa: N802
return 'lister_npm_incremental'
return "lister_npm_incremental"
def get_next_target_from_response(
self, response: Response) -> Optional[str]:
def get_next_target_from_response(self, response: Response) -> Optional[str]:
"""(Override) Get next npm package name to continue the listing.
"""
repos = response.json()['results']
return repos[-1]['seq'] if len(repos) == self.per_page else None
repos = response.json()["results"]
return repos[-1]["seq"] if len(repos) == self.per_page else None
def transport_response_simplified(
self, response: Response) -> List[Dict[str, str]]:
def transport_response_simplified(self, response: Response) -> List[Dict[str, str]]:
"""(Override) Transform npm registry response to list for model
manipulation.
"""
repos = response.json()['results']
repos = response.json()["results"]
if len(repos) == self.per_page:
repos = repos[:-1]
return [self.get_model_from_repo(repo['id']) for repo in repos]
return [self.get_model_from_repo(repo["id"]) for repo in repos]
def filter_before_inject(self, models_list: List[Dict[str, Any]]):
"""(Override) Filter out documents in the CouchDB database
@ -141,9 +140,9 @@ class NpmIncrementalLister(NpmListerBase):
"""
models_filtered = []
for model in models_list:
package_name = model['name']
package_name = model["name"]
# document related to CouchDB internals
if package_name.startswith('_design/'):
if package_name.startswith("_design/"):
continue
models_filtered.append(model)
return models_filtered


@ -11,9 +11,10 @@ class NpmVisitModel(SQLBase, metaclass=ABCSQLMeta):
"""Table to store the npm registry state at the time of a
content listing by Software Heritage
"""
__tablename__ = 'npm_visit'
uid = Column(Integer, Sequence('npm_visit_id_seq'), primary_key=True)
__tablename__ = "npm_visit"
uid = Column(Integer, Sequence("npm_visit_id_seq"), primary_key=True)
visit_date = Column(DateTime, nullable=False)
doc_count = Column(BigInteger)
doc_del_count = Column(BigInteger)
@ -29,7 +30,8 @@ class NpmModel(IndexingModelBase):
"""A npm package representation
"""
__tablename__ = 'npm_repo'
__tablename__ = "npm_repo"
uid = Column(String, primary_key=True)
indexable = Column(String, index=True)


@ -13,15 +13,22 @@ from swh.lister.npm.models import NpmVisitModel
@contextmanager
def save_registry_state(lister):
params = {'headers': lister.request_headers()}
params = {"headers": lister.request_headers()}
registry_state = lister.session.get(lister.url, **params)
registry_state = registry_state.json()
keys = ('doc_count', 'doc_del_count', 'update_seq', 'purge_seq',
'disk_size', 'data_size', 'committed_update_seq',
'compacted_seq')
keys = (
"doc_count",
"doc_del_count",
"update_seq",
"purge_seq",
"disk_size",
"data_size",
"committed_update_seq",
"compacted_seq",
)
state = {key: registry_state[key] for key in keys}
state['visit_date'] = datetime.now()
state["visit_date"] = datetime.now()
yield
npm_visit = NpmVisitModel(**state)
lister.db_session.add(npm_visit)
@ -34,29 +41,31 @@ def get_last_update_seq(lister):
query = lister.db_session.query(NpmVisitModel.update_seq)
row = query.order_by(NpmVisitModel.uid.desc()).first()
if not row:
raise ValueError('No npm registry listing previously performed ! '
'This is required prior to the execution of an '
'incremental listing.')
raise ValueError(
"No npm registry listing previously performed ! "
"This is required prior to the execution of an "
"incremental listing."
)
return row[0]
@shared_task(name=__name__ + '.NpmListerTask')
@shared_task(name=__name__ + ".NpmListerTask")
def list_npm_full(**lister_args):
'Full lister for the npm (javascript) registry'
"Full lister for the npm (javascript) registry"
lister = NpmLister(**lister_args)
with save_registry_state(lister):
return lister.run()
@shared_task(name=__name__ + '.NpmIncrementalListerTask')
@shared_task(name=__name__ + ".NpmIncrementalListerTask")
def list_npm_incremental(**lister_args):
'Incremental lister for the npm (javascript) registry'
"Incremental lister for the npm (javascript) registry"
lister = NpmIncrementalLister(**lister_args)
update_seq_start = get_last_update_seq(lister)
with save_registry_state(lister):
return lister.run(min_bound=update_seq_start)
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_npm(swh_listers):
lister = swh_listers['npm']
lister = swh_listers["npm"]
# Add the load-npm task type in the scheduler backend
lister.scheduler.create_task_type({
'type': 'load-npm',
'description': 'Load npm package',
'backend_name': 'swh.loader.package.tasks.LoadNpm',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-npm",
"description": "Load npm package",
"backend_name": "swh.loader.package.tasks.LoadNpm",
"default_interval": "1 day",
}
)
return lister


@ -21,10 +21,10 @@ logger = logging.getLogger(__name__)
class NpmListerTester(HttpListerTesterBase, unittest.TestCase):
Lister = NpmLister
test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*')
lister_subdir = 'npm'
good_api_response_file = 'data/replicate.npmjs.com/api_response.json'
bad_api_response_file = 'data/api_empty_response.json'
first_index = 'jquery'
lister_subdir = "npm"
good_api_response_file = "data/replicate.npmjs.com/api_response.json"
bad_api_response_file = "data/api_empty_response.json"
first_index = "jquery"
entries_per_page = 100
@requests_mock.Mocker()
@ -37,11 +37,11 @@ class NpmListerTester(HttpListerTesterBase, unittest.TestCase):
class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase):
Lister = NpmIncrementalLister
test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*')
lister_subdir = 'npm'
good_api_response_file = 'data/api_inc_response.json'
bad_api_response_file = 'data/api_inc_empty_response.json'
first_index = '6920642'
test_re = re.compile(r"^.*/_changes\?since=([0-9]+).*")
lister_subdir = "npm"
good_api_response_file = "data/api_inc_response.json"
bad_api_response_file = "data/api_inc_empty_response.json"
first_index = "6920642"
entries_per_page = 100
@requests_mock.Mocker()
@ -58,27 +58,27 @@ def check_tasks(tasks: List[Any]):
"""
for row in tasks:
logger.debug('row: %s', row)
assert row['type'] == 'load-npm'
logger.debug("row: %s", row)
assert row["type"] == "load-npm"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
package_url = kwargs['url']
package_name = package_url.split('/')[-1]
assert package_url == f'https://www.npmjs.com/package/{package_name}'
package_url = kwargs["url"]
package_name = package_url.split("/")[-1]
assert package_url == f"https://www.npmjs.com/package/{package_name}"
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None
def test_lister_npm_basic_listing(lister_npm, requests_mock_datadir):
lister_npm.run()
tasks = lister_npm.scheduler.search_tasks(task_type='load-npm')
tasks = lister_npm.scheduler.search_tasks(task_type="load-npm")
assert len(tasks) == 100
check_tasks(tasks)
@ -89,10 +89,11 @@ def test_lister_npm_listing_pagination(lister_npm, requests_mock_datadir):
# Patch per page pagination
lister.per_page = 10 + 1
lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace(
'&limit=1001', '&limit=%s' % lister.per_page)
"&limit=1001", "&limit=%s" % lister.per_page
)
lister.run()
tasks = lister.scheduler.search_tasks(task_type='load-npm')
tasks = lister.scheduler.search_tasks(task_type="load-npm")
assert len(tasks) == 2 * 10 # only 2 files with 10 results each
check_tasks(tasks)


@ -8,23 +8,22 @@ def mock_save(lister):
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.npm.tasks.ping')
res = swh_app.send_task("swh.lister.npm.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.npm.tasks.save_registry_state')
@patch('swh.lister.npm.tasks.NpmLister')
@patch("swh.lister.npm.tasks.save_registry_state")
@patch("swh.lister.npm.tasks.NpmLister")
def test_lister(lister, save, swh_app, celery_session_worker):
# setup the mocked NpmLister
lister.return_value = lister
lister.run.return_value = None
save.side_effect = mock_save
res = swh_app.send_task('swh.lister.npm.tasks.NpmListerTask')
res = swh_app.send_task("swh.lister.npm.tasks.NpmListerTask")
assert res
res.wait()
assert res.successful()
@ -33,9 +32,9 @@ def test_lister(lister, save, swh_app, celery_session_worker):
lister.run.assert_called_once_with()
@patch('swh.lister.npm.tasks.save_registry_state')
@patch('swh.lister.npm.tasks.get_last_update_seq')
@patch('swh.lister.npm.tasks.NpmIncrementalLister')
@patch("swh.lister.npm.tasks.save_registry_state")
@patch("swh.lister.npm.tasks.get_last_update_seq")
@patch("swh.lister.npm.tasks.NpmIncrementalLister")
def test_incremental(lister, seq, save, swh_app, celery_session_worker):
# setup the mocked NpmLister
lister.return_value = lister
@ -43,8 +42,7 @@ def test_incremental(lister, seq, save, swh_app, celery_session_worker):
seq.return_value = 42
save.side_effect = mock_save
res = swh_app.send_task(
'swh.lister.npm.tasks.NpmIncrementalListerTask')
res = swh_app.send_task("swh.lister.npm.tasks.NpmIncrementalListerTask")
assert res
res.wait()
assert res.successful()


@ -7,7 +7,8 @@ def register():
from .models import PackagistModel
from .lister import PackagistLister
return {'models': [PackagistModel],
'lister': PackagistLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [PackagistModel],
"lister": PackagistLister,
"task_modules": ["%s.tasks" % __name__],
}


@ -23,7 +23,7 @@ def compute_package_url(repo_name: str) -> str:
"""Compute packgist package url from repo name.
"""
return 'https://repo.packagist.org/p/%s.json' % repo_name
return "https://repo.packagist.org/p/%s.json" % repo_name
class PackagistLister(ListerOnePageApiTransport, SimpleLister):
@ -52,17 +52,19 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):
'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json'
"""
MODEL = PackagistModel
LISTER_NAME = 'packagist'
PAGE = 'https://packagist.org/packages/list.json'
instance = 'packagist'
LISTER_NAME = "packagist"
PAGE = "https://packagist.org/packages/list.json"
instance = "packagist"
def __init__(self, override_config=None):
ListerOnePageApiTransport .__init__(self)
ListerOnePageApiTransport.__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def task_dict(self, origin_type: str, origin_url: str,
**kwargs: Mapping[str, str]) -> Dict[str, Any]:
def task_dict(
self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str]
) -> Dict[str, Any]:
"""Return task format dict
This is overridden from the lister_base as more information is
@ -70,18 +72,20 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):
"""
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'recurring'),
kwargs.get('name'), origin_url,
retries_left=3)
"load-%s" % origin_type,
kwargs.get("policy", "recurring"),
kwargs.get("name"),
origin_url,
retries_left=3,
)
def list_packages(self, response: Any) -> List[str]:
"""List the actual packagist origins from the response.
"""
response = json.loads(response.text)
packages = [name for name in response['packageNames']]
logger.debug('Number of packages: %s', len(packages))
packages = [name for name in response["packageNames"]]
logger.debug("Number of packages: %s", len(packages))
random.shuffle(packages)
return packages
@ -91,10 +95,10 @@ class PackagistLister(ListerOnePageApiTransport, SimpleLister):
"""
url = compute_package_url(repo_name)
return {
'uid': repo_name,
'name': repo_name,
'full_name': repo_name,
'html_url': url,
'origin_url': url,
'origin_type': 'packagist',
"uid": repo_name,
"name": repo_name,
"full_name": repo_name,
"html_url": url,
"origin_url": url,
"origin_type": "packagist",
}


@ -11,6 +11,7 @@ class PackagistModel(ModelBase):
"""a Packagist repository representation
"""
__tablename__ = 'packagist_repo'
__tablename__ = "packagist_repo"
uid = Column(String, primary_key=True)


@ -7,12 +7,12 @@ from celery import shared_task
from .lister import PackagistLister
@shared_task(name=__name__ + '.PackagistListerTask')
@shared_task(name=__name__ + ".PackagistListerTask")
def list_packagist(**lister_args):
'List the packagist (php) registry'
"List the packagist (php) registry"
PackagistLister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"


@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_packagist(swh_listers):
lister = swh_listers['packagist']
lister = swh_listers["packagist"]
# Amend the scheduler with the not-yet-registered load-packagist task type
lister.scheduler.create_task_type({
'type': 'load-packagist',
'description': 'Load packagist origin',
'backend_name': 'swh.loader.package.tasks.LoaderPackagist',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-packagist",
"description": "Load packagist origin",
"backend_name": "swh.loader.package.tasks.LoaderPackagist",
"default_interval": "1 day",
}
)
return lister

View file

@ -12,27 +12,29 @@ from swh.lister.packagist.lister import PackagistLister, compute_package_url
from swh.lister.core.tests.test_lister import HttpSimpleListerTester
expected_packages = ['0.0.0/composer-include-files', '0.0.0/laravel-env-shim',
'0.0.1/try-make-package', '0099ff/dialogflowphp',
'00f100/array_dot']
expected_packages = [
"0.0.0/composer-include-files",
"0.0.0/laravel-env-shim",
"0.0.1/try-make-package",
"0099ff/dialogflowphp",
"00f100/array_dot",
]
expected_model = {
'uid': '0099ff/dialogflowphp',
'name': '0099ff/dialogflowphp',
'full_name': '0099ff/dialogflowphp',
'html_url':
'https://repo.packagist.org/p/0099ff/dialogflowphp.json',
'origin_url':
'https://repo.packagist.org/p/0099ff/dialogflowphp.json',
'origin_type': 'packagist',
}
"uid": "0099ff/dialogflowphp",
"name": "0099ff/dialogflowphp",
"full_name": "0099ff/dialogflowphp",
"html_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
"origin_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
"origin_type": "packagist",
}
class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase):
Lister = PackagistLister
PAGE = 'https://packagist.org/packages/list.json'
lister_subdir = 'packagist'
good_api_response_file = 'data/https_packagist.org/packages_list.json'
PAGE = "https://packagist.org/packages/list.json"
lister_subdir = "packagist"
good_api_response_file = "data/https_packagist.org/packages_list.json"
entries = 5
@requests_mock.Mocker()
@ -52,40 +54,41 @@ class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase):
"""
fl = self.get_fl()
model = fl.transport_response_simplified(['0099ff/dialogflowphp'])
model = fl.transport_response_simplified(["0099ff/dialogflowphp"])
assert len(model) == 1
for key, values in model[0].items():
assert values == expected_model[key]
@patch('swh.lister.packagist.lister.utils.create_task_dict')
@patch("swh.lister.packagist.lister.utils.create_task_dict")
def test_task_dict(self, mock_create_tasks):
"""Test the task creation of lister
"""
fl = self.get_fl()
fl.task_dict(origin_type='packagist', origin_url='https://abc',
name='test_pack')
fl.task_dict(
origin_type="packagist", origin_url="https://abc", name="test_pack"
)
mock_create_tasks.assert_called_once_with(
'load-packagist', 'recurring', 'test_pack', 'https://abc',
retries_left=3)
"load-packagist", "recurring", "test_pack", "https://abc", retries_left=3
)
def test_compute_package_url():
expected_url = 'https://repo.packagist.org/p/hello.json'
actual_url = compute_package_url('hello')
expected_url = "https://repo.packagist.org/p/hello.json"
actual_url = compute_package_url("hello")
assert actual_url == expected_url
def test_packagist_lister(lister_packagist, requests_mock_datadir):
lister_packagist.run()
r = lister_packagist.scheduler.search_tasks(task_type='load-packagist')
r = lister_packagist.scheduler.search_tasks(task_type="load-packagist")
assert len(r) == 5
for row in r:
assert row['type'] == 'load-packagist'
assert row["type"] == "load-packagist"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 2
package = args[0]
@ -95,8 +98,8 @@ def test_packagist_lister(lister_packagist, requests_mock_datadir):
assert url == expected_url
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert kwargs == {}
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None

View file

@ -6,22 +6,20 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.packagist.tasks.ping')
res = swh_app.send_task("swh.lister.packagist.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.packagist.tasks.PackagistLister')
@patch("swh.lister.packagist.tasks.PackagistLister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked PackagistLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.packagist.tasks.PackagistListerTask')
res = swh_app.send_task("swh.lister.packagist.tasks.PackagistListerTask")
assert res
res.wait()
assert res.successful()

View file

@ -7,7 +7,8 @@ def register():
from .models import PhabricatorModel
from .lister import PhabricatorLister
return {'models': [PhabricatorModel],
'lister': PhabricatorLister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [PhabricatorModel],
"lister": PhabricatorLister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -21,11 +21,10 @@ logger = logging.getLogger(__name__)
class PhabricatorLister(IndexingHttpLister):
PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s'
DEFAULT_URL = \
'https://forge.softwareheritage.org/api/diffusion.repository.search'
PATH_TEMPLATE = "?order=oldest&attachments[uris]=1&after=%s"
DEFAULT_URL = "https://forge.softwareheritage.org/api/diffusion.repository.search"
MODEL = PhabricatorModel
LISTER_NAME = 'phabricator'
LISTER_NAME = "phabricator"
def __init__(self, url=None, instance=None, override_config=None):
super().__init__(url=url, override_config=override_config)
@ -48,11 +47,14 @@ class PhabricatorLister(IndexingHttpLister):
creds = self.request_instance_credentials()
if not creds:
raise ValueError(
'Phabricator forge needs authentication credential to list.')
api_token = random.choice(creds)['password']
"Phabricator forge needs authentication credential to list."
)
api_token = random.choice(creds)["password"]
return {'headers': self.request_headers() or {},
'params': {'api.token': api_token}}
return {
"headers": self.request_headers() or {},
"params": {"api.token": api_token},
}
def request_headers(self):
"""
@ -60,39 +62,39 @@ class PhabricatorLister(IndexingHttpLister):
Phabricator API
"""
headers = super().request_headers()
headers['Accept'] = 'application/json'
headers["Accept"] = "application/json"
return headers
def get_model_from_repo(
self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]:
url = get_repo_url(repo['attachments']['uris']['uris'])
def get_model_from_repo(self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]:
url = get_repo_url(repo["attachments"]["uris"]["uris"])
if url is None:
return None
return {
'uid': url,
'indexable': repo['id'],
'name': repo['fields']['shortName'],
'full_name': repo['fields']['name'],
'html_url': url,
'origin_url': url,
'origin_type': repo['fields']['vcs'],
'instance': self.instance,
"uid": url,
"indexable": repo["id"],
"name": repo["fields"]["shortName"],
"full_name": repo["fields"]["name"],
"html_url": url,
"origin_url": url,
"origin_type": repo["fields"]["vcs"],
"instance": self.instance,
}
def get_next_target_from_response(
self, response: Response) -> Optional[int]:
body = response.json()['result']['cursor']
if body['after'] and body['after'] != 'null':
return int(body['after'])
def get_next_target_from_response(self, response: Response) -> Optional[int]:
body = response.json()["result"]["cursor"]
if body["after"] and body["after"] != "null":
return int(body["after"])
return None
def transport_response_simplified(
self, response: Response) -> List[Optional[Dict[str, Any]]]:
self, response: Response
) -> List[Optional[Dict[str, Any]]]:
repos = response.json()
if repos['result'] is None:
if repos["result"] is None:
raise ValueError(
'Problem during information fetch: %s' % repos['error_code'])
repos = repos['result']['data']
"Problem during information fetch: %s" % repos["error_code"]
)
repos = repos["result"]["data"]
return [self.get_model_from_repo(repo) for repo in repos]
def filter_before_inject(self, models_list):
@ -103,8 +105,7 @@ class PhabricatorLister(IndexingHttpLister):
models_list = [m for m in models_list if m is not None]
return super().filter_before_inject(models_list)
def disable_deleted_repo_tasks(
self, index: int, next_index: int, keep_these: str):
def disable_deleted_repo_tasks(self, index: int, next_index: int, keep_these: str):
"""
(Overrides) Fix provided index value to avoid:
@ -113,7 +114,7 @@ class PhabricatorLister(IndexingHttpLister):
"""
# First call to the Phabricator API uses an empty 'after' parameter,
# so set the index to 0 to avoid database query error
if index == '':
if index == "":
index = 0
# Next listed repository ids are strictly greater than the 'after'
# parameter, so increment the index to avoid disabling the latest
@ -121,8 +122,7 @@ class PhabricatorLister(IndexingHttpLister):
# the Phabricator API
else:
index = index + 1
return super().disable_deleted_repo_tasks(index, next_index,
keep_these)
return super().disable_deleted_repo_tasks(index, next_index, keep_these)
def db_first_index(self) -> Optional[int]:
"""
@ -172,19 +172,18 @@ def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[int]:
"""
processed_urls = defaultdict(dict) # type: Dict[str, Any]
for uri in attachments:
protocol = uri['fields']['builtin']['protocol']
url = uri['fields']['uri']['effective']
identifier = uri['fields']['builtin']['identifier']
if protocol in ('http', 'https'):
protocol = uri["fields"]["builtin"]["protocol"]
url = uri["fields"]["uri"]["effective"]
identifier = uri["fields"]["builtin"]["identifier"]
if protocol in ("http", "https"):
processed_urls[protocol][identifier] = url
elif protocol is None:
for protocol in ('https', 'http'):
for protocol in ("https", "http"):
if url.startswith(protocol):
processed_urls[protocol]['undefined'] = url
processed_urls[protocol]["undefined"] = url
break
for protocol in ['https', 'http']:
for identifier in ['shortname', 'callsign', 'id', 'undefined']:
if (protocol in processed_urls and
identifier in processed_urls[protocol]):
for protocol in ["https", "http"]:
for identifier in ["shortname", "callsign", "id", "undefined"]:
if protocol in processed_urls and identifier in processed_urls[protocol]:
return processed_urls[protocol][identifier]
return None
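
A note on the selection logic above: https is preferred over http, and within a protocol the shortname URI wins over callsign, id and finally undefined. A minimal sketch with hypothetical URIs (forge.example is made up; the field layout is exactly what get_repo_url reads):

    uris = [
        # http/callsign candidate
        {"fields": {"builtin": {"protocol": "http", "identifier": "callsign"},
                    "uri": {"effective": "http://forge.example/diffusion/TEST/"}}},
        # https/shortname candidate, which takes precedence
        {"fields": {"builtin": {"protocol": "https", "identifier": "shortname"},
                    "uri": {"effective": "https://forge.example/source/test.git"}}},
    ]
    assert get_repo_url(uris) == "https://forge.example/source/test.git"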

View file

@ -9,7 +9,8 @@ from swh.lister.core.models import IndexingModelBase
class PhabricatorModel(IndexingModelBase):
"""a Phabricator repository"""
__tablename__ = 'phabricator_repo'
__tablename__ = "phabricator_repo"
uid = Column(String, primary_key=True)
indexable = Column(Integer, index=True)

View file

@ -7,12 +7,12 @@ from celery import shared_task
from swh.lister.phabricator.lister import PhabricatorLister
@shared_task(name=__name__ + '.FullPhabricatorLister')
@shared_task(name=__name__ + ".FullPhabricatorLister")
def list_phabricator_full(**lister_args):
"""Full update of a Phabricator instance"""
return PhabricatorLister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@ -10,17 +10,12 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_phabricator(swh_listers):
lister = swh_listers['phabricator']
lister = swh_listers["phabricator"]
# Amend the credentials
lister.config = {
'cache_responses': False,
'credentials': {
'phabricator': {
lister.instance: [{
'password': 'foo'
}]
}}
"cache_responses": False,
"credentials": {"phabricator": {lister.instance: [{"password": "foo"}]}},
}
return lister

View file

@ -21,12 +21,11 @@ logger = logging.getLogger(__name__)
class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
Lister = PhabricatorLister
# first request will have the after parameter empty
test_re = re.compile(r'\&after=([^?&]*)')
lister_subdir = 'phabricator'
good_api_response_file = 'data/api_first_response.json'
good_api_response_undefined_protocol = \
'data/api_response_undefined_protocol.json'
bad_api_response_file = 'data/api_empty_response.json'
test_re = re.compile(r"\&after=([^?&]*)")
lister_subdir = "phabricator"
good_api_response_file = "data/api_first_response.json"
good_api_response_undefined_protocol = "data/api_response_undefined_protocol.json"
bad_api_response_file = "data/api_empty_response.json"
# first_index must be retrieved through a bootstrap process for Phabricator
first_index = None
last_index = 12
@ -40,7 +39,7 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
"""
m = self.test_re.search(request.path_url)
idx = m.group(1)
if idx not in ('', 'None'):
if idx not in ("", "None"):
return int(idx)
def get_fl(self, override_config=None):
@ -48,41 +47,42 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
"""
if override_config or self.fl is None:
credentials = {'phabricator': {'fake': [
{'password': 'toto'}
]}}
override_config = dict(credentials=credentials,
**(override_config or {}))
self.fl = self.Lister(url='https://fakeurl', instance='fake',
override_config=override_config)
credentials = {"phabricator": {"fake": [{"password": "toto"}]}}
override_config = dict(credentials=credentials, **(override_config or {}))
self.fl = self.Lister(
url="https://fakeurl", instance="fake", override_config=override_config
)
self.fl.INITIAL_BACKOFF = 1
self.fl.reset_backoff()
return self.fl
def test_get_repo_url(self):
f = open('swh/lister/%s/tests/%s' % (self.lister_subdir,
self.good_api_response_file))
f = open(
"swh/lister/%s/tests/%s" % (self.lister_subdir, self.good_api_response_file)
)
api_response = json.load(f)
repos = api_response['result']['data']
repos = api_response["result"]["data"]
for repo in repos:
self.assertEqual(
'https://forge.softwareheritage.org/source/%s.git' %
(repo['fields']['shortName']),
get_repo_url(repo['attachments']['uris']['uris']))
"https://forge.softwareheritage.org/source/%s.git"
% (repo["fields"]["shortName"]),
get_repo_url(repo["attachments"]["uris"]["uris"]),
)
f = open('swh/lister/%s/tests/%s' %
(self.lister_subdir,
self.good_api_response_undefined_protocol))
f = open(
"swh/lister/%s/tests/%s"
% (self.lister_subdir, self.good_api_response_undefined_protocol)
)
repo = json.load(f)
self.assertEqual(
'https://svn.blender.org/svnroot/bf-blender/',
get_repo_url(repo['attachments']['uris']['uris']))
"https://svn.blender.org/svnroot/bf-blender/",
get_repo_url(repo["attachments"]["uris"]["uris"]),
)
@requests_mock.Mocker()
def test_scheduled_tasks(self, http_mocker):
self.scheduled_tasks_test('data/api_next_response.json', 23,
http_mocker)
self.scheduled_tasks_test("data/api_next_response.json", 23, http_mocker)
@requests_mock.Mocker()
def test_scheduled_tasks_multiple_instances(self, http_mocker):
@ -92,19 +92,14 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
# list first Phabricator instance
fl.run()
fl.instance = 'other_fake'
fl.config['credentials'] = {
'phabricator': {
'other_fake': [{
'password': 'foo'
}]
}
fl.instance = "other_fake"
fl.config["credentials"] = {
"phabricator": {"other_fake": [{"password": "foo"}]}
}
# list second Phabricator instance hosting repositories having
# same ids as those listed from the first instance
self.good_api_response_file = \
'data/api_first_response_other_instance.json'
self.good_api_response_file = "data/api_first_response_other_instance.json"
self.last_index = 13
fl.run()
@ -113,28 +108,28 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
# check tasks are not disabled
for task in self.scheduler_tasks:
self.assertTrue(task['status'] != 'disabled')
self.assertTrue(task["status"] != "disabled")
def test_phabricator_lister(lister_phabricator, requests_mock_datadir):
lister = lister_phabricator
assert lister.url == lister.DEFAULT_URL
assert lister.instance == 'forge.softwareheritage.org'
assert lister.instance == "forge.softwareheritage.org"
lister.run()
r = lister.scheduler.search_tasks(task_type='load-git')
r = lister.scheduler.search_tasks(task_type="load-git")
assert len(r) == 10
for row in r:
assert row['type'] == 'load-git'
assert row["type"] == "load-git"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
url = kwargs['url']
kwargs = row["arguments"]["kwargs"]
url = kwargs["url"]
assert lister.instance in url
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None

View file

@ -4,9 +4,8 @@
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.phabricator.tasks.ping')
res = swh_app.send_task("swh.lister.phabricator.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"

View file

@ -7,7 +7,8 @@ def register():
from .models import PyPIModel
from .lister import PyPILister
return {'models': [PyPIModel],
'lister': PyPILister,
'task_modules': ['%s.tasks' % __name__],
}
return {
"models": [PyPIModel],
"lister": PyPILister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -18,12 +18,12 @@ from requests import Response
class PyPILister(ListerOnePageApiTransport, SimpleLister):
MODEL = PyPIModel
LISTER_NAME = 'pypi'
PAGE = 'https://pypi.org/simple/'
instance = 'pypi' # As of today only the main pypi.org is used
LISTER_NAME = "pypi"
PAGE = "https://pypi.org/simple/"
instance = "pypi" # As of today only the main pypi.org is used
def __init__(self, override_config=None):
ListerOnePageApiTransport .__init__(self)
ListerOnePageApiTransport.__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def task_dict(self, origin_type: str, origin_url: str, **kwargs):
@ -33,17 +33,16 @@ class PyPILister(ListerOnePageApiTransport, SimpleLister):
needed for the ingestion task creation.
"""
_type = 'load-%s' % origin_type
_policy = kwargs.get('policy', 'recurring')
return utils.create_task_dict(
_type, _policy, url=origin_url)
_type = "load-%s" % origin_type
_policy = kwargs.get("policy", "recurring")
return utils.create_task_dict(_type, _policy, url=origin_url)
def list_packages(self, response: Response) -> list:
"""(Override) List the actual pypi origins from the response.
"""
result = xmltodict.parse(response.content)
_packages = [p['#text'] for p in result['html']['body']['a']]
_packages = [p["#text"] for p in result["html"]["body"]["a"]]
random.shuffle(_packages)
return _packages
@ -51,7 +50,7 @@ class PyPILister(ListerOnePageApiTransport, SimpleLister):
"""Returns origin_url
"""
return 'https://pypi.org/project/%s/' % repo_name
return "https://pypi.org/project/%s/" % repo_name
def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]:
"""(Override) Transform from repository representation to model
@ -59,10 +58,10 @@ class PyPILister(ListerOnePageApiTransport, SimpleLister):
"""
origin_url = self.origin_url(repo_name)
return {
'uid': origin_url,
'name': repo_name,
'full_name': repo_name,
'html_url': origin_url,
'origin_url': origin_url,
'origin_type': 'pypi',
"uid": origin_url,
"name": repo_name,
"full_name": repo_name,
"html_url": origin_url,
"origin_url": origin_url,
"origin_type": "pypi",
}
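
Unlike the packagist lister, whose tasks carry positional arguments, the task_dict above passes the origin URL as a single keyword argument; sketched below (shape assumed from create_task_dict plus the test assertions further down, with a hypothetical project URL):

    # Sketch only: "arrow" is a made-up example package.
    {
        "type": "load-pypi",
        "policy": "recurring",
        "arguments": {
            "args": [],
            "kwargs": {"url": "https://pypi.org/project/arrow/"},
        },
    }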

View file

@ -11,6 +11,7 @@ class PyPIModel(ModelBase):
"""a PyPI repository representation
"""
__tablename__ = 'pypi_repo'
__tablename__ = "pypi_repo"
uid = Column(String, primary_key=True)

View file

@ -7,12 +7,12 @@ from celery import shared_task
from .lister import PyPILister
@shared_task(name=__name__ + '.PyPIListerTask')
@shared_task(name=__name__ + ".PyPIListerTask")
def list_pypi(**lister_args):
'Full update of the PyPI (python) registry'
"Full update of the PyPI (python) registry"
return PyPILister(**lister_args).run()
@shared_task(name=__name__ + '.ping')
@shared_task(name=__name__ + ".ping")
def _ping():
return 'OK'
return "OK"

View file

@ -10,14 +10,16 @@ from swh.lister.core.tests.conftest import * # noqa
@pytest.fixture
def lister_pypi(swh_listers):
lister = swh_listers['pypi']
lister = swh_listers["pypi"]
# Add the load-pypi task type in the scheduler backend
lister.scheduler.create_task_type({
'type': 'load-pypi',
'description': 'Load PyPI package',
'backend_name': 'swh.loader.package.tasks.LoadPyPI',
'default_interval': '1 day',
})
lister.scheduler.create_task_type(
{
"type": "load-pypi",
"description": "Load PyPI package",
"backend_name": "swh.loader.package.tasks.LoadPyPI",
"default_interval": "1 day",
}
)
return lister

View file

@ -7,21 +7,21 @@
def test_pypi_lister(lister_pypi, requests_mock_datadir):
lister_pypi.run()
r = lister_pypi.scheduler.search_tasks(task_type='load-pypi')
r = lister_pypi.scheduler.search_tasks(task_type="load-pypi")
assert len(r) == 4
for row in r:
assert row['type'] == 'load-pypi'
assert row["type"] == "load-pypi"
# arguments check
args = row['arguments']['args']
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row['arguments']['kwargs']
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
origin_url = kwargs['url']
assert 'https://pypi.org/project' in origin_url
origin_url = kwargs["url"]
assert "https://pypi.org/project" in origin_url
assert row['policy'] == 'recurring'
assert row['priority'] is None
assert row["policy"] == "recurring"
assert row["priority"] is None

View file

@ -2,22 +2,20 @@ from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.pypi.tasks.ping')
res = swh_app.send_task("swh.lister.pypi.tasks.ping")
assert res
res.wait()
assert res.successful()
assert res.result == 'OK'
assert res.result == "OK"
@patch('swh.lister.pypi.tasks.PyPILister')
@patch("swh.lister.pypi.tasks.PyPILister")
def test_lister(lister, swh_app, celery_session_worker):
# setup the mocked PyPILister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.pypi.tasks.PyPIListerTask')
res = swh_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
assert res
res.wait()
assert res.successful()

View file

@ -15,7 +15,7 @@ from .test_utils import init_db
def test_get_lister_wrong_input():
"""Unsupported lister should raise"""
with pytest.raises(ValueError) as e:
get_lister('unknown', 'db-url')
get_lister("unknown", "db-url")
assert "Invalid lister" in str(e.value)
@ -37,23 +37,22 @@ def test_get_lister_override():
db_url = init_db().url()
listers = {
'gitlab': 'https://other.gitlab.uni/api/v4/',
'phabricator': 'https://somewhere.org/api/diffusion.repository.search',
'cgit': 'https://some.where/cgit',
"gitlab": "https://other.gitlab.uni/api/v4/",
"phabricator": "https://somewhere.org/api/diffusion.repository.search",
"cgit": "https://some.where/cgit",
}
# check the override ends up defined in the lister
for lister_name, url in listers.items():
lst = get_lister(
lister_name, db_url, **{
'url': url,
'priority': 'high',
'policy': 'oneshot',
})
lister_name,
db_url,
**{"url": url, "priority": "high", "policy": "oneshot",}
)
assert lst.url == url
assert lst.config['priority'] == 'high'
assert lst.config['policy'] == 'oneshot'
assert lst.config["priority"] == "high"
assert lst.config["policy"] == "oneshot"
# check the default urls are used and not the override (since it's not
# passed)
@ -61,7 +60,7 @@ def test_get_lister_override():
lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert 'url' not in lst.config
assert 'priority' not in lst.config
assert 'oneshot' not in lst.config
assert "url" not in lst.config
assert "priority" not in lst.config
assert "oneshot" not in lst.config
assert lst.url == lst.DEFAULT_URL

View file

@ -10,7 +10,6 @@ from swh.lister import utils
class UtilsTest(unittest.TestCase):
def test_split_range(self):
actual_ranges = list(utils.split_range(14, 5))
self.assertEqual(actual_ranges, [(0, 5), (5, 10), (10, 14)])
@ -33,6 +32,6 @@ def init_db():
db object to ease db manipulation
"""
initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args']
initdb_args = ' '.join([initdb_args, '-E UTF-8'])
initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"]
initdb_args = " ".join([initdb_args, "-E UTF-8"])
return Postgresql(initdb_args=initdb_args)

View file

@ -1,5 +1,5 @@
[tox]
envlist=flake8,mypy,py3
envlist=black,flake8,mypy,py3
[testenv]
extras =
@ -13,6 +13,13 @@ commands =
!dev: --cov={envsitepackagesdir}/swh/lister/ --cov-branch \
{envsitepackagesdir}/swh/lister/ {posargs}
[testenv:black]
skip_install = true
deps =
black
commands =
{envpython} -m black --check swh
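
With this environment in place, the formatting check can be reproduced locally by running tox -e black (which only needs the black dependency, hence skip_install = true), while running black swh without --check rewrites the files in place; both are standard tox and black invocations rather than anything specific to this repository.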
[testenv:flake8]
skip_install = true
deps =