Refactor lister code
Streamline production of new listers by aggressively moving core functionality into progressively inherited (A->B->C) base classes, with the transport layer abstracted. This should make common individual forge listers straightforward to produce with minimal customization. The GitHub and BitBucket listers can be used as examples of the indexing type.
parent a6e43f2777
commit 68d77fd43f
24 changed files with 8835 additions and 477 deletions
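To make the intent concrete before the diff: under the new layout, an indexing lister for a hypothetical forge should need little more than the sketch below. All names and endpoints here ("example.org", ExampleModel, ExampleLister, the JSON keys) are invented for illustration and are not part of this commit; compare with the real BitBucket files that follow.

from sqlalchemy import Column, String

from swh.lister.core.indexing_lister import SWHIndexingHttpLister
from swh.lister.core.models import ModelBase


class ExampleModel(ModelBase):
    """A repository on a hypothetical example.org forge."""
    __tablename__ = 'example_repos'

    uid = Column(String, primary_key=True)
    indexable = Column(String, index=True)


class ExampleLister(SWHIndexingHttpLister):
    PATH_TEMPLATE = '/repos?after=%s'  # hypothetical endpoint
    MODEL = ExampleModel

    def get_model_from_repo(self, repo):
        # map one API result dict onto the MODEL columns
        return {
            'uid': repo['id'],
            'indexable': repo['created'],
            'name': repo['name'],
            'full_name': repo['full_name'],
            'html_url': repo['web_url'],
            'origin_url': repo['clone_url'],
            'origin_type': 'git',
            'description': repo['description'],
        }

    def get_next_target_from_response(self, response):
        # index of the next page, if the server reported one
        return response.json().get('next_after')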
@@ -1,3 +1,8 @@
nose
requests
requests_mock
setuptools
SQLAlchemy
testing.postgresql
xmltodict
celery
swh/lister/bitbucket/lister.py  (new file, 36 lines)
@@ -0,0 +1,36 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from urllib import parse

from swh.lister.bitbucket.models import BitBucketModel
from swh.lister.core.indexing_lister import SWHIndexingHttpLister


class BitBucketLister(SWHIndexingHttpLister):
    PATH_TEMPLATE = '/repositories?after=%s'
    MODEL = BitBucketModel

    def get_model_from_repo(self, repo):
        return {
            'uid': repo['uuid'],
            'indexable': repo['created_on'],
            'name': repo['name'],
            'full_name': repo['full_name'],
            'html_url': repo['links']['html']['href'],
            'origin_url': repo['links']['clone'][0]['href'],
            'origin_type': repo['scm'],
            'description': repo['description']
        }

    def get_next_target_from_response(self, response):
        body = response.json()
        if 'next' in body:
            return parse.unquote(body['next'].split('after=')[1])
        else:
            return None

    def transport_response_simplified(self, response):
        repos = response.json()['values']
        return [self.get_model_from_repo(repo) for repo in repos]
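The pagination scheme is easiest to see on the 'next' link from the test fixture further down: BitBucket urlencodes the creation-timestamp index inside the URL, and get_next_target_from_response decodes it back. A standalone check:

from urllib import parse

body = {'next': 'https://api.bitbucket.org/2.0/repositories'
                '?after=2008-07-19T19%3A53%3A07.031845%2B00%3A00'}
print(parse.unquote(body['next'].split('after=')[1]))
# -> 2008-07-19T19:53:07.031845+00:00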
swh/lister/bitbucket/models.py  (new file, 15 lines)
@@ -0,0 +1,15 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from sqlalchemy import Column, String

from swh.lister.core.models import ModelBase


class BitBucketModel(ModelBase):
    """a BitBucket repository"""
    __tablename__ = 'bitbucket_repos'

    uid = Column(String, primary_key=True)
    indexable = Column(String, index=True)
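ModelBase itself is not included in this diff. Judging from the keys that get_model_from_repo produces and the fields lister_base.py touches (last_seen, task_id, origin_id), its shared columns plausibly look something like the sketch below; this is an inference, not the committed code.

from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base

SQLBase = declarative_base()


class ModelBase(SQLBase):
    __abstract__ = True  # shared columns only; no table of its own

    name = Column(String, index=True)       # repo short name
    full_name = Column(String, index=True)  # owner/name
    html_url = Column(String)                # web page of the repo
    origin_url = Column(String)              # clone/fetch URL
    origin_type = Column(String)             # 'git', 'hg', ...
    description = Column(String)
    last_seen = Column(DateTime, nullable=False, default=datetime.now)
    task_id = Column(Integer)                # swh-scheduler task
    origin_id = Column(Integer)              # swh-storage origin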
swh/lister/bitbucket/tasks.py  (new file, 28 lines)
@@ -0,0 +1,28 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
                                   IndexingRangeListerTask,
                                   IndexingRefreshListerTask, ListerTaskBase)

from .lister import BitBucketLister


class BitBucketListerTask(ListerTaskBase):
    def new_lister(self):
        return BitBucketLister(lister_name='bitbucket.com',
                               api_baseurl='https://api.bitbucket.org/2.0')


class IncrementalBitBucketLister(BitBucketListerTask,
                                 IndexingDiscoveryListerTask):
    task_queue = 'swh_lister_bitbucket_discover'


class RangeBitBucketLister(BitBucketListerTask, IndexingRangeListerTask):
    task_queue = 'swh_lister_bitbucket_refresh'


class FullBitBucketRelister(BitBucketListerTask, IndexingRefreshListerTask):
    task_queue = 'swh_lister_bitbucket_refresh'
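All three concrete classes follow one pattern: a forge-specific factory (BitBucketListerTask, contributing only new_lister) mixed with one generic behavior class from swh.lister.core.tasks. The core task classes are not part of this diff, so here is a toy, self-contained restatement of the composition pattern; every name below is illustrative.

class ListerTaskBase:
    def new_lister(self):
        raise NotImplementedError


class DiscoveryBehavior(ListerTaskBase):
    # forge-agnostic behavior: does all its work through new_lister()
    def run_task(self):
        return self.new_lister().run(min_index=None, max_index=None)


class FakeLister:
    def run(self, min_index=None, max_index=None):
        return 'listed %s..%s' % (min_index, max_index)


class ExampleTask(ListerTaskBase):
    # forge-specific factory only
    def new_lister(self):
        return FakeLister()


class IncrementalExampleLister(ExampleTask, DiscoveryBehavior):
    task_queue = 'swh_lister_example_discover'


print(IncrementalExampleLister().run_task())  # -> 'listed None..None'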
swh/lister/bitbucket/tests/api_empty_response.json  (new file, 4 lines)
@@ -0,0 +1,4 @@
{
  "pagelen": 10,
  "values": []
}
swh/lister/bitbucket/tests/api_response.json  (new file, 806 lines)
@@ -0,0 +1,806 @@
{
  "pagelen": 10,
  "values": [
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "app-template",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/commits"},
        "clone": [
          {"href": "https://bitbucket.org/bebac/app-template", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/bebac/app-template", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template"},
        "html": {"href": "https://bitbucket.org/bebac/app-template"},
        "avatar": {"href": "https://bitbucket.org/bebac/app-template/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{0cf80a6e-e91f-4a4c-a61b-8c8ff51ca3ec}",
      "language": "c++",
      "created_on": "2008-07-12T07:44:01.476818+00:00",
      "full_name": "bebac/app-template",
      "has_issues": true,
      "owner": {
        "username": "bebac",
        "display_name": "Benny Bach",
        "type": "user",
        "uuid": "{d1a83a2a-be1b-4034-8c1d-386a6690cddb}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/bebac"},
          "html": {"href": "https://bitbucket.org/bebac/"},
          "avatar": {"href": "https://bitbucket.org/account/bebac/avatar/32/"}
        }
      },
      "updated_on": "2011-10-05T15:36:19.409008+00:00",
      "size": 71548,
      "type": "repository",
      "slug": "app-template",
      "is_private": false,
      "description": "Basic files and directory structure for a C++ project. Intended as a starting point for a new project. Includes a basic cross platform core library."
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "mercurialeclipse",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/commits"},
        "clone": [
          {"href": "https://bitbucket.org/bastiand/mercurialeclipse", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/bastiand/mercurialeclipse", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse"},
        "html": {"href": "https://bitbucket.org/bastiand/mercurialeclipse"},
        "avatar": {"href": "https://bitbucket.org/bastiand/mercurialeclipse/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{f7a08670-bdd1-4465-aa97-7a5ce8d1a25b}",
      "language": "",
      "created_on": "2008-07-12T09:37:06.254721+00:00",
      "full_name": "bastiand/mercurialeclipse",
      "has_issues": false,
      "owner": {
        "username": "bastiand",
        "display_name": "Bastian Doetsch",
        "type": "user",
        "uuid": "{3742cd48-adad-4205-ab0d-04fc992a1728}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/bastiand"},
          "html": {"href": "https://bitbucket.org/bastiand/"},
          "avatar": {"href": "https://bitbucket.org/account/bastiand/avatar/32/"}
        }
      },
      "updated_on": "2011-09-17T02:36:59.062596+00:00",
      "size": 6445145,
      "type": "repository",
      "slug": "mercurialeclipse",
      "is_private": false,
      "description": "my own repo for MercurialEclipse."
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "sandboxpublic",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/commits"},
        "clone": [
          {"href": "https://bitbucket.org/aleax/sandboxpublic", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/aleax/sandboxpublic", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic"},
        "html": {"href": "https://bitbucket.org/aleax/sandboxpublic"},
        "avatar": {"href": "https://bitbucket.org/aleax/sandboxpublic/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{452c716c-a1ce-42bc-a95b-d38da49cbb37}",
      "language": "",
      "created_on": "2008-07-14T01:59:23.568048+00:00",
      "full_name": "aleax/sandboxpublic",
      "has_issues": true,
      "owner": {
        "username": "aleax",
        "display_name": "Alex Martelli",
        "type": "user",
        "uuid": "{1155d94d-fb48-43fe-a431-ec07c900b636}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/aleax"},
          "html": {"href": "https://bitbucket.org/aleax/"},
          "avatar": {"href": "https://bitbucket.org/account/aleax/avatar/32/"}
        }
      },
      "updated_on": "2012-06-22T21:55:28.753727+00:00",
      "size": 3120,
      "type": "repository",
      "slug": "sandboxpublic",
      "is_private": false,
      "description": "to help debug ACLs for private vs public bitbucket repos"
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "otrsfix-ng",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/commits"},
        "clone": [
          {"href": "https://bitbucket.org/adiakin/otrsfix-ng", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/adiakin/otrsfix-ng", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng"},
        "html": {"href": "https://bitbucket.org/adiakin/otrsfix-ng"},
        "avatar": {"href": "https://bitbucket.org/adiakin/otrsfix-ng/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{05b1b9dc-a7b6-46d6-ae1b-e66a17aa4f55}",
      "language": "",
      "created_on": "2008-07-15T06:14:39.306314+00:00",
      "full_name": "adiakin/otrsfix-ng",
      "has_issues": true,
      "owner": {
        "username": "adiakin",
        "display_name": "adiakin",
        "type": "user",
        "uuid": "{414012b5-1ac9-4096-9f46-8893cfa3cda5}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/adiakin"},
          "html": {"href": "https://bitbucket.org/adiakin/"},
          "avatar": {"href": "https://bitbucket.org/account/adiakin/avatar/32/"}
        }
      },
      "updated_on": "2016-06-02T18:56:34.868302+00:00",
      "size": 211631,
      "type": "repository",
      "slug": "otrsfix-ng",
      "is_private": false,
      "description": "OTRS greasemonkey extension"
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "pida-pypaned",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/commits"},
        "clone": [
          {"href": "https://bitbucket.org/aafshar/pida-pypaned", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/aafshar/pida-pypaned", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned"},
        "html": {"href": "https://bitbucket.org/aafshar/pida-pypaned"},
        "avatar": {"href": "https://bitbucket.org/aafshar/pida-pypaned/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{94cb830a-1784-4e51-9791-8f5cc93990a9}",
      "language": "",
      "created_on": "2008-07-16T22:47:38.682491+00:00",
      "full_name": "aafshar/pida-pypaned",
      "has_issues": true,
      "owner": {
        "username": "aafshar",
        "display_name": "Ali Afshar",
        "type": "user",
        "uuid": "{bcb87110-6a92-41fc-b95c-680feeea5512}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/aafshar"},
          "html": {"href": "https://bitbucket.org/aafshar/"},
          "avatar": {"href": "https://bitbucket.org/account/aafshar/avatar/32/"}
        }
      },
      "updated_on": "2012-06-22T21:55:42.451431+00:00",
      "size": 4680652,
      "type": "repository",
      "slug": "pida-pypaned",
      "is_private": false,
      "description": ""
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "TLOMM-testing",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/commits"},
        "clone": [
          {"href": "https://bitbucket.org/tgrimley/tlomm-testing", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/tgrimley/tlomm-testing", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing"},
        "html": {"href": "https://bitbucket.org/tgrimley/tlomm-testing"},
        "avatar": {"href": "https://bitbucket.org/tgrimley/tlomm-testing/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{95283ca1-f77e-40d6-b3ed-5bfae6ed2d15}",
      "language": "",
      "created_on": "2008-07-18T21:05:17.750587+00:00",
      "full_name": "tgrimley/tlomm-testing",
      "has_issues": true,
      "owner": {
        "username": "tgrimley",
        "display_name": "Thomas Grimley",
        "type": "user",
        "uuid": "{c958a08f-4669-4c77-81ec-4e2faa8ebf35}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/tgrimley"},
          "html": {"href": "https://bitbucket.org/tgrimley/"},
          "avatar": {"href": "https://bitbucket.org/account/tgrimley/avatar/32/"}
        }
      },
      "updated_on": "2012-06-22T21:55:46.627825+00:00",
      "size": 3128,
      "type": "repository",
      "slug": "tlomm-testing",
      "is_private": false,
      "description": "File related to testing functionality of TLOMM->TLOTTS transition"
    },
    {
      "scm": "hg",
      "website": "",
      "has_wiki": true,
      "name": "test",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/commits"},
        "clone": [
          {"href": "https://bitbucket.org/tingle/test", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/tingle/test", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test"},
        "html": {"href": "https://bitbucket.org/tingle/test"},
        "avatar": {"href": "https://bitbucket.org/tingle/test/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{457953ec-fe87-41b9-b659-94756fb40ece}",
      "language": "",
      "created_on": "2008-07-18T22:24:31.984981+00:00",
      "full_name": "tingle/test",
      "has_issues": true,
      "owner": {
        "username": "tingle",
        "display_name": "tingle",
        "type": "user",
        "uuid": "{dddce42b-bd19-417b-90ff-72cdbfb6eb7d}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/tingle"},
          "html": {"href": "https://bitbucket.org/tingle/"},
          "avatar": {"href": "https://bitbucket.org/account/tingle/avatar/32/"}
        }
      },
      "updated_on": "2012-06-22T21:55:49.860564+00:00",
      "size": 3090,
      "type": "repository",
      "slug": "test",
      "is_private": false,
      "description": ""
    },
    {
      "scm": "hg",
      "website": "http://shaze.myopenid.com/",
      "has_wiki": true,
      "name": "Repository",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/commits"},
        "clone": [
          {"href": "https://bitbucket.org/Shaze/repository", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/Shaze/repository", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository"},
        "html": {"href": "https://bitbucket.org/Shaze/repository"},
        "avatar": {"href": "https://bitbucket.org/Shaze/repository/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{3c0b8076-caef-465a-8d08-a184459f659b}",
      "language": "",
      "created_on": "2008-07-18T22:39:51.380134+00:00",
      "full_name": "Shaze/repository",
      "has_issues": true,
      "owner": {
        "username": "Shaze",
        "display_name": "Shaze",
        "type": "user",
        "uuid": "{f57817e9-bfe4-4c65-84dd-662152430323}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/Shaze"},
          "html": {"href": "https://bitbucket.org/Shaze/"},
          "avatar": {"href": "https://bitbucket.org/account/Shaze/avatar/32/"}
        }
      },
      "updated_on": "2012-06-22T21:55:51.570502+00:00",
      "size": 3052,
      "type": "repository",
      "slug": "repository",
      "is_private": false,
      "description": "Mine, all mine!"
    },
    {
      "scm": "hg",
      "website": "http://bitbucket.org/copiesofcopies/identifox/",
      "has_wiki": true,
      "name": "identifox",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/commits"},
        "clone": [
          {"href": "https://bitbucket.org/uncryptic/identifox", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/uncryptic/identifox", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox"},
        "html": {"href": "https://bitbucket.org/uncryptic/identifox"},
        "avatar": {"href": "https://bitbucket.org/uncryptic/identifox/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{78a1a080-a77e-4d0d-823a-b107484477a8}",
      "language": "",
      "created_on": "2008-07-19T00:33:14.065946+00:00",
      "full_name": "uncryptic/identifox",
      "has_issues": true,
      "owner": {
        "username": "uncryptic",
        "display_name": "Uncryptic Communications",
        "type": "user",
        "uuid": "{db87bb9a-9980-4840-bd4a-61f7748a56b4}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/uncryptic"},
          "html": {"href": "https://bitbucket.org/uncryptic/"},
          "avatar": {"href": "https://bitbucket.org/account/uncryptic/avatar/32/"}
        }
      },
      "updated_on": "2008-07-19T00:33:14+00:00",
      "size": 1918,
      "type": "repository",
      "slug": "identifox",
      "is_private": false,
      "description": "TwitterFox, modified to work with Identi.ca, including cosmetic and subtle code changes. For the most part, the code is nearly identical to the TwitterFox base: http://www.naan.net/trac/wiki/TwitterFox"
    },
    {
      "scm": "hg",
      "website": "http://rforce.rubyforge.org",
      "has_wiki": false,
      "name": "rforce",
      "links": {
        "watchers": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/watchers"},
        "branches": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/refs/branches"},
        "tags": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/refs/tags"},
        "commits": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/commits"},
        "clone": [
          {"href": "https://bitbucket.org/undees/rforce", "name": "https"},
          {"href": "ssh://hg@bitbucket.org/undees/rforce", "name": "ssh"}
        ],
        "self": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce"},
        "html": {"href": "https://bitbucket.org/undees/rforce"},
        "avatar": {"href": "https://bitbucket.org/undees/rforce/avatar/32/"},
        "hooks": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/hooks"},
        "forks": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/forks"},
        "downloads": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/downloads"},
        "pullrequests": {"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/pullrequests"}
      },
      "fork_policy": "allow_forks",
      "uuid": "{ec2ffee7-bfcd-4e95-83c8-22ac31e69fa3}",
      "language": "",
      "created_on": "2008-07-19T06:16:43.044743+00:00",
      "full_name": "undees/rforce",
      "has_issues": false,
      "owner": {
        "username": "undees",
        "display_name": "Ian Dees",
        "type": "user",
        "uuid": "{6ff66a34-6412-4f28-bf57-707a2a5c6d7b}",
        "links": {
          "self": {"href": "https://api.bitbucket.org/2.0/users/undees"},
          "html": {"href": "https://bitbucket.org/undees/"},
          "avatar": {"href": "https://bitbucket.org/account/undees/avatar/32/"}
        }
      },
      "updated_on": "2015-02-09T00:48:15.408680+00:00",
      "size": 338402,
      "type": "repository",
      "slug": "rforce",
      "is_private": false,
      "description": "A simple, usable binding to the SalesForce API."
    }
  ],
  "next": "https://api.bitbucket.org/2.0/repositories?after=2008-07-19T19%3A53%3A07.031845%2B00%3A00"
}
swh/lister/bitbucket/tests/test_bb_lister.py  (new file, 20 lines)
@@ -0,0 +1,20 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import unittest

from swh.lister.bitbucket.lister import BitBucketLister
from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase


class BitBucketListerTester(IndexingHttpListerTesterBase, unittest.TestCase):
    Lister = BitBucketLister
    test_re = re.compile(r'/repositories\?after=([^?&]+)')
    lister_subdir = 'bitbucket'
    good_api_response_file = 'api_response.json'
    bad_api_response_file = 'api_empty_response.json'
    first_index = '2008-07-12T07:44:01.476818+00:00'
    last_index = '2008-07-19T06:16:43.044743+00:00'
    entries_per_page = 10
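The tester base class lives in swh/lister/core/tests and is not part of this file, but the URL pattern above can be checked on its own. The path below is modeled on the 'next' link in the api_response.json fixture:

import re

test_re = re.compile(r'/repositories\?after=([^?&]+)')

# path_url as a mocked request would present it for the fixture's next link
path = '/2.0/repositories?after=2008-07-19T19%3A53%3A07.031845%2B00%3A00'
m = test_re.search(path)
assert m.group(1) == '2008-07-19T19%3A53%3A07.031845%2B00%3A00'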
swh/lister/core/abstractattribute.py  (new file, 24 lines)
@@ -0,0 +1,24 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


class AbstractAttribute:
    """AbstractAttributes in a base class must be overridden by the subclass.

    It's like the @abc.abstractmethod decorator, but for things that are
    explicitly attributes/properties, not methods, without the need for
    empty method def boilerplate. Like abc.abstractmethod, the class
    containing AbstractAttributes must inherit abc.ABC or use the
    abc.ABCMeta metaclass.

    Usage Example:
        import abc
        class ClassContainingAnAbstractAttribute(abc.ABC):
            foo = AbstractAttribute('descriptive docstring for foo')
    """
    __isabstractmethod__ = True

    def __init__(self, docstring=None):
        if docstring is not None:
            self.__doc__ = 'AbstractAttribute: ' + docstring
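A quick, runnable check of the contract described in the docstring: ABCMeta only looks for a truthy __isabstractmethod__ on class attributes, so an AbstractAttribute blocks instantiation until a subclass overrides it.

import abc

from swh.lister.core.abstractattribute import AbstractAttribute


class Base(abc.ABC):
    foo = AbstractAttribute('docstring for foo')


class Complete(Base):
    foo = 42  # any concrete value satisfies the requirement


# Base()              # raises TypeError: can't instantiate abstract class
print(Complete().foo)  # -> 42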
swh/lister/core/indexing_lister.py  (new file, 210 lines)
@@ -0,0 +1,210 @@
# Copyright (C) 2015-2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import logging

from sqlalchemy import func

from .lister_transports import SWHListerHttpTransport
from .lister_base import SWHListerBase


class SWHIndexingLister(SWHListerBase):
    """Lister* intermediate class for any service that follows the pattern:

    -- The service must report at least one stable unique identifier,
       known herein as the UID value, for every listed repository.
    -- If the service splits the list of repositories into sublists,
       it must report at least one stable and sorted index identifier
       for every listed repository, known herein as the indexable value,
       which can be used as part of the service endpoint query to request
       a sublist beginning from that index. This might be the UID if the
       UID is monotonic.
    -- Client sends a request to list repositories starting from a given
       index.
    -- Client receives structured (json/xml/etc) response with information
       about a sequential series of repositories starting from that index
       and, if necessary/available, some indication of the URL or index
       for fetching the next series of repository data.

    * - See swh.lister.core.lister_base.SWHListerBase for more details.

    This class cannot be instantiated. To create a new Lister for a source
    code listing service that follows the model described above, you must
    subclass this class and provide the required overrides in addition to
    any unmet implementation/override requirements of this class's base.
    (see parent class and member docstrings for details)

    Required Overrides:
        def get_next_target_from_response
    """

    @abc.abstractmethod
    def get_next_target_from_response(self, response):
        """Find the next server endpoint identifier given the entire response.

        Implementation of this method depends on the server API spec
        and the shape of the network response object returned by the
        transport_request method.

        Args:
            response (transport response): response page from the server
        Returns:
            index of next page, possibly extracted from a next href url
        """
        pass

    # You probably don't need to override anything below this line.

    def filter_before_inject(self, models_list):
        """Overrides SWHListerBase.filter_before_inject

        Bounds query results by this Lister's set max_index.
        """
        models_list = [
            m for m in models_list
            if self.is_within_bounds(m['indexable'], None, self.max_index)
        ]
        return models_list

    def db_query_range(self, start, end):
        """Look in the db for a range of repositories with indexable
        values in the range [start, end]

        Args:
            start (model indexable type): start of desired indexable range
            end (model indexable type): end of desired indexable range
        Returns:
            a list of sqlalchemy.ext.declarative.declarative_base objects
            with indexable values within the given range
        """
        retlist = self.db_session.query(self.MODEL)
        if start is not None:
            retlist = retlist.filter(self.MODEL.indexable >= start)
        if end is not None:
            retlist = retlist.filter(self.MODEL.indexable <= end)
        return retlist

    def db_partition_indices(self, partition_size):
        """Describe an index-space compartmentalization of the db table
        in equal sized chunks. This is used to describe min&max bounds for
        parallelizing fetch tasks.

        Args:
            partition_size (int): desired size to make each partition
        Returns:
            a list of tuples (begin, end) of indexable value that
            declare approximately equal-sized ranges of existing
            repos
        """
        n = self.db_num_entries()

        partitions = []
        partition_size = min(partition_size, n)
        prev_index = None
        for i in range(0, n-1, partition_size):
            # indexable column from the ith row
            index = self.db_session.query(self.MODEL.indexable) \
                .order_by(self.MODEL.indexable).offset(i).first()
            if index is not None and prev_index is not None:
                partitions.append((prev_index, index))
            prev_index = index

        partitions.append((prev_index, self.db_last_index()))
        return partitions

    def db_last_index(self):
        """Look in the db for the largest indexable value

        Returns:
            the largest indexable value of all repos in the db
        """
        t = self.db_session.query(func.max(self.MODEL.indexable)).first()
        if t:
            return t[0]
        else:
            return None

    def disable_deleted_repo_tasks(self, start, end, keep_these):
        """Disable tasks for repos that no longer exist between start and end.

        Args:
            start: beginning of range to disable
            end: end of range to disable
            keep_these (uid list): do not disable repos with uids in this list
        """
        if end is None:
            end = self.db_last_index()

        if not self.is_within_bounds(end, None, self.max_index):
            end = self.max_index

        deleted_repos = self.winnow_models(
            self.db_query_range(start, end), self.MODEL.uid, keep_these
        )
        tasks_to_disable = [repo for repo in deleted_repos
                            if repo.task_id is not None]
        if tasks_to_disable:
            self.scheduler.disable_tasks(tasks_to_disable)
        for repo in deleted_repos:
            repo.task_id = None

    def run(self, min_index=None, max_index=None):
        """Main entry function. Sequentially fetches repository data
        from the service according to the basic outline in the class
        docstring, continually fetching sublists until either there
        is no next index reference given or the given next index is greater
        than the desired max_index.

        Args:
            min_index (indexable type): optional index to start from
            max_index (indexable type): optional index to stop at
        Returns:
            nothing
        """
        index = min_index or ''
        loop_count = 0
        self.min_index = min_index
        self.max_index = max_index

        while self.is_within_bounds(index, self.min_index, self.max_index):
            logging.info('listing repos starting at %s' % index)

            response, injected_repos = self.ingest_data(index)

            next_index = self.get_next_target_from_response(response)

            # Determine if any repos were deleted, and disable their tasks.
            keep_these = [k for k in injected_repos.keys()]
            self.disable_deleted_repo_tasks(index, next_index, keep_these)

            # termination condition
            if (next_index is None) or (next_index == index):
                logging.info('stopping after index %s, no next link found' %
                             index)
                break
            else:
                index = next_index

            loop_count += 1
            if loop_count == 20:
                logging.info('flushing updates')
                loop_count = 0
                self.db_session.commit()
                self.db_session = self.mk_session()

        self.db_session.commit()
        self.db_session = self.mk_session()


class SWHIndexingHttpLister(SWHListerHttpTransport, SWHIndexingLister):
    """Convenience class for ensuring right lookup and init order
    when combining SWHIndexingLister and SWHListerHttpTransport."""
    def __init__(self, lister_name=None, api_baseurl=None,
                 override_config=None):
        SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
        SWHIndexingLister.__init__(self, lister_name=lister_name,
                                   override_config=override_config)
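The offset arithmetic in db_partition_indices is the subtle part of this file. Here is a standalone sketch of the same walk, with a sorted Python list standing in for the indexable column; this is illustrative only, not the committed code:

def partition_indices(indexables, partition_size):
    # mirrors db_partition_indices, reading from a sorted list instead of SQL
    n = len(indexables)
    partitions = []
    partition_size = min(partition_size, n)
    prev_index = None
    for i in range(0, n - 1, partition_size):
        index = indexables[i]  # the i-th row, in indexable order
        if index is not None and prev_index is not None:
            partitions.append((prev_index, index))
        prev_index = index
    partitions.append((prev_index, indexables[-1]))  # up to db_last_index()
    return partitions


print(partition_indices(list('abcdefghij'), 3))
# -> [('a', 'd'), ('d', 'g'), ('g', 'j')]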
492
swh/lister/core/lister_base.py
Normal file
492
swh/lister/core/lister_base.py
Normal file
|
@ -0,0 +1,492 @@
|
|||
# Copyright (C) 2015-2017 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import abc
|
||||
import gzip
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import create_engine, func
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from swh.core import config
|
||||
from swh.scheduler.backend import SchedulerBackend
|
||||
from swh.storage import get_storage
|
||||
|
||||
from .abstractattribute import AbstractAttribute
|
||||
|
||||
|
||||
class FetchError(RuntimeError):
|
||||
def __init__(self, response):
|
||||
self.response = response
|
||||
|
||||
def __str__(self):
|
||||
return repr(self.response)
|
||||
|
||||
|
||||
class SWHListerBase(abc.ABC, config.SWHConfig):
|
||||
"""Lister core base class.
|
||||
Generally a source code hosting service provides an API endpoint
|
||||
for listing the set of stored repositories. A Lister is the discovery
|
||||
service responsible for finding this list, all at once or sequentially
|
||||
by parts, and queueing local tasks to fetch and ingest the referenced
|
||||
repositories.
|
||||
|
||||
The core method in this class is ingest_data. Any subclasses should be
|
||||
calling this method one or more times to fetch and ingest data from API
|
||||
endpoints. See swh.lister.core.lister_base.SWHIndexingLister for
|
||||
example usage.
|
||||
|
||||
This class cannot be instantiated. Any instantiable Lister descending
|
||||
from SWHListerBase must provide at least the required overrides.
|
||||
(see member docstrings for details):
|
||||
|
||||
Required Overrides:
|
||||
MODEL
|
||||
def transport_request
|
||||
def transport_response_to_string
|
||||
def transport_response_simplified
|
||||
def transport_quota_check
|
||||
|
||||
Optional Overrides:
|
||||
def filter_before_inject
|
||||
def is_within_bounds
|
||||
"""
|
||||
|
||||
MODEL = AbstractAttribute('Subclass type (not instance)'
|
||||
' of swh.lister.core.models.ModelBase'
|
||||
' customized for a specific service.')
|
||||
|
||||
@abc.abstractmethod
|
||||
def transport_request(self, identifier):
|
||||
"""Given a target endpoint identifier to query, try once to request it.
|
||||
|
||||
Implementation of this method determines the network request protocol.
|
||||
|
||||
Args:
|
||||
identifier (string): unique identifier for an endpoint query.
|
||||
e.g. If the service indexes lists of repositories by date and
|
||||
time of creation, this might be that as a formatted string. Or
|
||||
it might be an integer UID. Or it might be nothing.
|
||||
It depends on what the service needs.
|
||||
Returns:
|
||||
the entire request response
|
||||
Raises:
|
||||
Will catch internal transport-dependent connection exceptions and
|
||||
raise swh.lister.core.lister_base.FetchError instead. Other
|
||||
non-connection exceptions should propogate unchanged.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def transport_response_to_string(self, response):
|
||||
"""Convert the server response into a formatted string for logging.
|
||||
|
||||
Implementation of this method depends on the shape of the network
|
||||
response object returned by the transport_request method.
|
||||
|
||||
Args:
|
||||
response: the server response
|
||||
Returns:
|
||||
a pretty string of the response
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def transport_response_simplified(self, response):
|
||||
"""Convert the server response into list of a dict for each repo in the
|
||||
response, mapping columns in the lister's MODEL class to repo data.
|
||||
|
||||
Implementation of this method depends on the server API spec and the
|
||||
shape of the network response object returned by the transport_request
|
||||
method.
|
||||
|
||||
Args:
|
||||
response: response object from the server.
|
||||
Returns:
|
||||
list of repo MODEL dicts
|
||||
( eg. [{'uid': r['id'], etc.} for r in response.json()] )
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def transport_quota_check(self, response):
|
||||
"""Check server response to see if we're hitting request rate limits.
|
||||
|
||||
Implementation of this method depends on the server communication
|
||||
protocol and API spec and the shape of the network response object
|
||||
returned by the transport_request method.
|
||||
|
||||
Args:
|
||||
response (session response): complete API query response
|
||||
Returns:
|
||||
1) must retry request? True/False
|
||||
2) seconds to delay if True
|
||||
"""
|
||||
pass
|
||||
|
||||
def filter_before_inject(self, models_list):
|
||||
"""Function run after transport_response_simplified but before injection
|
||||
into the local db and creation of workers. Can be used to eliminate
|
||||
some of the results if necessary.
|
||||
|
||||
MAY BE OVERRIDDEN if an intermediate Lister class needs to filter
|
||||
results before injection without requiring every child class to do so.
|
||||
|
||||
Args:
|
||||
models_list: list of dicts returned by
|
||||
transport_response_simplified.
|
||||
Returns:
|
||||
models_list with entries changed according to custom logic.
|
||||
"""
|
||||
pass
|
||||
|
||||
def is_within_bounds(self, inner, lower=None, upper=None):
|
||||
"""See if a sortable value is inside the range [lower,upper].
|
||||
|
||||
MAY BE OVERRIDDEN, for example if the server indexable* key is
|
||||
technically sortable but not automatically so.
|
||||
|
||||
* - ( see: swh.lister.core.indexing_lister.SWHIndexingLister )
|
||||
|
||||
Args:
|
||||
inner (sortable type): the value being checked
|
||||
lower (sortable type): optional lower bound
|
||||
upper (sortable type): optional upper bound
|
||||
Returns:
|
||||
whether inner is confined by the optional lower and upper bounds
|
||||
"""
|
||||
try:
|
||||
if lower is None and upper is None:
|
||||
return True
|
||||
elif lower is None:
|
||||
ret = inner <= upper
|
||||
elif upper is None:
|
||||
ret = inner >= lower
|
||||
else:
|
||||
ret = lower <= inner <= upper
|
||||
|
||||
self.string_pattern_check(inner, lower, upper)
|
||||
except Exception as e:
|
||||
logging.error(str(e) + ': %s, %s, %s' %
|
||||
(('inner=%s%s' % (type(inner), inner)),
|
||||
('lower=%s%s' % (type(lower), lower)),
|
||||
('upper=%s%s' % (type(upper), upper)))
|
||||
)
|
||||
raise
|
||||
|
||||
return ret
|
||||
|
||||
# You probably don't need to override anything below this line.
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
'storage': ('dict', {
|
||||
'cls': 'remote',
|
||||
'args': {
|
||||
'url': 'http://localhost:5002/'
|
||||
},
|
||||
}),
|
||||
'scheduling_db': ('str', 'dbname=softwareheritage-scheduler'),
|
||||
}
|
||||
|
||||
@property
|
||||
def CONFIG_BASE_FILENAME(self): # noqa: N802
|
||||
return 'lister-%s' % self.lister_name
|
||||
|
||||
@property
|
||||
def ADDITIONAL_CONFIG(self): # noqa: N802
|
||||
return {
|
||||
'lister_db_url':
|
||||
('str', 'postgresql:///lister-%s' % self.lister_name),
|
||||
'credentials':
|
||||
('list[dict]', []),
|
||||
'cache_responses':
|
||||
('bool', False),
|
||||
'cache_dir':
|
||||
('str', '~/.cache/swh/lister/%s' % self.lister_name),
|
||||
}
|
||||
|
||||
INITIAL_BACKOFF = 10
|
||||
MAX_RETRIES = 7
|
||||
CONN_SLEEP = 10
|
||||
|
||||
def __init__(self, lister_name=None, override_config=None):
|
||||
self.backoff = self.INITIAL_BACKOFF
|
||||
if lister_name is None:
|
||||
raise NameError("Every lister must be assigned a lister_name.")
|
||||
self.lister_name = lister_name # 'github?', 'bitbucket?', 'foo.com?'
|
||||
self.config = self.parse_config_file(
|
||||
base_filename=self.CONFIG_BASE_FILENAME,
|
||||
additional_configs=[self.ADDITIONAL_CONFIG]
|
||||
)
|
||||
self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
|
||||
if self.config['cache_responses']:
|
||||
config.prepare_folders(self.config, ['cache_dir'])
|
||||
|
||||
if override_config:
|
||||
self.config.update(override_config)
|
||||
|
||||
self.storage = get_storage(**self.config['storage'])
|
||||
self.scheduler = SchedulerBackend(
|
||||
scheduling_db=self.config['scheduling_db'],
|
||||
)
|
||||
self.db_engine = create_engine(self.config['lister_db_url'])
|
||||
self.mk_session = sessionmaker(bind=self.db_engine)
|
||||
self.db_session = self.mk_session()
|
||||
|
||||
def reset_backoff(self):
|
||||
"""Reset exponential backoff timeout to initial level."""
|
||||
self.backoff = self.INITIAL_BACKOFF
|
||||
|
||||
def back_off(self):
|
||||
"""Get next exponential backoff timeout."""
|
||||
ret = self.backoff
|
||||
self.backoff *= 10
|
||||
return ret
|
||||
|
||||
def safely_issue_request(self, identifier):
|
||||
"""Make network request with retries, rate quotas, and response logs.
|
||||
|
||||
Protocol is handled by the implementation of the transport_request
|
||||
method.
|
||||
|
||||
Args:
|
||||
identifier: resource identifier
|
||||
Returns:
|
||||
server response
|
||||
"""
|
||||
retries_left = self.MAX_RETRIES
|
||||
while retries_left > 0:
|
||||
try:
|
||||
r = self.transport_request(identifier)
|
||||
except FetchError:
|
||||
# network-level connection error, try again
|
||||
logging.warn('connection error on %s: sleep for %d seconds' %
|
||||
(identifier, self.CONN_SLEEP))
|
||||
time.sleep(self.CONN_SLEEP)
|
||||
retries_left -= 1
|
||||
continue
|
||||
|
||||
# detect throttling
|
||||
must_retry, delay = self.transport_quota_check(r)
|
||||
if must_retry:
|
||||
logging.warn('rate limited on %s: sleep for %f seconds' %
|
||||
(identifier, delay))
|
||||
time.sleep(delay)
|
||||
else: # request ok
|
||||
break
|
||||
|
||||
retries_left -= 1
|
||||
|
||||
if not retries_left:
|
||||
logging.warn('giving up on %s: max retries exceeded' % identifier)
|
||||
|
||||
do_cache = self.config['cache_responses']
|
||||
if do_cache:
|
||||
self.save_response(r)
|
||||
|
||||
return r
|
||||
|
||||
def db_query_equal(self, key, value):
|
||||
"""Look in the db for a row with key == value
|
||||
|
||||
Args:
|
||||
key: column key to look at
|
||||
value: value to look for in that column
|
||||
Returns:
|
||||
sqlalchemy.ext.declarative.declarative_base object
|
||||
with the given key == value
|
||||
"""
|
||||
if isinstance(key, str):
|
||||
key = self.MODEL.__dict__[key]
|
||||
return self.db_session.query(self.MODEL) \
|
||||
.filter(key == value).first()
|
||||
|
||||
def winnow_models(self, mlist, key, to_remove):
|
||||
"""Given a list of models, remove any with <key> matching
|
||||
some member of a list of values.
|
||||
|
||||
Args:
|
||||
mlist (list of model rows): the initial list of models
|
||||
key (column): the column to filter on
|
||||
to_remove (list): if anything in mlist has column <key> equal to
|
||||
one of the values in to_remove, it will be removed from the
|
||||
result
|
||||
Returns:
|
||||
A list of model rows starting from mlist minus any matching rows
|
||||
"""
|
||||
if isinstance(key, str):
|
||||
key = self.MODEL.__dict__[key]
|
||||
|
||||
if to_remove:
|
||||
return mlist.filter(~key.in_(to_remove)).all()
|
||||
else:
|
||||
return mlist.all()
|
||||
|
||||
def db_num_entries(self):
|
||||
"""Return the known number of entries in the lister db"""
|
||||
return self.db_session.query(func.count('*')).select_from(self.MODEL) \
|
||||
.scalar()
|
||||
|
||||
def db_inject_repo(self, model_dict):
|
||||
"""Add/update a new repo to the db and mark it last_seen now.
|
||||
|
||||
Args:
|
||||
model_dict: dictionary mapping model keys to values
|
||||
Returns:
|
||||
new or updated sqlalchemy.ext.declarative.declarative_base
|
||||
object associated with the injection
|
||||
"""
|
||||
sql_repo = self.db_query_equal('uid', model_dict['uid'])
|
||||
|
||||
if not sql_repo:
|
||||
sql_repo = self.MODEL(**model_dict)
|
||||
self.db_session.add(sql_repo)
|
||||
else:
|
||||
for k in model_dict:
|
||||
setattr(sql_repo, k, model_dict[k])
|
||||
sql_repo.last_seen = datetime.now()
|
||||
|
||||
return sql_repo
|
||||
|
||||
def origin_dict(self, origin_type, origin_url):
|
||||
"""Return special dict format for the origins list
|
||||
|
||||
Args:
|
||||
origin_type (string)
|
||||
origin_url (string)
|
||||
Returns:
|
||||
the same information in a different form
|
||||
"""
|
||||
return {
|
||||
'type': origin_type,
|
||||
'url': origin_url,
|
||||
}
|
||||
|
||||
def task_dict(self, origin_type, origin_url):
|
||||
"""Return special dict format for the tasks list
|
||||
|
||||
Args:
|
||||
origin_type (string)
|
||||
origin_url (string)
|
||||
Returns:
|
||||
the same information in a different form
|
||||
"""
|
||||
return {
|
||||
'type': 'origin-update-%s' % origin_type,
|
||||
'arguments': {
|
||||
'args': [
|
||||
origin_url,
|
||||
],
|
||||
'kwargs': {},
|
||||
},
|
||||
'next_run': datetime.now(),
|
||||
}
|
||||
|
||||
def string_pattern_check(self, a, b, c=None):
|
||||
"""When comparing indexable types in is_within_bounds, complex strings
|
||||
may not be allowed to differ in basic structure. If they do, it
|
||||
could be a sign of not understanding the data well. For instance,
|
||||
an ISO 8601 time string cannot be compared against its urlencoded
|
||||
equivalent, but this is an easy mistake to accidentally make. This
|
||||
method acts as a friendly sanity check.
|
||||
|
||||
Args:
|
||||
a (string): inner component of the is_within_bounds method
|
||||
b (string): lower component of the is_within_bounds method
|
||||
c (string): upper component of the is_within_bounds method
|
||||
Returns:
|
||||
nothing
|
||||
Raises:
|
||||
TypeError if strings a, b, and c don't conform to the same basic
|
||||
pattern.
|
||||
"""
|
||||
if isinstance(a, str):
|
||||
a_pattern = re.sub('[a-zA-Z0-9]',
|
||||
'[a-zA-Z0-9]',
|
||||
re.escape(a))
|
||||
if (isinstance(b, str) and (re.match(a_pattern, b) is None)
|
||||
or isinstance(c, str) and (re.match(a_pattern, c) is None)):
|
||||
logging.debug(a_pattern)
|
||||
raise TypeError('incomparable string patterns detected')
|
||||
|
||||
def inject_repo_data_into_db(self, models_list):
|
||||
"""Inject data into the db.
|
||||
|
||||
Args:
|
||||
models_list: list of dicts mapping keys from the db model
|
||||
for each repo to be injected
|
||||
Returns:
|
||||
dict of uid:sql_repo pairs
|
||||
"""
|
||||
injected_repos = {}
|
||||
for m in models_list:
|
||||
injected_repos[m['uid']] = self.db_inject_repo(m)
|
||||
return injected_repos
|
||||
|
||||
def create_missing_origins_and_tasks(self, models_list, injected_repos):
|
||||
"""Find any newly created db entries that don't yet have tasks or
|
||||
origin objects assigned.
|
||||
|
||||
Args:
|
||||
models_list: a list of dicts mapping keys in the db model for
|
||||
each repo
|
||||
injected_repos: dict of uid:sql_repo pairs that have just
|
||||
been created
|
||||
Returns:
|
||||
Nothing. Modifies injected_repos.
|
||||
"""
|
||||
for m in models_list:
|
||||
ir = injected_repos[m['uid']]
|
||||
if not ir.origin_id:
|
||||
ir.origin_id = self.storage.origin_add_one(
|
||||
self.origin_dict(m['origin_type'], m['origin_url'])
|
||||
)
|
||||
if not ir.task_id:
|
||||
ir.task_id = self.scheduler.create_tasks(
|
||||
[self.task_dict(m['origin_type'], m['origin_url'])]
|
||||
)['id']
|
||||
|
||||
    def ingest_data(self, identifier):
        """The core data fetch sequence. Request server endpoint. Simplify and
            filter response list of repositories. Inject repo information into
            local db. Queue loader tasks for linked repositories.

        Args:
            identifier: Resource identifier.
        Returns:
            the server response and the dict of uid:sql_repo pairs that
            were injected into the local db.
        """
        # Request (partial?) list of repositories info
        response = self.safely_issue_request(identifier)
        models_list = self.transport_response_simplified(response)
        models_list = self.filter_before_inject(models_list)
        # inject into local db
        injected = self.inject_repo_data_into_db(models_list)
        # queue workers
        self.create_missing_origins_and_tasks(models_list, injected)
        return response, injected

    def save_response(self, response):
        """Log the response from a server request to a cache dir.

        Args:
            response: full server response; the cache directory is read
                from self.config['cache_dir'].
        Returns:
            nothing
        """
        def escape_url_path(p):
            return p.replace('/', '__')

        fname = os.path.join(
            self.config['cache_dir'],
            escape_url_path(response.request.path_url) + '.gz'
        )
        with gzip.open(fname, 'w') as f:
            f.write(bytes(
                self.transport_response_to_string(response),
                'UTF-8'
            ))
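
Note on string_pattern_check() above: the regex trick keeps a string's punctuation while wildcarding its alphanumerics, so two strings only compare if their basic shape lines up. A standalone sketch (illustrative values, not from this commit):

import re

def skeleton(s):
    # re.escape the string, then map every alphanumeric to [a-zA-Z0-9],
    # leaving separators like ':' as literal anchors
    return re.sub('[a-zA-Z0-9]', '[a-zA-Z0-9]', re.escape(s))

print(bool(re.match(skeleton('aa:01'), 'aa:03')))    # True: same shape
print(bool(re.match(skeleton('aa:01'), 'aa%3A01')))  # False: urlencoded form differs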
97
swh/lister/core/lister_transports.py
Normal file

@@ -0,0 +1,97 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import random
from datetime import datetime
from email.utils import parsedate
from pprint import pformat

import requests
import xmltodict

from .abstractattribute import AbstractAttribute
from .lister_base import FetchError


class SWHListerHttpTransport(abc.ABC):
    """Use the Requests library for making Lister endpoint requests.

    To be used in conjunction with SWHListerBase or a subclass of it.
    """

    PATH_TEMPLATE = AbstractAttribute('string containing a python string'
                                      ' format pattern that produces the API'
                                      ' endpoint path for listing stored'
                                      ' repositories when given an index.'
                                      ' eg. "/repositories?after=%s".'
                                      ' To be implemented in the API-specific'
                                      ' class inheriting this.')

    def request_headers(self):
        """Returns dictionary of any request headers needed by the server.

        MAY BE OVERRIDDEN if request headers are needed.
        """
        return {}

    def transport_quota_check(self, response):
        """Implements SWHListerBase.transport_quota_check with standard 429
        code check for HTTP with Requests library.

        MAY BE OVERRIDDEN if the server notifies about rate limits in a
        non-standard way that doesn't use HTTP 429 and the Retry-After
        response header. ( https://tools.ietf.org/html/rfc6585#section-4 )
        """
        if response.status_code == 429:  # HTTP too many requests
            retry_after = response.headers.get('Retry-After', self.back_off())
            try:
                # might be seconds
                return True, float(retry_after)
            except:
                # might be http-date
                at_date = datetime(*parsedate(retry_after)[:6])
                from_now = (at_date - datetime.today()).total_seconds() + 5
                return True, max(0, from_now)
        else:  # response ok
            self.reset_backoff()
            return False, 0

    def __init__(self, api_baseurl=None):
        if not api_baseurl:
            raise NameError('HTTP Lister Transport requires api_baseurl.')
        self.api_baseurl = api_baseurl  # eg. 'https://api.github.com'
        self.session = requests.Session()

    def transport_request(self, identifier):
        """Implements SWHListerBase.transport_request for HTTP using Requests.
        """
        path = self.PATH_TEMPLATE % identifier
        params = {}
        params['headers'] = self.request_headers() or {}
        creds = self.config['credentials']
        auth = random.choice(creds) if creds else None
        if auth:
            params['auth'] = auth
        try:
            return self.session.get(self.api_baseurl + path, **params)
        except requests.exceptions.ConnectionError as e:
            raise FetchError(e)

    def transport_response_to_string(self, response):
        """Implements SWHListerBase.transport_response_to_string for HTTP
        given Requests responses.
        """
        s = pformat(response.request.path_url)
        s += '\n#\n' + pformat(response.status_code)
        s += '\n#\n' + pformat(response.headers)
        s += '\n#\n'
        try:  # json?
            s += pformat(response.json())
        except:  # not json
            try:  # xml?
                s += pformat(xmltodict.parse(response.text))
            except:  # not xml
                s += pformat(response.text)
        return s
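
Note on transport_quota_check() above: RFC 6585 allows Retry-After to carry either a number of seconds or an HTTP-date, which is why the float() attempt falls back to parsedate(). A standalone sketch of the two cases (example values only):

from datetime import datetime
from email.utils import parsedate

for retry_after in ('120', 'Fri, 31 Dec 1999 23:59:59 GMT'):
    try:
        delay = float(retry_after)  # plain seconds
    except ValueError:
        # HTTP-date: convert to a delay from now, padded by 5 seconds
        at_date = datetime(*parsedate(retry_after)[:6])
        delay = max(0, (at_date - datetime.today()).total_seconds() + 5)
    print(retry_after, '->', delay)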
67
swh/lister/core/models.py
Normal file

@@ -0,0 +1,67 @@
# Copyright (C) 2015-2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta

from .abstractattribute import AbstractAttribute

SQLBase = declarative_base()


class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta):
    pass


class ModelBase(SQLBase, metaclass=ABCSQLMeta):
    """a common repository"""
    __abstract__ = True
    __tablename__ = AbstractAttribute

    uid = AbstractAttribute('Column(<uid_type>, primary_key=True)')

    # The value used for sorting, segmenting, or api query paging,
    # because uids aren't always sequential.
    indexable = AbstractAttribute('Column(<indexable_type>, index=True)')

    name = Column(String, index=True)
    full_name = Column(String, index=True)
    html_url = Column(String)
    origin_url = Column(String)
    origin_type = Column(String)
    description = Column(String)

    last_seen = Column(DateTime, nullable=False)

    task_id = Column(Integer)
    origin_id = Column(Integer)

    def __init__(self, uid=None, indexable=None, name=None, full_name=None,
                 html_url=None, origin_url=None, origin_type=None,
                 description=None, task_id=None, origin_id=None):
        self.uid = uid
        self.last_seen = datetime.now()

        if indexable is not None:
            self.indexable = indexable
        if name is not None:
            self.name = name
        if full_name is not None:
            self.full_name = full_name
        if html_url is not None:
            self.html_url = html_url
        if origin_url is not None:
            self.origin_url = origin_url
        if origin_type is not None:
            self.origin_type = origin_type
        if description is not None:
            self.description = description

        if task_id is not None:
            self.task_id = task_id
        if origin_id is not None:
            self.origin_id = origin_id
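
Note on ModelBase above: a concrete model only has to pin down the two AbstractAttribute columns; everything else is inherited. A minimal sketch (the model name and table are hypothetical, not part of this commit):

from sqlalchemy import Column, String

from swh.lister.core.models import ModelBase


class ExampleForgeModel(ModelBase):
    """a repository on a hypothetical forge"""
    __tablename__ = 'example_forge_repos'

    uid = Column(String, primary_key=True)   # concretizes <uid_type>
    indexable = Column(String, index=True)   # concretizes <indexable_type>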
71
swh/lister/core/tasks.py
Normal file

@@ -0,0 +1,71 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import random

from celery import group
from celery.app.task import TaskType

from swh.scheduler.task import Task

from .abstractattribute import AbstractAttribute


class AbstractTaskMeta(abc.ABCMeta, TaskType):
    pass


class ListerTaskBase(Task, metaclass=AbstractTaskMeta):
    """Lister Tasks define the process of periodically requesting batches of
        repository information from source code hosting services. They
        instantiate Listers to do batches of work at periodic intervals.

        There are two main kinds of lister tasks:

        1) Discovering new repositories.
        2) Refreshing the list of already discovered repositories.

        If the hosting service is indexable (according to the requirements of
        SWHIndexingLister), then we can optionally partition the set of known
        repositories into sub-sets to distribute the work.

        This means that there is a third possible Task type for Indexing
        Listers:

        3) Discover or refresh a specific range of indices.
    """
    task_queue = AbstractAttribute('Celery Task queue name')

    @abc.abstractmethod
    def new_lister(self):
        """Return a new lister of the appropriate type.
        """
        pass

    @abc.abstractmethod
    def run(self):
        pass


class IndexingDiscoveryListerTask(ListerTaskBase):
    def run(self):
        lister = self.new_lister()
        lister.run(min_index=lister.db_last_index(), max_index=None)


class IndexingRangeListerTask(ListerTaskBase):
    def run(self, start, end):
        lister = self.new_lister()
        lister.run(min_index=start, max_index=end)


class IndexingRefreshListerTask(ListerTaskBase):
    GROUP_SPLIT = 10000

    def run(self):
        lister = self.new_lister()
        ranges = lister.db_partition_indices(self.GROUP_SPLIT)
        random.shuffle(ranges)
        range_task = IndexingRangeListerTask()
        group(range_task.s(minv, maxv) for minv, maxv in ranges)()
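
Note on the task classes above: a new forge plugs in with one mixin that supplies new_lister(), after which each concrete task only picks a queue name. A sketch under that assumption; "ExampleForgeLister", its module, and the queue names are hypothetical, not from this commit, but the GitHub tasks further down follow exactly this shape:

from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
                                   IndexingRangeListerTask,
                                   IndexingRefreshListerTask, ListerTaskBase)
from swh.lister.example.lister import ExampleForgeLister  # hypothetical


class ExampleForgeListerTask(ListerTaskBase):
    def new_lister(self):
        return ExampleForgeLister(lister_name='example.org',
                                  api_baseurl='https://api.example.org')


class IncrementalExampleForgeLister(ExampleForgeListerTask,
                                    IndexingDiscoveryListerTask):
    task_queue = 'swh_lister_example_discover'


class RangeExampleForgeLister(ExampleForgeListerTask, IndexingRangeListerTask):
    task_queue = 'swh_lister_example_refresh'


class FullExampleForgeRelister(ExampleForgeListerTask,
                               IndexingRefreshListerTask):
    task_queue = 'swh_lister_example_refresh'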
68
swh/lister/core/tests/test_abstractattribute.py
Normal file

@@ -0,0 +1,68 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import unittest

from nose.tools import istest

from swh.lister.core.abstractattribute import AbstractAttribute


class BaseClass(abc.ABC):
    v1 = AbstractAttribute
    v2 = AbstractAttribute()
    v3 = AbstractAttribute('changed docstring')
    v4 = 'qux'


class BadSubclass1(BaseClass):
    pass


class BadSubclass2(BaseClass):
    v1 = 'foo'
    v2 = 'bar'


class BadSubclass3(BaseClass):
    v2 = 'bar'
    v3 = 'baz'


class GoodSubclass(BaseClass):
    v1 = 'foo'
    v2 = 'bar'
    v3 = 'baz'


class TestAbstractAttributes(unittest.TestCase):
    @istest
    def test_aa(self):
        with self.assertRaises(TypeError):
            BaseClass()

        with self.assertRaises(TypeError):
            BadSubclass1()

        with self.assertRaises(TypeError):
            BadSubclass2()

        with self.assertRaises(TypeError):
            BadSubclass3()

        self.assertIsInstance(GoodSubclass(), GoodSubclass)
        gsc = GoodSubclass()

        self.assertEqual(gsc.v1, 'foo')
        self.assertEqual(gsc.v2, 'bar')
        self.assertEqual(gsc.v3, 'baz')
        self.assertEqual(gsc.v4, 'qux')

    @istest
    def test_aa_docstrings(self):
        self.assertEqual(BaseClass.v1.__doc__, AbstractAttribute.__doc__)
        self.assertEqual(BaseClass.v2.__doc__, AbstractAttribute.__doc__)
        self.assertEqual(BaseClass.v3.__doc__,
                         'AbstractAttribute: changed docstring')
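
Note: the abstractattribute module itself is elsewhere in this commit; a minimal implementation consistent with the tests above might look like the following (a sketch, not necessarily the committed code). The key is that abc's machinery treats anything exposing __isabstractmethod__ = True as abstract, so subclasses cannot be instantiated until every AbstractAttribute is overridden:

class AbstractAttribute:
    """AbstractAttributes in an abstract base class must be overridden
    by the inheriting class with concrete values."""
    __isabstractmethod__ = True  # reuse abc's abstract-member machinery

    def __init__(self, docstring=None):
        if docstring is not None:
            self.__doc__ = 'AbstractAttribute: ' + docstring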
229
swh/lister/core/tests/test_lister.py
Normal file

@@ -0,0 +1,229 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import time
from unittest import TestCase
from unittest.mock import Mock, patch

import requests_mock
import testing.postgresql
from nose.tools import istest
from sqlalchemy import create_engine

from swh.lister.core.abstractattribute import AbstractAttribute


def noop(*args, **kwargs):
    pass


@requests_mock.Mocker()
class IndexingHttpListerTesterBase(abc.ABC):
    """Base testing class for subclasses of
        swh.lister.core.indexing_lister.SWHIndexingHttpLister.

        See swh.lister.github.tests.test_gh_lister for an example of how to
        customize for a specific listing service.
    """
    Lister = AbstractAttribute('The lister class to test')
    test_re = AbstractAttribute('Compiled regex matching the server url. Must'
                                ' capture the index value.')
    lister_subdir = AbstractAttribute('bitbucket, github, etc.')
    good_api_response_file = AbstractAttribute('Example good response body')
    bad_api_response_file = AbstractAttribute('Example bad response body')
    first_index = AbstractAttribute('First index in good_api_response')
    last_index = AbstractAttribute('Last index in good_api_response')
    entries_per_page = AbstractAttribute('Number of results in good response')

    # May need to override this if the headers are used for something
    def response_headers(self, request):
        return {}

    # May need to override this if the server uses a non-standard rate
    # limiting method.
    # Please keep the requested retry delay reasonably low.
    def mock_rate_quota(self, n, request, context):
        self.rate_limit += 1
        context.status_code = 429
        context.headers['Retry-After'] = '1'
        return '{"error":"dummy"}'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.rate_limit = 1
        self.response = None
        self.fl = None
        self.helper = None
        if self.__class__ != IndexingHttpListerTesterBase:
            self.run = TestCase.run.__get__(self, self.__class__)
        else:
            self.run = noop

    def request_index(self, request):
        m = self.test_re.search(request.path_url)
        if m and (len(m.groups()) > 0):
            return m.group(1)
        else:
            return None

    def mock_response(self, request, context):
        self.fl.reset_backoff()
        self.rate_limit = 1
        context.status_code = 200
        custom_headers = self.response_headers(request)
        context.headers.update(custom_headers)
        if self.request_index(request) == str(self.first_index):
            with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
                                                  self.good_api_response_file),
                      'r') as r:
                return r.read()
        else:
            with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
                                                  self.bad_api_response_file),
                      'r') as r:
                return r.read()

    def mock_limit_n_response(self, n, request, context):
        self.fl.reset_backoff()
        if self.rate_limit <= n:
            return self.mock_rate_quota(n, request, context)
        else:
            return self.mock_response(request, context)

    def mock_limit_once_response(self, request, context):
        return self.mock_limit_n_response(1, request, context)

    def mock_limit_twice_response(self, request, context):
        return self.mock_limit_n_response(2, request, context)

    def get_fl(self, override_config=None):
        if override_config or self.fl is None:
            with patch(
                    'swh.scheduler.backend.SchedulerBackend.reconnect', noop
            ):
                self.fl = self.Lister(lister_name='fakelister',
                                      api_baseurl='https://fakeurl',
                                      override_config=override_config)
            self.fl.INITIAL_BACKOFF = 1

        self.fl.reset_backoff()
        return self.fl

    def get_api_response(self):
        fl = self.get_fl()
        if self.response is None:
            self.response = fl.safely_issue_request(self.first_index)
        return self.response

    @istest
    def test_is_within_bounds(self, http_mocker):
        fl = self.get_fl()
        self.assertFalse(fl.is_within_bounds(1, 2, 3))
        self.assertTrue(fl.is_within_bounds(2, 1, 3))
        self.assertTrue(fl.is_within_bounds(1, 1, 1))
        self.assertTrue(fl.is_within_bounds(1, None, None))
        self.assertTrue(fl.is_within_bounds(1, None, 2))
        self.assertTrue(fl.is_within_bounds(1, 0, None))
        self.assertTrue(fl.is_within_bounds("b", "a", "c"))
        self.assertFalse(fl.is_within_bounds("a", "b", "c"))
        self.assertTrue(fl.is_within_bounds("a", None, "c"))
        self.assertTrue(fl.is_within_bounds("a", None, None))
        self.assertTrue(fl.is_within_bounds("b", "a", None))
        self.assertFalse(fl.is_within_bounds("a", "b", None))
        self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03"))
        self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03"))
        with self.assertRaises(TypeError):
            fl.is_within_bounds(1.0, "b", None)
        with self.assertRaises(TypeError):
            fl.is_within_bounds("A:B", "A::B", None)

    @istest
    def test_api_request(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_limit_twice_response)
        with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock:
            self.get_api_response()
        self.assertEqual(sleepmock.call_count, 2)

    @istest
    def test_repos_list(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        li = self.get_fl().transport_response_simplified(
            self.get_api_response()
        )
        self.assertIsInstance(li, list)
        self.assertEqual(len(li), self.entries_per_page)

    @istest
    def test_model_map(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        fl = self.get_fl()
        li = fl.transport_response_simplified(self.get_api_response())
        di = li[0]
        self.assertIsInstance(di, dict)
        pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')]
        for k in pubs:
            if k not in ['last_seen', 'task_id', 'origin_id']:
                self.assertIn(k, di)

    def disable_storage_and_scheduler(self, fl):
        fl.create_missing_origins_and_tasks = Mock(return_value=None)

    def disable_db(self, fl):
        fl.winnow_models = Mock(return_value=[])
        fl.db_inject_repo = Mock(return_value=fl.MODEL())
        fl.disable_deleted_repo_tasks = Mock(return_value=None)

    @istest
    def test_fetch_none_nodb(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        fl = self.get_fl()

        self.disable_storage_and_scheduler(fl)
        self.disable_db(fl)

        fl.run(min_index=1, max_index=1)  # stores no results

    @istest
    def test_fetch_one_nodb(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        fl = self.get_fl()

        self.disable_storage_and_scheduler(fl)
        self.disable_db(fl)

        fl.run(min_index=self.first_index, max_index=self.first_index)

    @istest
    def test_fetch_multiple_pages_nodb(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        fl = self.get_fl()

        self.disable_storage_and_scheduler(fl)
        self.disable_db(fl)

        fl.run(min_index=self.first_index)

    def init_db(self, db, model):
        engine = create_engine(db.url())
        model.metadata.create_all(engine)

    @istest
    def test_fetch_multiple_pages_yesdb(self, http_mocker):
        http_mocker.get(self.test_re, text=self.mock_response)
        db = testing.postgresql.Postgresql()

        fl = self.get_fl(override_config={'lister_db_url': db.url()})
        self.init_db(db, fl.MODEL)

        self.disable_storage_and_scheduler(fl)

        fl.run(min_index=self.first_index)

        self.assertEqual(fl.db_last_index(), self.last_index)
        partitions = fl.db_partition_indices(5)
        self.assertGreater(len(partitions), 0)
        for k in partitions:
            self.assertLessEqual(len(k), 5)
            self.assertGreater(len(k), 0)
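
Note on the tester base above: it leans on two requests_mock features, matching URLs by compiled regex and generating responses from a callback that can also set status and headers. A self-contained sketch (hypothetical URL and payload):

import re

import requests
import requests_mock

url_re = re.compile(r'/repositories\?since=(\d+)')


def reply(request, context):
    # the context object lets the callback control status code and headers
    context.status_code = 200
    context.headers['X-Example'] = 'yes'
    return '[{"id": 1}]'


with requests_mock.Mocker() as m:
    m.get(url_re, text=reply)
    r = requests.get('https://fake.example/repositories?since=26')
    assert r.json() == [{'id': 1}]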
53
swh/lister/core/tests/test_model.py
Normal file

@@ -0,0 +1,53 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

from nose.tools import istest
from sqlalchemy import Column, Integer

from swh.lister.core.models import ModelBase


class BadSubclass1(ModelBase):
    __abstract__ = True
    pass


class BadSubclass2(ModelBase):
    __abstract__ = True
    __tablename__ = 'foo'


class BadSubclass3(BadSubclass2):
    __abstract__ = True
    pass


class GoodSubclass(BadSubclass2):
    uid = Column(Integer, primary_key=True)
    indexable = Column(Integer, index=True)


class TestModel(unittest.TestCase):
    @istest
    def test_model_instancing(self):
        with self.assertRaises(TypeError):
            ModelBase()

        with self.assertRaises(TypeError):
            BadSubclass1()

        with self.assertRaises(TypeError):
            BadSubclass2()

        with self.assertRaises(TypeError):
            BadSubclass3()

        self.assertIsInstance(GoodSubclass(), GoodSubclass)
        gsc = GoodSubclass(uid='uid', indexable='indexable')

        self.assertEqual(gsc.__tablename__, 'foo')
        self.assertEqual(gsc.uid, 'uid')
        self.assertEqual(gsc.indexable, 'indexable')
swh/lister/github/base.py

@@ -1,65 +0,0 @@
# Copyright (C) 2016 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.core import config
from swh.storage import get_storage
from swh.scheduler.backend import SchedulerBackend


# TODO: split this into a lister-agnostic module

class SWHLister(config.SWHConfig):
    CONFIG_BASE_FILENAME = None

    DEFAULT_CONFIG = {
        'storage': ('dict', {
            'cls': 'remote',
            'args': {
                'url': 'http://localhost:5002/'
            },
        }),
        'scheduling_db': ('str', 'dbname=softwareheritage-scheduler'),
    }

    ADDITIONAL_CONFIG = {}

    def __init__(self):
        self.config = self.parse_config_file(
            additional_configs=[self.ADDITIONAL_CONFIG])

        self.storage = get_storage(**self.config['storage'])

        self.scheduler = SchedulerBackend(
            scheduling_db=self.config['scheduling_db'],
        )

    def create_origins(self, origins):
        """Create the origins listed, and return their ids.

        Args:
            origins: a list of origins
        Returns:
            a list of origin ids
        """
        return self.storage.origin_add(origins)

    def create_tasks(self, tasks):
        """Create the tasks specified, and return their ids.

        Args:
            tasks (list of dict): a list of task specifications:
                type (str): the task type
                arguments (dict): the arguments for the task runner
                    args (list of str): arguments
                    kwargs (dict str -> str): keyword arguments
                next_run (datetime.datetime): when to schedule the next run
        Returns:
            a list of task ids
        """
        returned_tasks = self.scheduler.create_tasks(tasks)
        return [returned_task['id'] for returned_task in returned_tasks]

    def disable_tasks(self, task_ids):
        """Disable the tasks identified by the given ids"""
        self.scheduler.disable_tasks(task_ids)
swh/lister/github/lister.py

@@ -1,348 +1,50 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# Copyright (C) 2016 The Software Heritage developers
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# see https://developer.github.com/v3/ for GitHub API documentation

import datetime
import gzip
import logging
import os
import random
import re
import requests
import time

from pprint import pformat

from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker

from swh.core import config
from swh.lister.github.base import SWHLister
from swh.lister.github.db_utils import session_scope
from swh.lister.github.models import Repository
from swh.lister.core.indexing_lister import SWHIndexingHttpLister
from swh.lister.github.models import GitHubModel


GH_API_URL = 'https://api.github.com'
GH_REPO_URL_TEMPLATE = 'https://github.com/%s'
MAX_RETRIES = 7
MAX_SLEEP = 3600  # 1 hour
CONN_SLEEP = 10
class GitHubLister(SWHIndexingHttpLister):
    PATH_TEMPLATE = '/repositories?since=%d'
    MODEL = GitHubModel
    API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')

REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
    def get_model_from_repo(self, repo):
        return {
            'uid': repo['id'],
            'indexable': repo['id'],
            'name': repo['name'],
            'full_name': repo['full_name'],
            'html_url': repo['html_url'],
            'origin_url': repo['html_url'],
            'origin_type': 'git',
            'description': repo['description']
        }


class FetchError(RuntimeError):

    def __init__(self, response):
        self.response = response

    def __str__(self):
        return repr(self.response)


def save_http_response(r, cache_dir):
    def escape_url_path(p):
        return p.replace('/', '__')

    fname = os.path.join(cache_dir,
                         escape_url_path(r.request.path_url) + '.gz')
    with gzip.open(fname, 'w') as f:
        def emit(s):
            f.write(bytes(s, 'UTF-8'))
        emit(pformat(r.request.path_url))
        emit('\n#\n')
        emit(pformat(r.status_code))
        emit('\n#\n')
        emit(pformat(r.headers))
        emit('\n#\n')
        emit(pformat(r.json()))


def gh_api_request(path, username=None, password=None, session=None,
                   headers=None):
    params = {}

    if headers is None:
        headers = {}

    if 'Accept' not in headers:  # request version 3 of the API
        headers['Accept'] = 'application/vnd.github.v3+json'

    params['headers'] = headers
    if username is not None and password is not None:
        params['auth'] = (username, password)

    if session is None:
        session = requests.Session()

    retries_left = MAX_RETRIES
    while retries_left > 0:
        logging.debug('sending API request: %s' % path)
        try:
            r = session.get(GH_API_URL + path, **params)
        except requests.exceptions.ConnectionError:
            # network-level connection error, try again
            logging.warn('connection error upon %s: sleep for %d seconds' %
                         (path, CONN_SLEEP))
            time.sleep(CONN_SLEEP)
            retries_left -= 1
            continue

        if r.ok:  # all went well, do not retry
            break

        # detect throttling
        if r.status_code == 403 and \
                int(r.headers['X-RateLimit-Remaining']) == 0:
            delay = int(r.headers['X-RateLimit-Reset']) - time.time()
            delay = min(delay, MAX_SLEEP)
            logging.warn('rate limited upon %s: sleep for %d seconds' %
                         (path, int(delay)))
            time.sleep(delay)
        else:  # unexpected error, abort
            break

        retries_left -= 1

    if not retries_left:
        logging.warn('giving up on %s: max retries exceed' % path)

    return r


class GitHubLister(SWHLister):
    CONFIG_BASE_FILENAME = 'lister-github'
    ADDITIONAL_CONFIG = {
        'lister_db_url': ('str', 'postgresql:///lister-github'),
        'credentials': ('list[dict]', []),
        'cache_json': ('bool', False),
        'cache_dir': ('str', '~/.cache/swh/lister/github'),
    }

    def __init__(self, override_config=None):
        super().__init__()
        if override_config:
            self.config.update(override_config)

        self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
        if self.config['cache_json']:
            config.prepare_folders(self.config, ['cache_dir'])

        if not self.config['credentials']:
            raise ValueError('The GitHub lister needs credentials for API')

        self.db_engine = create_engine(self.config['lister_db_url'])
        self.mk_session = sessionmaker(bind=self.db_engine)

    def lookup_repo(self, repo_id, db_session=None):
        if not db_session:
            with session_scope(self.mk_session) as db_session:
                return self.lookup_repo(repo_id, db_session=db_session)

        return db_session.query(Repository) \
                         .filter(Repository.id == repo_id) \
                         .first()

    def query_range(self, start, end, db_session=None):
        if not db_session:
            with session_scope(self.mk_session) as db_session:
                return self.query_range(start, end, db_session=db_session)

        return db_session.query(Repository) \
                         .filter(Repository.id >= start) \
                         .filter(Repository.id <= end)

    def lookup_full_names(self, full_names, db_session=None):
        if not db_session:
            with session_scope(self.mk_session) as db_session:
                return self.lookup_full_names(full_names,
                                              db_session=db_session)

        return db_session.query(Repository) \
                         .filter(Repository.full_name.in_(full_names)) \
                         .all()

    def last_repo_id(self, db_session=None):
        if not db_session:
            with session_scope(self.mk_session) as db_session:
                return self.last_repo_id(db_session=db_session)

        t = db_session.query(func.max(Repository.id)).first()

        if t is not None:
            return t[0]

    INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description',
                   'fork']

    def inject_repo(self, repo, db_session=None):
        if not db_session:
            with session_scope(self.mk_session) as db_session:
                return self.inject_repo(repo, db_session=db_session)

        logging.debug('injecting repo %d' % repo['id'])
        sql_repo = self.lookup_repo(repo['id'], db_session)
        if not sql_repo:
            kwargs = {k: repo[k] for k in self.INJECT_KEYS if k in repo}
            sql_repo = Repository(**kwargs)
            db_session.add(sql_repo)
    def transport_quota_check(self, response):
        reqs_remaining = int(response.headers['X-RateLimit-Remaining'])
        if response.status_code == 403 and reqs_remaining == 0:
            reset_at = int(response.headers['X-RateLimit-Reset'])
            delay = min(reset_at - time.time(), 3600)
            return True, delay
        else:
            for k in self.INJECT_KEYS:
                if k in repo:
                    setattr(sql_repo, k, repo[k])
            sql_repo.last_seen = datetime.datetime.now()
            return False, 0

        return sql_repo
    def get_next_target_from_response(self, response):
        if 'next' in response.links:
            next_url = response.links['next']['url']
            return int(self.API_URL_INDEX_RE.match(next_url).group(1))
        else:
            return None

    @staticmethod
    def repo_to_origin(full_name):
        return {
            'type': 'git',
            'url': GH_REPO_URL_TEMPLATE % full_name,
        }
    def transport_response_simplified(self, response):
        repos = response.json()
        return [self.get_model_from_repo(repo) for repo in repos]

    @staticmethod
    def repo_to_task(full_name):
        return {
            'type': 'origin-update-git',
            'arguments': {
                'args': [
                    GH_REPO_URL_TEMPLATE % full_name,
                ],
                'kwargs': {},
            },
            'next_run': datetime.datetime.now(),
        }

    def fetch(self, min_id=None, max_id=None):
        if min_id is None:
            min_id = 1
        if max_id is None:
            max_id = float('inf')
        next_id = min_id

        do_cache = self.config['cache_json']
        cache_dir = self.config['cache_dir']

        session = requests.Session()
        db_session = self.mk_session()
        loop_count = 0
        while min_id <= next_id <= max_id:
            logging.info('listing repos starting at %d' % next_id)

            # github API ?since=... is '>' strict, not '>='
            since = next_id - 1

            cred = random.choice(self.config['credentials'])
            repos_res = gh_api_request('/repositories?since=%d' % since,
                                       session=session, **cred)

            if do_cache:
                save_http_response(repos_res, cache_dir)

            if not repos_res.ok:
                raise FetchError(repos_res)

            next_next_id = None
            if 'next' in repos_res.links:
                next_url = repos_res.links['next']['url']
                m = REPO_API_URL_RE.match(next_url)  # parse next_id
                next_next_id = int(m.group(1)) + 1

            repos = repos_res.json()
            mapped_repos = {}
            tasks = {}
            origins = {}
            repo_ids = set()
            for repo in repos:
                if repo['id'] > max_id:  # do not overstep max_id
                    break
                repo_ids.add(repo['id'])
                full_name = repo['full_name']
                mapped_repos[full_name] = self.inject_repo(repo, db_session)

            # Retrieve and reset task and origin ids from existing repos
            old_repos = self.lookup_full_names(list(mapped_repos.keys()),
                                               db_session=db_session)
            for old_repo in old_repos:
                full_name = old_repo.full_name
                if old_repo.task_id:
                    tasks[full_name] = old_repo.task_id
                    old_repo.task_id = None
                if old_repo.origin_id:
                    origins[full_name] = old_repo.origin_id
                    old_repo.origin_id = None

            # Create missing origins
            missing_origins = [
                full_name for full_name in sorted(mapped_repos)
                if full_name not in origins
            ]

            if missing_origins:
                new_origins = [
                    self.repo_to_origin(full_name)
                    for full_name in missing_origins
                ]
                new_origin_ids = self.create_origins(new_origins)
                origins.update(zip(missing_origins, new_origin_ids))

            for full_name, origin_id in origins.items():
                mapped_repos[full_name].origin_id = origin_id

            # Create missing tasks
            missing_tasks = [
                full_name for full_name in sorted(mapped_repos)
                if full_name not in tasks
            ]

            if missing_tasks:
                new_tasks = [
                    self.repo_to_task(full_name)
                    for full_name in missing_tasks
                ]
                new_task_ids = self.create_tasks(new_tasks)
                tasks.update(zip(missing_tasks, new_task_ids))

            for full_name, task_id in tasks.items():
                mapped_repos[full_name].task_id = task_id

            # Disable tasks for deleted repos
            if next_next_id is not None:
                start, end = next_id, min(max_id, next_next_id)
            else:
                start, end = next_id, max_id

            deleted_repos = self.query_range(start, end,
                                             db_session=db_session) \
                                .filter(~Repository.id.in_(repo_ids)) \
                                .all()

            if deleted_repos:
                tasks_to_disable = []
                for repo in deleted_repos:
                    if repo.task_id is not None:
                        tasks_to_disable.append(repo.task_id)
                        repo.task_id = None

                if tasks_to_disable:
                    self.disable_tasks(tasks_to_disable)

            if next_next_id is None:
                logging.info('stopping after id %d, no next link found' %
                             next_id)
                break
            else:
                next_id = next_next_id

            loop_count += 1
            if loop_count == 20:
                logging.info('flushing updates')
                loop_count = 0
                db_session.commit()
                db_session = self.mk_session()

        db_session.commit()
    def request_headers(self):
        return {'Accept': 'application/vnd.github.v3+json'}
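
Note on the new pagination above: Requests parses the RFC 5988 Link header into response.links, and API_URL_INDEX_RE pulls the integer index out of the rel="next" URL. A standalone illustration (the URL value mirrors the test fixture further down):

import re

API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')

links = {'next': {'url': 'https://api.github.com/repositories?since=367'}}
if 'next' in links:
    next_index = int(API_URL_INDEX_RE.match(links['next']['url']).group(1))
    assert next_index == 367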
swh/lister/github/models.py

@@ -1,51 +1,18 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime
from sqlalchemy import Column, Integer

from sqlalchemy import Column
from sqlalchemy import Boolean, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from swh.lister.core.models import ModelBase


SQLBase = declarative_base()


class Repository(SQLBase):

class GitHubModel(ModelBase):
    """a GitHub repository"""
    __tablename__ = 'github_repos'

    __tablename__ = 'repos'
    uid = Column(Integer, primary_key=True)
    indexable = Column(Integer, index=True)

    id = Column(Integer, primary_key=True)

    name = Column(String, index=True)
    full_name = Column(String, index=True)
    html_url = Column(String)
    description = Column(String)
    fork = Column(Boolean, index=True)

    last_seen = Column(DateTime, nullable=False)

    task_id = Column(Integer)
    origin_id = Column(Integer)

    def __init__(self, id, name=None, full_name=None, html_url=None,
                 description=None, fork=None, task_id=None, origin_id=None):
        self.id = id
        self.last_seen = datetime.now()
        if name is not None:
            self.name = name
        if full_name is not None:
            self.full_name = full_name
        if html_url is not None:
            self.html_url = html_url
        if description is not None:
            self.description = description
        if fork is not None:
            self.fork = fork
        if task_id is not None:
            self.task_id = task_id
        if origin_id is not None:
            self.origin_id = origin_id
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
swh/lister/github/tasks.py

@@ -1,47 +1,27 @@
# Copyright (C) 2016 the Software Heritage developers
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import random

from celery import group

from swh.scheduler.task import Task
from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
                                   IndexingRangeListerTask,
                                   IndexingRefreshListerTask, ListerTaskBase)

from .lister import GitHubLister

GROUP_SPLIT = 10000

class GitHubListerTask(ListerTaskBase):
    def new_lister(self):
        return GitHubLister(lister_name='github.com',
                            api_baseurl='https://github.com')


class IncrementalGitHubLister(Task):
    task_queue = 'swh_lister_github_incremental'

    def run(self):
        lister = GitHubLister()
        last_id = lister.last_repo_id()
        lister.fetch(min_id=last_id + 1, max_id=None)
class IncrementalGitHubLister(GitHubListerTask, IndexingDiscoveryListerTask):
    task_queue = 'swh_lister_github_discover'


class RangeGitHubLister(Task):
    task_queue = 'swh_lister_github_full'

    def run(self, start, end):
        lister = GitHubLister()
        lister.fetch(min_id=start, max_id=end)
class RangeGitHubLister(GitHubListerTask, IndexingRangeListerTask):
    task_queue = 'swh_lister_github_refresh'


class FullGitHubLister(Task):
    task_queue = 'swh_lister_github_full'

    def run(self):
        lister = GitHubLister()
        last_id = lister.last_repo_id()
        ranges = [
            (i, min(last_id, i + GROUP_SPLIT - 1))
            for i in range(1, last_id, GROUP_SPLIT)
        ]

        random.shuffle(ranges)

        range_task = RangeGitHubLister()
        group(range_task.s(min, max) for min, max in ranges)()
class FullGitHubRelister(GitHubListerTask, IndexingRefreshListerTask):
    task_queue = 'swh_lister_github_refresh'
1
swh/lister/github/tests/api_empty_response.json
Normal file

@@ -0,0 +1 @@
[]

6502
swh/lister/github/tests/api_response.json
Normal file

File diff suppressed because it is too large

46
swh/lister/github/tests/test_gh_lister.py
Normal file

@@ -0,0 +1,46 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import unittest
from datetime import datetime, timedelta

from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase
from swh.lister.github.lister import GitHubLister


class GitHubListerTester(IndexingHttpListerTesterBase, unittest.TestCase):
    Lister = GitHubLister
    test_re = re.compile(r'/repositories\?since=([^?&]+)')
    lister_subdir = 'github'
    good_api_response_file = 'api_response.json'
    bad_api_response_file = 'api_empty_response.json'
    first_index = 26
    last_index = 368
    entries_per_page = 100

    def response_headers(self, request):
        headers = {'X-RateLimit-Remaining': '1'}
        if self.request_index(request) == str(self.first_index):
            headers.update({
                'Link': '<https://api.github.com/repositories?since=367>;'
                        ' rel="next",'
                        '<https://api.github.com/repositories{?since}>;'
                        ' rel="first"'
            })
        else:
            headers.update({
                'Link': '<https://api.github.com/repositories{?since}>;'
                        ' rel="first"'
            })

        return headers

    def mock_rate_quota(self, n, request, context):
        self.rate_limit += 1
        context.status_code = 403
        context.headers['X-RateLimit-Remaining'] = '0'
        one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp())
        context.headers['X-RateLimit-Reset'] = str(one_second)
        return '{"error":"dummy"}'