Refactor lister code

Streamline production of new listers by aggressively moving core
functionality into progressively inherited (A->B->C) base classes
with the transport layer abstracted.
This should make common individual forge listers straightforward to
produce with minimal customization. Github and Bitbucket listers
can be used as examples of the indexing type.
This commit is contained in:
Avi Kelman (fiendish) 2017-03-06 11:34:37 +01:00
parent a6e43f2777
commit 68d77fd43f
24 changed files with 8835 additions and 477 deletions

View file

@ -1,3 +1,8 @@
nose
requests
requests_mock
setuptools
SQLAlchemy
testing.postgresql
xmltodict
celery

View file

@ -0,0 +1,36 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from urllib import parse
from swh.lister.bitbucket.models import BitBucketModel
from swh.lister.core.indexing_lister import SWHIndexingHttpLister
class BitBucketLister(SWHIndexingHttpLister):
PATH_TEMPLATE = '/repositories?after=%s'
MODEL = BitBucketModel
def get_model_from_repo(self, repo):
return {
'uid': repo['uuid'],
'indexable': repo['created_on'],
'name': repo['name'],
'full_name': repo['full_name'],
'html_url': repo['links']['html']['href'],
'origin_url': repo['links']['clone'][0]['href'],
'origin_type': repo['scm'],
'description': repo['description']
}
def get_next_target_from_response(self, response):
body = response.json()
if 'next' in body:
return parse.unquote(body['next'].split('after=')[1])
else:
return None
def transport_response_simplified(self, response):
repos = response.json()['values']
return [self.get_model_from_repo(repo) for repo in repos]

View file

@ -0,0 +1,15 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from swh.lister.core.models import ModelBase
class BitBucketModel(ModelBase):
"""a BitBucket repository"""
__tablename__ = 'bitbucket_repos'
uid = Column(String, primary_key=True)
indexable = Column(String, index=True)

View file

@ -0,0 +1,28 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
IndexingRangeListerTask,
IndexingRefreshListerTask, ListerTaskBase)
from .lister import BitBucketLister
class BitBucketListerTask(ListerTaskBase):
def new_lister(self):
return BitBucketLister(lister_name='bitbucket.com',
api_baseurl='https://api.bitbucket.org/2.0')
class IncrementalBitBucketLister(BitBucketListerTask,
IndexingDiscoveryListerTask):
task_queue = 'swh_lister_bitbucket_discover'
class RangeBitBucketLister(BitBucketListerTask, IndexingRangeListerTask):
task_queue = 'swh_lister_bitbucket_refresh'
class FullBitBucketRelister(BitBucketListerTask, IndexingRefreshListerTask):
task_queue = 'swh_lister_bitbucket_refresh'

View file

@ -0,0 +1,4 @@
{
"pagelen": 10,
"values": []
}

View file

@ -0,0 +1,806 @@
{
"pagelen": 10,
"values": [
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "app-template",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/commits"
},
"clone": [
{
"href": "https://bitbucket.org/bebac/app-template",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/bebac/app-template",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template"
},
"html": {
"href": "https://bitbucket.org/bebac/app-template"
},
"avatar": {
"href": "https://bitbucket.org/bebac/app-template/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/bebac/app-template/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{0cf80a6e-e91f-4a4c-a61b-8c8ff51ca3ec}",
"language": "c++",
"created_on": "2008-07-12T07:44:01.476818+00:00",
"full_name": "bebac/app-template",
"has_issues": true,
"owner": {
"username": "bebac",
"display_name": "Benny Bach",
"type": "user",
"uuid": "{d1a83a2a-be1b-4034-8c1d-386a6690cddb}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/bebac"
},
"html": {
"href": "https://bitbucket.org/bebac/"
},
"avatar": {
"href": "https://bitbucket.org/account/bebac/avatar/32/"
}
}
},
"updated_on": "2011-10-05T15:36:19.409008+00:00",
"size": 71548,
"type": "repository",
"slug": "app-template",
"is_private": false,
"description": "Basic files and directory structure for a C++ project. Intended as a starting point for a new project. Includes a basic cross platform core library."
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "mercurialeclipse",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/commits"
},
"clone": [
{
"href": "https://bitbucket.org/bastiand/mercurialeclipse",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/bastiand/mercurialeclipse",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse"
},
"html": {
"href": "https://bitbucket.org/bastiand/mercurialeclipse"
},
"avatar": {
"href": "https://bitbucket.org/bastiand/mercurialeclipse/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{f7a08670-bdd1-4465-aa97-7a5ce8d1a25b}",
"language": "",
"created_on": "2008-07-12T09:37:06.254721+00:00",
"full_name": "bastiand/mercurialeclipse",
"has_issues": false,
"owner": {
"username": "bastiand",
"display_name": "Bastian Doetsch",
"type": "user",
"uuid": "{3742cd48-adad-4205-ab0d-04fc992a1728}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/bastiand"
},
"html": {
"href": "https://bitbucket.org/bastiand/"
},
"avatar": {
"href": "https://bitbucket.org/account/bastiand/avatar/32/"
}
}
},
"updated_on": "2011-09-17T02:36:59.062596+00:00",
"size": 6445145,
"type": "repository",
"slug": "mercurialeclipse",
"is_private": false,
"description": "my own repo for MercurialEclipse."
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "sandboxpublic",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/commits"
},
"clone": [
{
"href": "https://bitbucket.org/aleax/sandboxpublic",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/aleax/sandboxpublic",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic"
},
"html": {
"href": "https://bitbucket.org/aleax/sandboxpublic"
},
"avatar": {
"href": "https://bitbucket.org/aleax/sandboxpublic/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/aleax/sandboxpublic/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{452c716c-a1ce-42bc-a95b-d38da49cbb37}",
"language": "",
"created_on": "2008-07-14T01:59:23.568048+00:00",
"full_name": "aleax/sandboxpublic",
"has_issues": true,
"owner": {
"username": "aleax",
"display_name": "Alex Martelli",
"type": "user",
"uuid": "{1155d94d-fb48-43fe-a431-ec07c900b636}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/aleax"
},
"html": {
"href": "https://bitbucket.org/aleax/"
},
"avatar": {
"href": "https://bitbucket.org/account/aleax/avatar/32/"
}
}
},
"updated_on": "2012-06-22T21:55:28.753727+00:00",
"size": 3120,
"type": "repository",
"slug": "sandboxpublic",
"is_private": false,
"description": "to help debug ACLs for private vs public bitbucket repos"
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "otrsfix-ng",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/commits"
},
"clone": [
{
"href": "https://bitbucket.org/adiakin/otrsfix-ng",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/adiakin/otrsfix-ng",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng"
},
"html": {
"href": "https://bitbucket.org/adiakin/otrsfix-ng"
},
"avatar": {
"href": "https://bitbucket.org/adiakin/otrsfix-ng/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/adiakin/otrsfix-ng/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{05b1b9dc-a7b6-46d6-ae1b-e66a17aa4f55}",
"language": "",
"created_on": "2008-07-15T06:14:39.306314+00:00",
"full_name": "adiakin/otrsfix-ng",
"has_issues": true,
"owner": {
"username": "adiakin",
"display_name": "adiakin",
"type": "user",
"uuid": "{414012b5-1ac9-4096-9f46-8893cfa3cda5}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/adiakin"
},
"html": {
"href": "https://bitbucket.org/adiakin/"
},
"avatar": {
"href": "https://bitbucket.org/account/adiakin/avatar/32/"
}
}
},
"updated_on": "2016-06-02T18:56:34.868302+00:00",
"size": 211631,
"type": "repository",
"slug": "otrsfix-ng",
"is_private": false,
"description": "OTRS greasemonkey extension"
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "pida-pypaned",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/commits"
},
"clone": [
{
"href": "https://bitbucket.org/aafshar/pida-pypaned",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/aafshar/pida-pypaned",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned"
},
"html": {
"href": "https://bitbucket.org/aafshar/pida-pypaned"
},
"avatar": {
"href": "https://bitbucket.org/aafshar/pida-pypaned/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/aafshar/pida-pypaned/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{94cb830a-1784-4e51-9791-8f5cc93990a9}",
"language": "",
"created_on": "2008-07-16T22:47:38.682491+00:00",
"full_name": "aafshar/pida-pypaned",
"has_issues": true,
"owner": {
"username": "aafshar",
"display_name": "Ali Afshar",
"type": "user",
"uuid": "{bcb87110-6a92-41fc-b95c-680feeea5512}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/aafshar"
},
"html": {
"href": "https://bitbucket.org/aafshar/"
},
"avatar": {
"href": "https://bitbucket.org/account/aafshar/avatar/32/"
}
}
},
"updated_on": "2012-06-22T21:55:42.451431+00:00",
"size": 4680652,
"type": "repository",
"slug": "pida-pypaned",
"is_private": false,
"description": ""
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "TLOMM-testing",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/commits"
},
"clone": [
{
"href": "https://bitbucket.org/tgrimley/tlomm-testing",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/tgrimley/tlomm-testing",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing"
},
"html": {
"href": "https://bitbucket.org/tgrimley/tlomm-testing"
},
"avatar": {
"href": "https://bitbucket.org/tgrimley/tlomm-testing/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/tgrimley/tlomm-testing/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{95283ca1-f77e-40d6-b3ed-5bfae6ed2d15}",
"language": "",
"created_on": "2008-07-18T21:05:17.750587+00:00",
"full_name": "tgrimley/tlomm-testing",
"has_issues": true,
"owner": {
"username": "tgrimley",
"display_name": "Thomas Grimley",
"type": "user",
"uuid": "{c958a08f-4669-4c77-81ec-4e2faa8ebf35}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/tgrimley"
},
"html": {
"href": "https://bitbucket.org/tgrimley/"
},
"avatar": {
"href": "https://bitbucket.org/account/tgrimley/avatar/32/"
}
}
},
"updated_on": "2012-06-22T21:55:46.627825+00:00",
"size": 3128,
"type": "repository",
"slug": "tlomm-testing",
"is_private": false,
"description": "File related to testing functionality of TLOMM->TLOTTS transition"
},
{
"scm": "hg",
"website": "",
"has_wiki": true,
"name": "test",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/commits"
},
"clone": [
{
"href": "https://bitbucket.org/tingle/test",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/tingle/test",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test"
},
"html": {
"href": "https://bitbucket.org/tingle/test"
},
"avatar": {
"href": "https://bitbucket.org/tingle/test/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/tingle/test/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{457953ec-fe87-41b9-b659-94756fb40ece}",
"language": "",
"created_on": "2008-07-18T22:24:31.984981+00:00",
"full_name": "tingle/test",
"has_issues": true,
"owner": {
"username": "tingle",
"display_name": "tingle",
"type": "user",
"uuid": "{dddce42b-bd19-417b-90ff-72cdbfb6eb7d}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/tingle"
},
"html": {
"href": "https://bitbucket.org/tingle/"
},
"avatar": {
"href": "https://bitbucket.org/account/tingle/avatar/32/"
}
}
},
"updated_on": "2012-06-22T21:55:49.860564+00:00",
"size": 3090,
"type": "repository",
"slug": "test",
"is_private": false,
"description": ""
},
{
"scm": "hg",
"website": "http://shaze.myopenid.com/",
"has_wiki": true,
"name": "Repository",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/commits"
},
"clone": [
{
"href": "https://bitbucket.org/Shaze/repository",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/Shaze/repository",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository"
},
"html": {
"href": "https://bitbucket.org/Shaze/repository"
},
"avatar": {
"href": "https://bitbucket.org/Shaze/repository/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/Shaze/repository/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{3c0b8076-caef-465a-8d08-a184459f659b}",
"language": "",
"created_on": "2008-07-18T22:39:51.380134+00:00",
"full_name": "Shaze/repository",
"has_issues": true,
"owner": {
"username": "Shaze",
"display_name": "Shaze",
"type": "user",
"uuid": "{f57817e9-bfe4-4c65-84dd-662152430323}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/Shaze"
},
"html": {
"href": "https://bitbucket.org/Shaze/"
},
"avatar": {
"href": "https://bitbucket.org/account/Shaze/avatar/32/"
}
}
},
"updated_on": "2012-06-22T21:55:51.570502+00:00",
"size": 3052,
"type": "repository",
"slug": "repository",
"is_private": false,
"description": "Mine, all mine!"
},
{
"scm": "hg",
"website": "http://bitbucket.org/copiesofcopies/identifox/",
"has_wiki": true,
"name": "identifox",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/commits"
},
"clone": [
{
"href": "https://bitbucket.org/uncryptic/identifox",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/uncryptic/identifox",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox"
},
"html": {
"href": "https://bitbucket.org/uncryptic/identifox"
},
"avatar": {
"href": "https://bitbucket.org/uncryptic/identifox/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/uncryptic/identifox/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{78a1a080-a77e-4d0d-823a-b107484477a8}",
"language": "",
"created_on": "2008-07-19T00:33:14.065946+00:00",
"full_name": "uncryptic/identifox",
"has_issues": true,
"owner": {
"username": "uncryptic",
"display_name": "Uncryptic Communications",
"type": "user",
"uuid": "{db87bb9a-9980-4840-bd4a-61f7748a56b4}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/uncryptic"
},
"html": {
"href": "https://bitbucket.org/uncryptic/"
},
"avatar": {
"href": "https://bitbucket.org/account/uncryptic/avatar/32/"
}
}
},
"updated_on": "2008-07-19T00:33:14+00:00",
"size": 1918,
"type": "repository",
"slug": "identifox",
"is_private": false,
"description": "TwitterFox, modified to work with Identi.ca, including cosmetic and subtle code changes. For the most part, the code is nearly identical to the TwitterFox base: http://www.naan.net/trac/wiki/TwitterFox"
},
{
"scm": "hg",
"website": "http://rforce.rubyforge.org",
"has_wiki": false,
"name": "rforce",
"links": {
"watchers": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/watchers"
},
"branches": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/refs/branches"
},
"tags": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/refs/tags"
},
"commits": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/commits"
},
"clone": [
{
"href": "https://bitbucket.org/undees/rforce",
"name": "https"
},
{
"href": "ssh://hg@bitbucket.org/undees/rforce",
"name": "ssh"
}
],
"self": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce"
},
"html": {
"href": "https://bitbucket.org/undees/rforce"
},
"avatar": {
"href": "https://bitbucket.org/undees/rforce/avatar/32/"
},
"hooks": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/hooks"
},
"forks": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/forks"
},
"downloads": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/downloads"
},
"pullrequests": {
"href": "https://api.bitbucket.org/2.0/repositories/undees/rforce/pullrequests"
}
},
"fork_policy": "allow_forks",
"uuid": "{ec2ffee7-bfcd-4e95-83c8-22ac31e69fa3}",
"language": "",
"created_on": "2008-07-19T06:16:43.044743+00:00",
"full_name": "undees/rforce",
"has_issues": false,
"owner": {
"username": "undees",
"display_name": "Ian Dees",
"type": "user",
"uuid": "{6ff66a34-6412-4f28-bf57-707a2a5c6d7b}",
"links": {
"self": {
"href": "https://api.bitbucket.org/2.0/users/undees"
},
"html": {
"href": "https://bitbucket.org/undees/"
},
"avatar": {
"href": "https://bitbucket.org/account/undees/avatar/32/"
}
}
},
"updated_on": "2015-02-09T00:48:15.408680+00:00",
"size": 338402,
"type": "repository",
"slug": "rforce",
"is_private": false,
"description": "A simple, usable binding to the SalesForce API."
}
],
"next": "https://api.bitbucket.org/2.0/repositories?after=2008-07-19T19%3A53%3A07.031845%2B00%3A00"
}

View file

@ -0,0 +1,20 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import unittest
from swh.lister.bitbucket.lister import BitBucketLister
from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase
class BitBucketListerTester(IndexingHttpListerTesterBase, unittest.TestCase):
Lister = BitBucketLister
test_re = re.compile(r'/repositories\?after=([^?&]+)')
lister_subdir = 'bitbucket'
good_api_response_file = 'api_response.json'
bad_api_response_file = 'api_empty_response.json'
first_index = '2008-07-12T07:44:01.476818+00:00'
last_index = '2008-07-19T06:16:43.044743+00:00'
entries_per_page = 10

View file

@ -0,0 +1,24 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
class AbstractAttribute:
"""AbstractAttributes in a base class must be overridden by the subclass.
It's like the @abc.abstractmethod decorator, but for things that are
explicitly attributes/properties, not methods, without the need for
empty method def boilerplate. Like abc.abstractmethod, the class
containing AbstractAttributes must inherit abc.ABC or use the
abc.ABCMeta metaclass.
Usage Example:
import abc
class ClassContainingAnAbstractAttribute(abc.ABC):
foo = AbstractAttribute('descriptive docstring for foo')
"""
__isabstractmethod__ = True
def __init__(self, docstring=None):
if docstring is not None:
self.__doc__ = 'AbstractAttribute: ' + docstring

View file

@ -0,0 +1,210 @@
# Copyright (C) 2015-2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import logging
from sqlalchemy import func
from .lister_transports import SWHListerHttpTransport
from .lister_base import SWHListerBase
class SWHIndexingLister(SWHListerBase):
"""Lister* intermediate class for any service that follows the pattern:
-- The service must report at least one stable unique identifier,
known herein as the UID value, for every listed repository.
-- If the service splits the list of repositories into sublists,
it must report at least one stable and sorted index identifier
for every listed repository, known herein as the indexable value,
which can be used as part of the service endpoint query to request
a sublist beginning from that index. This might be the UID if the
UID is monotonic.
-- Client sends a request to list repositories starting from a given
index.
-- Client receives structured (json/xml/etc) response with information
about a sequential series of repositories starting from that index
and, if necessary/available, some indication of the URL or index
for fetching the next series of repository data.
* - See swh.lister.core.lister_base.SWHListerBase for more details.
This class cannot be instantiated. To create a new Lister for a source
code listing service that follows the model described above, you must
subclass this class and provide the required overrides in addition to
any unmet implementation/override requirements of this class's base.
(see parent class and member docstrings for details)
Required Overrides:
def get_next_target_from_response
"""
@abc.abstractmethod
def get_next_target_from_response(self, response):
"""Find the next server endpoint identifier given the entire response.
Implementation of this method depends on the server API spec
and the shape of the network response object returned by the
transport_request method.
Args:
response (transport response): response page from the server
Returns:
index of next page, possibly extracted from a next href url
"""
pass
# You probably don't need to override anything below this line.
def filter_before_inject(self, models_list):
"""Overrides SWHListerBase.filter_before_inject
Bounds query results by this Lister's set max_index.
"""
models_list = [
m for m in models_list
if self.is_within_bounds(m['indexable'], None, self.max_index)
]
return models_list
def db_query_range(self, start, end):
"""Look in the db for a range of repositories with indexable
values in the range [start, end]
Args:
start (model indexable type): start of desired indexable range
end (model indexable type): end of desired indexable range
Returns:
a list of sqlalchemy.ext.declarative.declarative_base objects
with indexable values within the given range
"""
retlist = self.db_session.query(self.MODEL)
if start is not None:
retlist.filter(self.MODEL.indexable >= start)
if end is not None:
retlist.filter(self.MODEL.indexable <= end)
return retlist
def db_partition_indices(self, partition_size):
"""Describe an index-space compartmentalization of the db table
in equal sized chunks. This is used to describe min&max bounds for
parallelizing fetch tasks.
Args:
partition_size (int): desired size to make each partition
Returns:
a list of tuples (begin, end) of indexable value that
declare approximately equal-sized ranges of existing
repos
"""
n = self.db_num_entries()
partitions = []
partition_size = min(partition_size, n)
prev_index = None
for i in range(0, n-1, partition_size):
# indexable column from the ith row
index = self.db_session.query(self.MODEL.indexable) \
.order_by(self.MODEL.indexable).offset(i).first()
if index is not None and prev_index is not None:
partitions.append((prev_index, index))
prev_index = index
partitions.append((prev_index, self.db_last_index()))
return partitions
def db_last_index(self):
"""Look in the db for the largest indexable value
Returns:
the largest indexable value of all repos in the db
"""
t = self.db_session.query(func.max(self.MODEL.indexable)).first()
if t:
return t[0]
else:
return None
def disable_deleted_repo_tasks(self, start, end, keep_these):
"""Disable tasks for repos that no longer exist between start and end.
Args:
start: beginning of range to disable
end: end of range to disable
keep_these (uid list): do not disable repos with uids in this list
"""
if end is None:
end = self.db_last_index()
if not self.is_within_bounds(end, None, self.max_index):
end = self.max_index
deleted_repos = self.winnow_models(
self.db_query_range(start, end), self.MODEL.uid, keep_these
)
tasks_to_disable = [repo for repo in deleted_repos
if repo.task_id is not None]
if tasks_to_disable:
self.scheduler.disable_tasks(tasks_to_disable)
for repo in deleted_repos:
repo.task_id = None
def run(self, min_index=None, max_index=None):
"""Main entry function. Sequentially fetches repository data
from the service according to the basic outline in the class
docstring, continually fetching sublists until either there
is no next index reference given or the given next index is greater
than the desired max_index.
Args:
min_index (indexable type): optional index to start from
max_index (indexable type): optional index to stop at
Returns:
nothing
"""
index = min_index or ''
loop_count = 0
self.min_index = min_index
self.max_index = max_index
while self.is_within_bounds(index, self.min_index, self.max_index):
logging.info('listing repos starting at %s' % index)
response, injected_repos = self.ingest_data(index)
next_index = self.get_next_target_from_response(response)
# Determine if any repos were deleted, and disable their tasks.
keep_these = [k for k in injected_repos.keys()]
self.disable_deleted_repo_tasks(index, next_index, keep_these)
# termination condition
if (next_index is None) or (next_index == index):
logging.info('stopping after index %s, no next link found' %
index)
break
else:
index = next_index
loop_count += 1
if loop_count == 20:
logging.info('flushing updates')
loop_count = 0
self.db_session.commit()
self.db_session = self.mk_session()
self.db_session.commit()
self.db_session = self.mk_session()
class SWHIndexingHttpLister(SWHListerHttpTransport, SWHIndexingLister):
"""Convenience class for ensuring right lookup and init order
when combining SWHIndexingLister and SWHListerHttpTransport."""
def __init__(self, lister_name=None, api_baseurl=None,
override_config=None):
SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
SWHIndexingLister.__init__(self, lister_name=lister_name,
override_config=override_config)

View file

@ -0,0 +1,492 @@
# Copyright (C) 2015-2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import gzip
import logging
import os
import re
import time
from datetime import datetime
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
from swh.core import config
from swh.scheduler.backend import SchedulerBackend
from swh.storage import get_storage
from .abstractattribute import AbstractAttribute
class FetchError(RuntimeError):
def __init__(self, response):
self.response = response
def __str__(self):
return repr(self.response)
class SWHListerBase(abc.ABC, config.SWHConfig):
"""Lister core base class.
Generally a source code hosting service provides an API endpoint
for listing the set of stored repositories. A Lister is the discovery
service responsible for finding this list, all at once or sequentially
by parts, and queueing local tasks to fetch and ingest the referenced
repositories.
The core method in this class is ingest_data. Any subclasses should be
calling this method one or more times to fetch and ingest data from API
endpoints. See swh.lister.core.lister_base.SWHIndexingLister for
example usage.
This class cannot be instantiated. Any instantiable Lister descending
from SWHListerBase must provide at least the required overrides.
(see member docstrings for details):
Required Overrides:
MODEL
def transport_request
def transport_response_to_string
def transport_response_simplified
def transport_quota_check
Optional Overrides:
def filter_before_inject
def is_within_bounds
"""
MODEL = AbstractAttribute('Subclass type (not instance)'
' of swh.lister.core.models.ModelBase'
' customized for a specific service.')
@abc.abstractmethod
def transport_request(self, identifier):
"""Given a target endpoint identifier to query, try once to request it.
Implementation of this method determines the network request protocol.
Args:
identifier (string): unique identifier for an endpoint query.
e.g. If the service indexes lists of repositories by date and
time of creation, this might be that as a formatted string. Or
it might be an integer UID. Or it might be nothing.
It depends on what the service needs.
Returns:
the entire request response
Raises:
Will catch internal transport-dependent connection exceptions and
raise swh.lister.core.lister_base.FetchError instead. Other
non-connection exceptions should propogate unchanged.
"""
pass
@abc.abstractmethod
def transport_response_to_string(self, response):
"""Convert the server response into a formatted string for logging.
Implementation of this method depends on the shape of the network
response object returned by the transport_request method.
Args:
response: the server response
Returns:
a pretty string of the response
"""
pass
@abc.abstractmethod
def transport_response_simplified(self, response):
"""Convert the server response into list of a dict for each repo in the
response, mapping columns in the lister's MODEL class to repo data.
Implementation of this method depends on the server API spec and the
shape of the network response object returned by the transport_request
method.
Args:
response: response object from the server.
Returns:
list of repo MODEL dicts
( eg. [{'uid': r['id'], etc.} for r in response.json()] )
"""
pass
@abc.abstractmethod
def transport_quota_check(self, response):
"""Check server response to see if we're hitting request rate limits.
Implementation of this method depends on the server communication
protocol and API spec and the shape of the network response object
returned by the transport_request method.
Args:
response (session response): complete API query response
Returns:
1) must retry request? True/False
2) seconds to delay if True
"""
pass
def filter_before_inject(self, models_list):
"""Function run after transport_response_simplified but before injection
into the local db and creation of workers. Can be used to eliminate
some of the results if necessary.
MAY BE OVERRIDDEN if an intermediate Lister class needs to filter
results before injection without requiring every child class to do so.
Args:
models_list: list of dicts returned by
transport_response_simplified.
Returns:
models_list with entries changed according to custom logic.
"""
pass
def is_within_bounds(self, inner, lower=None, upper=None):
"""See if a sortable value is inside the range [lower,upper].
MAY BE OVERRIDDEN, for example if the server indexable* key is
technically sortable but not automatically so.
* - ( see: swh.lister.core.indexing_lister.SWHIndexingLister )
Args:
inner (sortable type): the value being checked
lower (sortable type): optional lower bound
upper (sortable type): optional upper bound
Returns:
whether inner is confined by the optional lower and upper bounds
"""
try:
if lower is None and upper is None:
return True
elif lower is None:
ret = inner <= upper
elif upper is None:
ret = inner >= lower
else:
ret = lower <= inner <= upper
self.string_pattern_check(inner, lower, upper)
except Exception as e:
logging.error(str(e) + ': %s, %s, %s' %
(('inner=%s%s' % (type(inner), inner)),
('lower=%s%s' % (type(lower), lower)),
('upper=%s%s' % (type(upper), upper)))
)
raise
return ret
# You probably don't need to override anything below this line.
DEFAULT_CONFIG = {
'storage': ('dict', {
'cls': 'remote',
'args': {
'url': 'http://localhost:5002/'
},
}),
'scheduling_db': ('str', 'dbname=softwareheritage-scheduler'),
}
@property
def CONFIG_BASE_FILENAME(self): # noqa: N802
return 'lister-%s' % self.lister_name
@property
def ADDITIONAL_CONFIG(self): # noqa: N802
return {
'lister_db_url':
('str', 'postgresql:///lister-%s' % self.lister_name),
'credentials':
('list[dict]', []),
'cache_responses':
('bool', False),
'cache_dir':
('str', '~/.cache/swh/lister/%s' % self.lister_name),
}
INITIAL_BACKOFF = 10
MAX_RETRIES = 7
CONN_SLEEP = 10
def __init__(self, lister_name=None, override_config=None):
self.backoff = self.INITIAL_BACKOFF
if lister_name is None:
raise NameError("Every lister must be assigned a lister_name.")
self.lister_name = lister_name # 'github?', 'bitbucket?', 'foo.com?'
self.config = self.parse_config_file(
base_filename=self.CONFIG_BASE_FILENAME,
additional_configs=[self.ADDITIONAL_CONFIG]
)
self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
if self.config['cache_responses']:
config.prepare_folders(self.config, ['cache_dir'])
if override_config:
self.config.update(override_config)
self.storage = get_storage(**self.config['storage'])
self.scheduler = SchedulerBackend(
scheduling_db=self.config['scheduling_db'],
)
self.db_engine = create_engine(self.config['lister_db_url'])
self.mk_session = sessionmaker(bind=self.db_engine)
self.db_session = self.mk_session()
def reset_backoff(self):
"""Reset exponential backoff timeout to initial level."""
self.backoff = self.INITIAL_BACKOFF
def back_off(self):
"""Get next exponential backoff timeout."""
ret = self.backoff
self.backoff *= 10
return ret
def safely_issue_request(self, identifier):
"""Make network request with retries, rate quotas, and response logs.
Protocol is handled by the implementation of the transport_request
method.
Args:
identifier: resource identifier
Returns:
server response
"""
retries_left = self.MAX_RETRIES
while retries_left > 0:
try:
r = self.transport_request(identifier)
except FetchError:
# network-level connection error, try again
logging.warn('connection error on %s: sleep for %d seconds' %
(identifier, self.CONN_SLEEP))
time.sleep(self.CONN_SLEEP)
retries_left -= 1
continue
# detect throttling
must_retry, delay = self.transport_quota_check(r)
if must_retry:
logging.warn('rate limited on %s: sleep for %f seconds' %
(identifier, delay))
time.sleep(delay)
else: # request ok
break
retries_left -= 1
if not retries_left:
logging.warn('giving up on %s: max retries exceeded' % identifier)
do_cache = self.config['cache_responses']
if do_cache:
self.save_response(r)
return r
def db_query_equal(self, key, value):
"""Look in the db for a row with key == value
Args:
key: column key to look at
value: value to look for in that column
Returns:
sqlalchemy.ext.declarative.declarative_base object
with the given key == value
"""
if isinstance(key, str):
key = self.MODEL.__dict__[key]
return self.db_session.query(self.MODEL) \
.filter(key == value).first()
def winnow_models(self, mlist, key, to_remove):
"""Given a list of models, remove any with <key> matching
some member of a list of values.
Args:
mlist (list of model rows): the initial list of models
key (column): the column to filter on
to_remove (list): if anything in mlist has column <key> equal to
one of the values in to_remove, it will be removed from the
result
Returns:
A list of model rows starting from mlist minus any matching rows
"""
if isinstance(key, str):
key = self.MODEL.__dict__[key]
if to_remove:
return mlist.filter(~key.in_(to_remove)).all()
else:
return mlist.all()
def db_num_entries(self):
"""Return the known number of entries in the lister db"""
return self.db_session.query(func.count('*')).select_from(self.MODEL) \
.scalar()
def db_inject_repo(self, model_dict):
"""Add/update a new repo to the db and mark it last_seen now.
Args:
model_dict: dictionary mapping model keys to values
Returns:
new or updated sqlalchemy.ext.declarative.declarative_base
object associated with the injection
"""
sql_repo = self.db_query_equal('uid', model_dict['uid'])
if not sql_repo:
sql_repo = self.MODEL(**model_dict)
self.db_session.add(sql_repo)
else:
for k in model_dict:
setattr(sql_repo, k, model_dict[k])
sql_repo.last_seen = datetime.now()
return sql_repo
def origin_dict(self, origin_type, origin_url):
"""Return special dict format for the origins list
Args:
origin_type (string)
origin_url (string)
Returns:
the same information in a different form
"""
return {
'type': origin_type,
'url': origin_url,
}
def task_dict(self, origin_type, origin_url):
"""Return special dict format for the tasks list
Args:
origin_type (string)
origin_url (string)
Returns:
the same information in a different form
"""
return {
'type': 'origin-update-%s' % origin_type,
'arguments': {
'args': [
origin_url,
],
'kwargs': {},
},
'next_run': datetime.now(),
}
def string_pattern_check(self, a, b, c=None):
"""When comparing indexable types in is_within_bounds, complex strings
may not be allowed to differ in basic structure. If they do, it
could be a sign of not understanding the data well. For instance,
an ISO 8601 time string cannot be compared against its urlencoded
equivalent, but this is an easy mistake to accidentally make. This
method acts as a friendly sanity check.
Args:
a (string): inner component of the is_within_bounds method
b (string): lower component of the is_within_bounds method
c (string): upper component of the is_within_bounds method
Returns:
nothing
Raises:
TypeError if strings a, b, and c don't conform to the same basic
pattern.
"""
if isinstance(a, str):
a_pattern = re.sub('[a-zA-Z0-9]',
'[a-zA-Z0-9]',
re.escape(a))
if (isinstance(b, str) and (re.match(a_pattern, b) is None)
or isinstance(c, str) and (re.match(a_pattern, c) is None)):
logging.debug(a_pattern)
raise TypeError('incomparable string patterns detected')
def inject_repo_data_into_db(self, models_list):
"""Inject data into the db.
Args:
models_list: list of dicts mapping keys from the db model
for each repo to be injected
Returns:
dict of uid:sql_repo pairs
"""
injected_repos = {}
for m in models_list:
injected_repos[m['uid']] = self.db_inject_repo(m)
return injected_repos
def create_missing_origins_and_tasks(self, models_list, injected_repos):
"""Find any newly created db entries that don't yet have tasks or
origin objects assigned.
Args:
models_list: a list of dicts mapping keys in the db model for
each repo
injected_repos: dict of uid:sql_repo pairs that have just
been created
Returns:
Nothing. Modifies injected_repos.
"""
for m in models_list:
ir = injected_repos[m['uid']]
if not ir.origin_id:
ir.origin_id = self.storage.origin_add_one(
self.origin_dict(m['origin_type'], m['origin_url'])
)
if not ir.task_id:
ir.task_id = self.scheduler.create_tasks(
[self.task_dict(m['origin_type'], m['origin_url'])]
)['id']
def ingest_data(self, identifier):
"""The core data fetch sequence. Request server endpoint. Simplify and
filter response list of repositories. Inject repo information into
local db. Queue loader tasks for linked repositories.
Args:
identifier: Resource identifier.
"""
# Request (partial?) list of repositories info
response = self.safely_issue_request(identifier)
models_list = self.transport_response_simplified(response)
models_list = self.filter_before_inject(models_list)
# inject into local db
injected = self.inject_repo_data_into_db(models_list)
# queue workers
self.create_missing_origins_and_tasks(models_list, injected)
return response, injected
def save_response(self, response):
"""Log the response from a server request to a cache dir.
Args:
response: full server response
cache_dir: system path for cache dir
Returns:
nothing
"""
def escape_url_path(p):
return p.replace('/', '__')
fname = os.path.join(
self.config['cache_dir'],
escape_url_path(response.request.path_url) + '.gz'
)
with gzip.open(fname, 'w') as f:
f.write(bytes(
self.transport_response_to_string(response),
'UTF-8'
))

View file

@ -0,0 +1,97 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import random
from datetime import datetime
from email.utils import parsedate
from pprint import pformat
import requests
import xmltodict
from .abstractattribute import AbstractAttribute
from .lister_base import FetchError
class SWHListerHttpTransport(abc.ABC):
"""Use the Requests library for making Lister endpoint requests.
To be used in conjunction with SWHListerBase or a subclass of it.
"""
PATH_TEMPLATE = AbstractAttribute('string containing a python string'
' format pattern that produces the API'
' endpoint path for listing stored'
' repositories when given an index.'
' eg. "/repositories?after=%s".'
'To be implemented in the API-specific'
' class inheriting this.')
def request_headers(self):
"""Returns dictionary of any request headers needed by the server.
MAY BE OVERRIDDEN if request headers are needed.
"""
return {}
def transport_quota_check(self, response):
"""Implements SWHListerBase.transport_quota_check with standard 429 code
check for HTTP with Requests library.
MAY BE OVERRIDDEN if the server notifies about rate limits in a
non-standard way that doesn't use HTTP 429 and the Retry-After
response header. ( https://tools.ietf.org/html/rfc6585#section-4 )
"""
if response.status_code == 429: # HTTP too many requests
retry_after = response.headers.get('Retry-After', self.back_off())
try:
# might be seconds
return True, float(retry_after)
except:
# might be http-date
at_date = datetime(*parsedate(retry_after)[:6])
from_now = (at_date - datetime.today()).total_seconds() + 5
return True, max(0, from_now)
else: # response ok
self.reset_backoff()
return False, 0
def __init__(self, api_baseurl=None):
if not api_baseurl:
raise NameError('HTTP Lister Transport requires api_baseurl.')
self.api_baseurl = api_baseurl # eg. 'https://api.github.com'
self.session = requests.Session()
def transport_request(self, identifier):
"""Implements SWHListerBase.transport_request for HTTP using Requests.
"""
path = self.PATH_TEMPLATE % identifier
params = {}
params['headers'] = self.request_headers() or {}
creds = self.config['credentials']
auth = random.choice(creds) if creds else None
if auth:
params['auth'] = auth
try:
return self.session.get(self.api_baseurl + path, **params)
except requests.exceptions.ConnectionError as e:
raise FetchError(e)
def transport_response_to_string(self, response):
"""Implements SWHListerBase.transport_response_to_string for HTTP given
Requests responses.
"""
s = pformat(response.request.path_url)
s += '\n#\n' + pformat(response.status_code)
s += '\n#\n' + pformat(response.headers)
s += '\n#\n'
try: # json?
s += pformat(response.json())
except: # not json
try: # xml?
s += pformat(xmltodict.parse(response.text))
except: # not xml
s += pformat(response.text)
return s

67
swh/lister/core/models.py Normal file
View file

@ -0,0 +1,67 @@
# Copyright (C) 2015-2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
from datetime import datetime
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from .abstractattribute import AbstractAttribute
SQLBase = declarative_base()
class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta):
pass
class ModelBase(SQLBase, metaclass=ABCSQLMeta):
"""a common repository"""
__abstract__ = True
__tablename__ = AbstractAttribute
uid = AbstractAttribute('Column(<uid_type>, primary_key=True)')
# The value used for sorting, segmenting, or api query paging,
# because uids aren't always sequential.
indexable = AbstractAttribute('Column(<indexable_type>, index=True)')
name = Column(String, index=True)
full_name = Column(String, index=True)
html_url = Column(String)
origin_url = Column(String)
origin_type = Column(String)
description = Column(String)
last_seen = Column(DateTime, nullable=False)
task_id = Column(Integer)
origin_id = Column(Integer)
def __init__(self, uid=None, indexable=None, name=None, full_name=None,
html_url=None, origin_url=None, origin_type=None,
description=None, task_id=None, origin_id=None):
self.uid = uid
self.last_seen = datetime.now()
if indexable is not None:
self.indexable = indexable
if name is not None:
self.name = name
if full_name is not None:
self.full_name = full_name
if html_url is not None:
self.html_url = html_url
if origin_url is not None:
self.origin_url = origin_url
if origin_type is not None:
self.origin_type = origin_type
if description is not None:
self.description = description
if task_id is not None:
self.task_id = task_id
if origin_id is not None:
self.origin_id = origin_id

71
swh/lister/core/tasks.py Normal file
View file

@ -0,0 +1,71 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import random
from celery import group
from celery.app.task import TaskType
from swh.scheduler.task import Task
from .abstractattribute import AbstractAttribute
class AbstractTaskMeta(abc.ABCMeta, TaskType):
pass
class ListerTaskBase(Task, metaclass=AbstractTaskMeta):
"""Lister Tasks define the process of periodically requesting batches of
repository information from source code hosting services. They
instantiate Listers to do batches of work at periodic intervals.
There are two main kinds of lister tasks:
1) Discovering new repositories.
2) Refreshing the list of already discovered repositories.
If the hosting service is indexable (according to the requirements of
SWHIndexingLister), then we can optionally partition the set of known
repositories into sub-sets to distribute the work.
This means that there is a third possible Task type for Indexing
Listers:
3) Discover or refresh a specific range of indices.
"""
task_queue = AbstractAttribute('Celery Task queue name')
@abc.abstractmethod
def new_lister(self):
"""Return a new lister of the appropriate type.
"""
pass
@abc.abstractmethod
def run(self):
pass
class IndexingDiscoveryListerTask(ListerTaskBase):
def run(self):
lister = self.new_lister()
lister.run(min_index=lister.db_last_index(), max_index=None)
class IndexingRangeListerTask(ListerTaskBase):
def run(self, start, end):
lister = self.new_lister()
lister.run(min_index=start, max_index=end)
class IndexingRefreshListerTask(ListerTaskBase):
GROUP_SPLIT = 10000
def run(self):
lister = self.new_lister()
ranges = lister.db_partition_indices(self.GROUP_SPLIT)
random.shuffle(ranges)
range_task = IndexingRangeListerTask()
group(range_task.s(minv, maxv) for minv, maxv in ranges)()

View file

@ -0,0 +1,68 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import unittest
from nose.tools import istest
from swh.lister.core.abstractattribute import AbstractAttribute
class BaseClass(abc.ABC):
v1 = AbstractAttribute
v2 = AbstractAttribute()
v3 = AbstractAttribute('changed docstring')
v4 = 'qux'
class BadSubclass1(BaseClass):
pass
class BadSubclass2(BaseClass):
v1 = 'foo'
v2 = 'bar'
class BadSubclass3(BaseClass):
v2 = 'bar'
v3 = 'baz'
class GoodSubclass(BaseClass):
v1 = 'foo'
v2 = 'bar'
v3 = 'baz'
class TestAbstractAttributes(unittest.TestCase):
@istest
def test_aa(self):
with self.assertRaises(TypeError):
BaseClass()
with self.assertRaises(TypeError):
BadSubclass1()
with self.assertRaises(TypeError):
BadSubclass2()
with self.assertRaises(TypeError):
BadSubclass3()
self.assertIsInstance(GoodSubclass(), GoodSubclass)
gsc = GoodSubclass()
self.assertEqual(gsc.v1, 'foo')
self.assertEqual(gsc.v2, 'bar')
self.assertEqual(gsc.v3, 'baz')
self.assertEqual(gsc.v4, 'qux')
@istest
def test_aa_docstrings(self):
self.assertEqual(BaseClass.v1.__doc__, AbstractAttribute.__doc__)
self.assertEqual(BaseClass.v2.__doc__, AbstractAttribute.__doc__)
self.assertEqual(BaseClass.v3.__doc__,
'AbstractAttribute: changed docstring')

View file

@ -0,0 +1,229 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import time
from unittest import TestCase
from unittest.mock import Mock, patch
import requests_mock
import testing.postgresql
from nose.tools import istest
from sqlalchemy import create_engine
from swh.lister.core.abstractattribute import AbstractAttribute
def noop(*args, **kwargs):
pass
@requests_mock.Mocker()
class IndexingHttpListerTesterBase(abc.ABC):
"""Base testing class for subclasses of
swh.lister.core.indexing_lister.SWHIndexingHttpLister.
See swh.lister.github.tests.test_gh_lister for an example of how to
customize for a specific listing service.
"""
Lister = AbstractAttribute('The lister class to test')
test_re = AbstractAttribute('Compiled regex matching the server url. Must'
' capture the index value.')
lister_subdir = AbstractAttribute('bitbucket, github, etc.')
good_api_response_file = AbstractAttribute('Example good response body')
bad_api_response_file = AbstractAttribute('Example bad response body')
first_index = AbstractAttribute('First index in good_api_response')
last_index = AbstractAttribute('Last index in good_api_response')
entries_per_page = AbstractAttribute('Number of results in good response')
# May need to override this if the headers are used for something
def response_headers(self, request):
return {}
# May need to override this if the server uses non-standard rate limiting
# method.
# Please keep the requested retry delay reasonably low.
def mock_rate_quota(self, n, request, context):
self.rate_limit += 1
context.status_code = 429
context.headers['Retry-After'] = '1'
return '{"error":"dummy"}'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.rate_limit = 1
self.response = None
self.fl = None
self.helper = None
if self.__class__ != IndexingHttpListerTesterBase:
self.run = TestCase.run.__get__(self, self.__class__)
else:
self.run = noop
def request_index(self, request):
m = self.test_re.search(request.path_url)
if m and (len(m.groups()) > 0):
return m.group(1)
else:
return None
def mock_response(self, request, context):
self.fl.reset_backoff()
self.rate_limit = 1
context.status_code = 200
custom_headers = self.response_headers(request)
context.headers.update(custom_headers)
if self.request_index(request) == str(self.first_index):
with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
self.good_api_response_file),
'r') as r:
return r.read()
else:
with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
self.bad_api_response_file),
'r') as r:
return r.read()
def mock_limit_n_response(self, n, request, context):
self.fl.reset_backoff()
if self.rate_limit <= n:
return self.mock_rate_quota(n, request, context)
else:
return self.mock_response(request, context)
def mock_limit_once_response(self, request, context):
return self.mock_limit_n_response(1, request, context)
def mock_limit_twice_response(self, request, context):
return self.mock_limit_n_response(2, request, context)
def get_fl(self, override_config=None):
if override_config or self.fl is None:
with patch(
'swh.scheduler.backend.SchedulerBackend.reconnect', noop
):
self.fl = self.Lister(lister_name='fakelister',
api_baseurl='https://fakeurl',
override_config=override_config)
self.fl.INITIAL_BACKOFF = 1
self.fl.reset_backoff()
return self.fl
def get_api_response(self):
fl = self.get_fl()
if self.response is None:
self.response = fl.safely_issue_request(self.first_index)
return self.response
@istest
def test_is_within_bounds(self, http_mocker):
fl = self.get_fl()
self.assertFalse(fl.is_within_bounds(1, 2, 3))
self.assertTrue(fl.is_within_bounds(2, 1, 3))
self.assertTrue(fl.is_within_bounds(1, 1, 1))
self.assertTrue(fl.is_within_bounds(1, None, None))
self.assertTrue(fl.is_within_bounds(1, None, 2))
self.assertTrue(fl.is_within_bounds(1, 0, None))
self.assertTrue(fl.is_within_bounds("b", "a", "c"))
self.assertFalse(fl.is_within_bounds("a", "b", "c"))
self.assertTrue(fl.is_within_bounds("a", None, "c"))
self.assertTrue(fl.is_within_bounds("a", None, None))
self.assertTrue(fl.is_within_bounds("b", "a", None))
self.assertFalse(fl.is_within_bounds("a", "b", None))
self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03"))
self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03"))
with self.assertRaises(TypeError):
fl.is_within_bounds(1.0, "b", None)
with self.assertRaises(TypeError):
fl.is_within_bounds("A:B", "A::B", None)
@istest
def test_api_request(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_limit_twice_response)
with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock:
self.get_api_response()
self.assertEqual(sleepmock.call_count, 2)
@istest
def test_repos_list(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
li = self.get_fl().transport_response_simplified(
self.get_api_response()
)
self.assertIsInstance(li, list)
self.assertEqual(len(li), self.entries_per_page)
@istest
def test_model_map(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
li = fl.transport_response_simplified(self.get_api_response())
di = li[0]
self.assertIsInstance(di, dict)
pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')]
for k in pubs:
if k not in ['last_seen', 'task_id', 'origin_id']:
self.assertIn(k, di)
def disable_storage_and_scheduler(self, fl):
fl.create_missing_origins_and_tasks = Mock(return_value=None)
def disable_db(self, fl):
fl.winnow_models = Mock(return_value=[])
fl.db_inject_repo = Mock(return_value=fl.MODEL())
fl.disable_deleted_repo_tasks = Mock(return_value=None)
@istest
def test_fetch_none_nodb(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
self.disable_storage_and_scheduler(fl)
self.disable_db(fl)
fl.run(min_index=1, max_index=1) # stores no results
@istest
def test_fetch_one_nodb(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
self.disable_storage_and_scheduler(fl)
self.disable_db(fl)
fl.run(min_index=self.first_index, max_index=self.first_index)
@istest
def test_fetch_multiple_pages_nodb(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
fl = self.get_fl()
self.disable_storage_and_scheduler(fl)
self.disable_db(fl)
fl.run(min_index=self.first_index)
def init_db(self, db, model):
engine = create_engine(db.url())
model.metadata.create_all(engine)
@istest
def test_fetch_multiple_pages_yesdb(self, http_mocker):
http_mocker.get(self.test_re, text=self.mock_response)
db = testing.postgresql.Postgresql()
fl = self.get_fl(override_config={'lister_db_url': db.url()})
self.init_db(db, fl.MODEL)
self.disable_storage_and_scheduler(fl)
fl.run(min_index=self.first_index)
self.assertEqual(fl.db_last_index(), self.last_index)
partitions = fl.db_partition_indices(5)
self.assertGreater(len(partitions), 0)
for k in partitions:
self.assertLessEqual(len(k), 5)
self.assertGreater(len(k), 0)

View file

@ -0,0 +1,53 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from nose.tools import istest
from sqlalchemy import Column, Integer
from swh.lister.core.models import ModelBase
class BadSubclass1(ModelBase):
__abstract__ = True
pass
class BadSubclass2(ModelBase):
__abstract__ = True
__tablename__ = 'foo'
class BadSubclass3(BadSubclass2):
__abstract__ = True
pass
class GoodSubclass(BadSubclass2):
uid = Column(Integer, primary_key=True)
indexable = Column(Integer, index=True)
class TestModel(unittest.TestCase):
@istest
def test_model_instancing(self):
with self.assertRaises(TypeError):
ModelBase()
with self.assertRaises(TypeError):
BadSubclass1()
with self.assertRaises(TypeError):
BadSubclass2()
with self.assertRaises(TypeError):
BadSubclass3()
self.assertIsInstance(GoodSubclass(), GoodSubclass)
gsc = GoodSubclass(uid='uid', indexable='indexable')
self.assertEqual(gsc.__tablename__, 'foo')
self.assertEqual(gsc.uid, 'uid')
self.assertEqual(gsc.indexable, 'indexable')

View file

@ -1,65 +0,0 @@
# Copyright (C) 2016 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.core import config
from swh.storage import get_storage
from swh.scheduler.backend import SchedulerBackend
# TODO: split this into a lister-agnostic module
class SWHLister(config.SWHConfig):
CONFIG_BASE_FILENAME = None
DEFAULT_CONFIG = {
'storage': ('dict', {
'cls': 'remote',
'args': {
'url': 'http://localhost:5002/'
},
}),
'scheduling_db': ('str', 'dbname=softwareheritage-scheduler'),
}
ADDITIONAL_CONFIG = {}
def __init__(self):
self.config = self.parse_config_file(
additional_configs=[self.ADDITIONAL_CONFIG])
self.storage = get_storage(**self.config['storage'])
self.scheduler = SchedulerBackend(
scheduling_db=self.config['scheduling_db'],
)
def create_origins(self, origins):
"""Create the origins listed, and return their ids.
Args:
origins: a list of origins
Returns:
a list of origin ids
"""
return self.storage.origin_add(origins)
def create_tasks(self, tasks):
"""Create the tasks specified, and return their ids.
Args:
tasks (list of dict): a list of task specifications:
type (str): the task type
arguments (dict): the arguments for the task runner
args (list of str): arguments
kwargs (dict str -> str): keyword arguments
next_run (datetime.datetime): when to schedule the next run
Returns:
a list of task ids
"""
returned_tasks = self.scheduler.create_tasks(tasks)
return [returned_task['id'] for returned_task in returned_tasks]
def disable_tasks(self, task_ids):
"""Disable the tasks identified by the given ids"""
self.scheduler.disable_tasks(task_ids)

View file

@ -1,348 +1,50 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# Copyright (C) 2016 The Software Heritage developers
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# see https://developer.github.com/v3/ for GitHub API documentation
import datetime
import gzip
import logging
import os
import random
import re
import requests
import time
from pprint import pformat
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
from swh.core import config
from swh.lister.github.base import SWHLister
from swh.lister.github.db_utils import session_scope
from swh.lister.github.models import Repository
from swh.lister.core.indexing_lister import SWHIndexingHttpLister
from swh.lister.github.models import GitHubModel
GH_API_URL = 'https://api.github.com'
GH_REPO_URL_TEMPLATE = 'https://github.com/%s'
MAX_RETRIES = 7
MAX_SLEEP = 3600 # 1 hour
CONN_SLEEP = 10
class GitHubLister(SWHIndexingHttpLister):
PATH_TEMPLATE = '/repositories?since=%d'
MODEL = GitHubModel
API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')
REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
def get_model_from_repo(self, repo):
return {
'uid': repo['id'],
'indexable': repo['id'],
'name': repo['name'],
'full_name': repo['full_name'],
'html_url': repo['html_url'],
'origin_url': repo['html_url'],
'origin_type': 'git',
'description': repo['description']
}
class FetchError(RuntimeError):
def __init__(self, response):
self.response = response
def __str__(self):
return repr(self.response)
def save_http_response(r, cache_dir):
def escape_url_path(p):
return p.replace('/', '__')
fname = os.path.join(cache_dir,
escape_url_path(r.request.path_url) + '.gz')
with gzip.open(fname, 'w') as f:
def emit(s):
f.write(bytes(s, 'UTF-8'))
emit(pformat(r.request.path_url))
emit('\n#\n')
emit(pformat(r.status_code))
emit('\n#\n')
emit(pformat(r.headers))
emit('\n#\n')
emit(pformat(r.json()))
def gh_api_request(path, username=None, password=None, session=None,
headers=None):
params = {}
if headers is None:
headers = {}
if 'Accept' not in headers: # request version 3 of the API
headers['Accept'] = 'application/vnd.github.v3+json'
params['headers'] = headers
if username is not None and password is not None:
params['auth'] = (username, password)
if session is None:
session = requests.Session()
retries_left = MAX_RETRIES
while retries_left > 0:
logging.debug('sending API request: %s' % path)
try:
r = session.get(GH_API_URL + path, **params)
except requests.exceptions.ConnectionError:
# network-level connection error, try again
logging.warn('connection error upon %s: sleep for %d seconds' %
(path, CONN_SLEEP))
time.sleep(CONN_SLEEP)
retries_left -= 1
continue
if r.ok: # all went well, do not retry
break
# detect throttling
if r.status_code == 403 and \
int(r.headers['X-RateLimit-Remaining']) == 0:
delay = int(r.headers['X-RateLimit-Reset']) - time.time()
delay = min(delay, MAX_SLEEP)
logging.warn('rate limited upon %s: sleep for %d seconds' %
(path, int(delay)))
time.sleep(delay)
else: # unexpected error, abort
break
retries_left -= 1
if not retries_left:
logging.warn('giving up on %s: max retries exceed' % path)
return r
class GitHubLister(SWHLister):
CONFIG_BASE_FILENAME = 'lister-github'
ADDITIONAL_CONFIG = {
'lister_db_url': ('str', 'postgresql:///lister-github'),
'credentials': ('list[dict]', []),
'cache_json': ('bool', False),
'cache_dir': ('str', '~/.cache/swh/lister/github'),
}
def __init__(self, override_config=None):
super().__init__()
if override_config:
self.config.update(override_config)
self.config['cache_dir'] = os.path.expanduser(self.config['cache_dir'])
if self.config['cache_json']:
config.prepare_folders(self.config, ['cache_dir'])
if not self.config['credentials']:
raise ValueError('The GitHub lister needs credentials for API')
self.db_engine = create_engine(self.config['lister_db_url'])
self.mk_session = sessionmaker(bind=self.db_engine)
def lookup_repo(self, repo_id, db_session=None):
if not db_session:
with session_scope(self.mk_session) as db_session:
return self.lookup_repo(repo_id, db_session=db_session)
return db_session.query(Repository) \
.filter(Repository.id == repo_id) \
.first()
def query_range(self, start, end, db_session=None):
if not db_session:
with session_scope(self.mk_session) as db_session:
return self.query_range(start, end, db_session=db_session)
return db_session.query(Repository) \
.filter(Repository.id >= start) \
.filter(Repository.id <= end)
def lookup_full_names(self, full_names, db_session=None):
if not db_session:
with session_scope(self.mk_session) as db_session:
return self.lookup_full_names(full_names,
db_session=db_session)
return db_session.query(Repository) \
.filter(Repository.full_name.in_(full_names)) \
.all()
def last_repo_id(self, db_session=None):
if not db_session:
with session_scope(self.mk_session) as db_session:
return self.last_repo_id(db_session=db_session)
t = db_session.query(func.max(Repository.id)).first()
if t is not None:
return t[0]
INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description',
'fork']
def inject_repo(self, repo, db_session=None):
if not db_session:
with session_scope(self.mk_session) as db_session:
return self.inject_repo(repo, db_session=db_session)
logging.debug('injecting repo %d' % repo['id'])
sql_repo = self.lookup_repo(repo['id'], db_session)
if not sql_repo:
kwargs = {k: repo[k] for k in self.INJECT_KEYS if k in repo}
sql_repo = Repository(**kwargs)
db_session.add(sql_repo)
def transport_quota_check(self, response):
reqs_remaining = int(response.headers['X-RateLimit-Remaining'])
if response.status_code == 403 and reqs_remaining == 0:
reset_at = int(response.headers['X-RateLimit-Reset'])
delay = min(reset_at - time.time(), 3600)
return True, delay
else:
for k in self.INJECT_KEYS:
if k in repo:
setattr(sql_repo, k, repo[k])
sql_repo.last_seen = datetime.datetime.now()
return False, 0
return sql_repo
def get_next_target_from_response(self, response):
if 'next' in response.links:
next_url = response.links['next']['url']
return int(self.API_URL_INDEX_RE.match(next_url).group(1))
else:
return None
@staticmethod
def repo_to_origin(full_name):
return {
'type': 'git',
'url': GH_REPO_URL_TEMPLATE % full_name,
}
def transport_response_simplified(self, response):
repos = response.json()
return [self.get_model_from_repo(repo) for repo in repos]
@staticmethod
def repo_to_task(full_name):
return {
'type': 'origin-update-git',
'arguments': {
'args': [
GH_REPO_URL_TEMPLATE % full_name,
],
'kwargs': {},
},
'next_run': datetime.datetime.now(),
}
def fetch(self, min_id=None, max_id=None):
if min_id is None:
min_id = 1
if max_id is None:
max_id = float('inf')
next_id = min_id
do_cache = self.config['cache_json']
cache_dir = self.config['cache_dir']
session = requests.Session()
db_session = self.mk_session()
loop_count = 0
while min_id <= next_id <= max_id:
logging.info('listing repos starting at %d' % next_id)
# github API ?since=... is '>' strict, not '>='
since = next_id - 1
cred = random.choice(self.config['credentials'])
repos_res = gh_api_request('/repositories?since=%d' % since,
session=session, **cred)
if do_cache:
save_http_response(repos_res, cache_dir)
if not repos_res.ok:
raise FetchError(repos_res)
next_next_id = None
if 'next' in repos_res.links:
next_url = repos_res.links['next']['url']
m = REPO_API_URL_RE.match(next_url) # parse next_id
next_next_id = int(m.group(1)) + 1
repos = repos_res.json()
mapped_repos = {}
tasks = {}
origins = {}
repo_ids = set()
for repo in repos:
if repo['id'] > max_id: # do not overstep max_id
break
repo_ids.add(repo['id'])
full_name = repo['full_name']
mapped_repos[full_name] = self.inject_repo(repo, db_session)
# Retrieve and reset task and origin ids from existing repos
old_repos = self.lookup_full_names(list(mapped_repos.keys()),
db_session=db_session)
for old_repo in old_repos:
full_name = old_repo.full_name
if old_repo.task_id:
tasks[full_name] = old_repo.task_id
old_repo.task_id = None
if old_repo.origin_id:
origins[full_name] = old_repo.origin_id
old_repo.origin_id = None
# Create missing origins
missing_origins = [
full_name for full_name in sorted(mapped_repos)
if full_name not in origins
]
if missing_origins:
new_origins = [
self.repo_to_origin(full_name)
for full_name in missing_origins
]
new_origin_ids = self.create_origins(new_origins)
origins.update(zip(missing_origins, new_origin_ids))
for full_name, origin_id in origins.items():
mapped_repos[full_name].origin_id = origin_id
# Create missing tasks
missing_tasks = [
full_name for full_name in sorted(mapped_repos)
if full_name not in tasks
]
if missing_tasks:
new_tasks = [
self.repo_to_task(full_name)
for full_name in missing_tasks
]
new_task_ids = self.create_tasks(new_tasks)
tasks.update(zip(missing_tasks, new_task_ids))
for full_name, task_id in tasks.items():
mapped_repos[full_name].task_id = task_id
# Disable tasks for deleted repos
if next_next_id is not None:
start, end = next_id, min(max_id, next_next_id)
else:
start, end = next_id, max_id
deleted_repos = self.query_range(start, end,
db_session=db_session) \
.filter(~Repository.id.in_(repo_ids)) \
.all()
if deleted_repos:
tasks_to_disable = []
for repo in deleted_repos:
if repo.task_id is not None:
tasks_to_disable.append(repo.task_id)
repo.task_id = None
if tasks_to_disable:
self.disable_tasks(tasks_to_disable)
if next_next_id is None:
logging.info('stopping after id %d, no next link found' %
next_id)
break
else:
next_id = next_next_id
loop_count += 1
if loop_count == 20:
logging.info('flushing updates')
loop_count = 0
db_session.commit()
db_session = self.mk_session()
db_session.commit()
def request_headers(self):
return {'Accept': 'application/vnd.github.v3+json'}

View file

@ -1,51 +1,18 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
from sqlalchemy import Column, Integer
from sqlalchemy import Column
from sqlalchemy import Boolean, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from swh.lister.core.models import ModelBase
SQLBase = declarative_base()
class Repository(SQLBase):
class GitHubModel(ModelBase):
"""a GitHub repository"""
__tablename__ = 'github_repos'
__tablename__ = 'repos'
uid = Column(Integer, primary_key=True)
indexable = Column(Integer, index=True)
id = Column(Integer, primary_key=True)
name = Column(String, index=True)
full_name = Column(String, index=True)
html_url = Column(String)
description = Column(String)
fork = Column(Boolean, index=True)
last_seen = Column(DateTime, nullable=False)
task_id = Column(Integer)
origin_id = Column(Integer)
def __init__(self, id, name=None, full_name=None, html_url=None,
description=None, fork=None, task_id=None, origin_id=None):
self.id = id
self.last_seen = datetime.now()
if name is not None:
self.name = name
if full_name is not None:
self.full_name = full_name
if html_url is not None:
self.html_url = html_url
if description is not None:
self.description = description
if fork is not None:
self.fork = fork
if task_id is not None:
self.task_id = task_id
if origin_id is not None:
self.origin_id = origin_id
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

View file

@ -1,47 +1,27 @@
# Copyright (C) 2016 the Software Heritage developers
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from celery import group
from swh.scheduler.task import Task
from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
IndexingRangeListerTask,
IndexingRefreshListerTask, ListerTaskBase)
from .lister import GitHubLister
GROUP_SPLIT = 10000
class GitHubListerTask(ListerTaskBase):
def new_lister(self):
return GitHubLister(lister_name='github.com',
api_baseurl='https://github.com')
class IncrementalGitHubLister(Task):
task_queue = 'swh_lister_github_incremental'
def run(self):
lister = GitHubLister()
last_id = lister.last_repo_id()
lister.fetch(min_id=last_id + 1, max_id=None)
class IncrementalGitHubLister(GitHubListerTask, IndexingDiscoveryListerTask):
task_queue = 'swh_lister_github_discover'
class RangeGitHubLister(Task):
task_queue = 'swh_lister_github_full'
def run(self, start, end):
lister = GitHubLister()
lister.fetch(min_id=start, max_id=end)
class RangeGitHubLister(GitHubListerTask, IndexingRangeListerTask):
task_queue = 'swh_lister_github_refresh'
class FullGitHubLister(Task):
task_queue = 'swh_lister_github_full'
def run(self):
lister = GitHubLister()
last_id = lister.last_repo_id()
ranges = [
(i, min(last_id, i + GROUP_SPLIT - 1))
for i in range(1, last_id, GROUP_SPLIT)
]
random.shuffle(ranges)
range_task = RangeGitHubLister()
group(range_task.s(min, max) for min, max in ranges)()
class FullGitHubRelister(GitHubListerTask, IndexingRefreshListerTask):
task_queue = 'swh_lister_github_refresh'

View file

@ -0,0 +1 @@
[]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import unittest
from datetime import datetime, timedelta
from swh.lister.core.tests.test_lister import IndexingHttpListerTesterBase
from swh.lister.github.lister import GitHubLister
class GitHubListerTester(IndexingHttpListerTesterBase, unittest.TestCase):
Lister = GitHubLister
test_re = re.compile(r'/repositories\?since=([^?&]+)')
lister_subdir = 'github'
good_api_response_file = 'api_response.json'
bad_api_response_file = 'api_empty_response.json'
first_index = 26
last_index = 368
entries_per_page = 100
def response_headers(self, request):
headers = {'X-RateLimit-Remaining': '1'}
if self.request_index(request) == str(self.first_index):
headers.update({
'Link': '<https://api.github.com/repositories?since=367>;'
' rel="next",'
'<https://api.github.com/repositories{?since}>;'
' rel="first"'
})
else:
headers.update({
'Link': '<https://api.github.com/repositories{?since}>;'
' rel="first"'
})
return headers
def mock_rate_quota(self, n, request, context):
self.rate_limit += 1
context.status_code = 403
context.headers['X-RateLimit-Remaining'] = '0'
one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp())
context.headers['X-RateLimit-Reset'] = str(one_second)
return '{"error":"dummy"}'