swh.lister.cgit
Implemented a lister to list the repos for a given CGit instance. Closes T1659
This commit is contained in:
parent
d85bcdac5b
commit
b972a2a88d
13 changed files with 360 additions and 1 deletions
13
README.md
13
README.md
|
@ -203,6 +203,19 @@ logging.basicConfig(level=logging.DEBUG)
|
|||
cran_lister()
|
||||
```
|
||||
|
||||
## lister-cgit
|
||||
|
||||
Once configured, you can execute a cgit lister using the following instructions
|
||||
in a `python3` script:
|
||||
|
||||
```lang=python
|
||||
import logging
|
||||
from swh.lister.cgit.tasks import cgit_lister
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
cgit_lister(base_url='http://git.savannah.gnu.org/cgit/')
|
||||
```
|
||||
|
||||
Licensing
|
||||
---------
|
||||
|
||||
|
|
|
@ -5,3 +5,4 @@ requests
|
|||
setuptools
|
||||
xmltodict
|
||||
iso8601
|
||||
beautifulsoup4
|
||||
|
|
0
swh/lister/cgit/__init__.py
Normal file
0
swh/lister/cgit/__init__.py
Normal file
180
swh/lister/cgit/lister.py
Normal file
180
swh/lister/cgit/lister.py
Normal file
|
@ -0,0 +1,180 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import random
|
||||
from bs4 import BeautifulSoup
|
||||
from collections import defaultdict
|
||||
import requests
|
||||
import urllib.parse
|
||||
|
||||
from .models import CGitModel
|
||||
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.lister.core.lister_transports import ListerOnePageApiTransport
|
||||
|
||||
|
||||
class CGitLister(ListerOnePageApiTransport, SimpleLister):
    """Lister for the git repositories exposed by one cgit instance.

    The url of the instance index page is kept in ``PAGE``. Every repository
    row found on that page is resolved to a clonable origin url by fetching
    the repository's own page (see ``find_origin_url``).

    Args:
        base_url (str): url of the cgit index page, for example
            ``https://git.savannah.gnu.org/cgit/``. A trailing slash is
            appended when it is missing.
        instance (str): name of the instance. Defaults to the hostname of
            ``base_url``.
        override_config: forwarded unchanged to ``SimpleLister.__init__``.

    """
    MODEL = CGitModel
    LISTER_NAME = 'cgit'
    PAGE = ''

    def __init__(self, base_url, instance=None, override_config=None):
        if not base_url.endswith('/'):
            base_url = base_url + '/'
        self.PAGE = base_url

        # Scheme and host of the instance without any path suffix, e.g.
        # https://git.kernel.org/pub/scm/ becomes https://git.kernel.org
        parsed_url = urllib.parse.urlparse(base_url)
        self.next_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)

        if not instance:
            instance = parsed_url.hostname
        self.instance = instance
        ListerOnePageApiTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def list_packages(self, response):
        """List the actual cgit instance origins from the response.

        Args:
            response: response holding the html of the index page in its
                ``text`` attribute.

        Returns:
            list: one dict per repository with the keys ``name``, ``time``
            and ``origin_url``, in random order. Repositories for which no
            origin url could be found are left out.

        """
        repos_details = []
        soup = BeautifulSoup(response.text, features="html.parser") \
            .find('div', {"class": "content"})
        repos = soup.find_all("tr", {"class": ""})
        for repo in repos:
            if repo.a is None:
                # Row without any link: it cannot describe a repository.
                continue

            repo_name = repo.a.text
            # One extra HTTP request per repository happens here.
            origin_url = find_origin_url(self.get_url(repo))

            try:
                last_update = repo.span['title']
            except (TypeError, KeyError):
                # No <span> in the row (TypeError) or no title on it
                # (KeyError): the last update time is unknown.
                last_update = None

            if origin_url is not None:
                repos_details.append({
                    'name': repo_name,
                    'time': last_update,
                    'origin_url': origin_url,
                })

        random.shuffle(repos_details)
        return repos_details

    def get_url(self, repo):
        """Finds url of a repo page.

        Finds the url of a repo page by parsing over the html of the row of
        that repo present in the base url.

        Args:
            repo: a beautifulsoup object of the html code of the repo row
                present in base url.

        Returns:
            string: The url of a repo.

        """
        # The href is resolved against the index page url, so absolute
        # paths ('/cgit/foo.git/'), relative paths and full urls all give a
        # valid url. Plain concatenation duplicated the path prefix.
        return urllib.parse.urljoin(self.PAGE, repo.a['href'])

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model.

        Args:
            repo (dict): an entry of the list built by ``list_packages``.

        Returns:
            dict: the values used to fill the model. ``uid`` is the index
            page url followed by the repository name.

        """
        return {
            'uid': self.PAGE + repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['origin_url'],
            'origin_url': repo['origin_url'],
            'origin_type': 'git',
            'time_updated': repo['time'],
        }

    def transport_response_simplified(self, response):
        """Transform response to list for model manipulation.

        Args:
            response: iterable of repository dicts.

        Returns:
            list: the result of ``get_model_from_repo`` for each repository.

        """
        return [self.get_model_from_repo(repo) for repo in response]
|
||||
|
||||
|
||||
def find_origin_url(repo_url, timeout=60):
    """Finds origin url for a repo.

    Finds the origin url for a particular repo by parsing over the page of
    that repo.

    Args:
        repo_url: URL of the repo.
        timeout: number of seconds to wait for the server before giving up.
            Without a timeout a stalled server would block the lister
            forever.

    Returns:
        string: Origin url for the repo, or None when the page lists no
        clone url.

    Raises:
        requests.exceptions.RequestException: when the page cannot be
            fetched, including when the timeout expires.

    Examples:

        >>> find_origin_url(  # doctest: +SKIP
        ...     'http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
        'https://git.savannah.gnu.org/git/fbvbconv-py.git'

    """
    response = requests.get(repo_url, timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")

    origin_urls = find_all_origin_url(soup)
    return priority_origin_url(origin_urls)
|
||||
|
||||
|
||||
def find_all_origin_url(soup):
    """Finds all the origin urls of a repo.

    Finds all the origin url for a particular repo by parsing over the html of
    repo page. Only the table rows located after the row whose text is
    'Clone' are considered.

    Args:
        soup: a beautifulsoup object of the html code of the repo.

    Returns:
        dictionary: All possible origin urls with their protocol as key.
        Empty when the page has no 'Clone' section.

    Examples:
        If soup is beautifulsoup object of the html code at
        http://git.savannah.gnu.org/cgit/fbvbconv-py.git/ the result is::

            {'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
             'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git',
             'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    """
    origin_urls = {}
    found_clone_word = False

    for row in soup.find_all('tr'):
        text = row.text.strip()
        if found_clone_word:
            protocol, separator, _ = text.partition(':')
            # A row without ':' holds no url. Slicing with the -1 returned
            # by str.find used to register such rows under a bogus key.
            if separator:
                origin_urls[protocol] = text
        elif text == 'Clone':
            found_clone_word = True

    return origin_urls
|
||||
|
||||
|
||||
def priority_origin_url(origin_url):
    """Finds the highest priority link for a particular repo.

    Priority order is https>http>git>ssh.

    Args:
        origin_url: A dictionary of origin links with their protocol as key.

    Returns:
        string: URL with the highest priority, or None when the dictionary
        holds none of the four known protocols.

    """
    for protocol in ('https', 'http', 'git', 'ssh'):
        if protocol in origin_url:
            return origin_url[protocol]
    return None
|
17
swh/lister/cgit/models.py
Normal file
17
swh/lister/cgit/models.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String
|
||||
|
||||
from ..core.models import ModelBase
|
||||
|
||||
|
||||
class CGitModel(ModelBase):
    """a CGit repository representation

    Only the two columns below are declared here; any other field is
    presumably inherited from ModelBase (not visible here, to be confirmed).

    """
    __tablename__ = 'cgit_repo'

    # Primary key of the row.
    uid = Column(String, primary_key=True)
    # Kept as a plain string column.
    time_updated = Column(String)
|
23
swh/lister/cgit/tasks.py
Normal file
23
swh/lister/cgit/tasks.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.scheduler.celery_backend.config import app
|
||||
|
||||
from .lister import CGitLister
|
||||
|
||||
|
||||
def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
               instance='savannah-gnu', **kw):
    """Build a CGitLister.

    Args:
        base_url: url of the cgit index page to list.
        instance: name given to the listed instance.
        **kw: any other keyword argument, handed over to CGitLister.

    Returns:
        CGitLister: the configured lister.

    """
    # NOTE(review): when only base_url is overridden, instance still
    # defaults to 'savannah-gnu'. Confirm that callers always pass both.
    lister = CGitLister(base_url=base_url, instance=instance, **kw)
    return lister
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.CGitListerTask')
def cgit_lister(**lister_args):
    """Task building a lister out of the keyword arguments, then running it.

    Args:
        **lister_args: keyword arguments handed over to ``new_lister``.

    """
    new_lister(**lister_args).run()
|
||||
|
||||
|
||||
@app.task(name=__name__ + '.ping')
def ping():
    """Task that takes no argument and returns the constant string 'OK'."""
    return 'OK'
|
0
swh/lister/cgit/tests/__init__.py
Normal file
0
swh/lister/cgit/tests/__init__.py
Normal file
47
swh/lister/cgit/tests/api_response.html
Normal file
47
swh/lister/cgit/tests/api_response.html
Normal file
|
@ -0,0 +1,47 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang='en'>
|
||||
<head>
|
||||
<title>fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.</title>
|
||||
<meta name='generator' content='cgit v1.0-41-gc330'/>
|
||||
<meta name='robots' content='index, nofollow'/>
|
||||
<link rel='stylesheet' type='text/css' href='/cgit/cgit.css'/>
|
||||
<link rel='shortcut icon' href='/gitweb/git-favicon.png'/>
|
||||
<link rel='alternate' title='Atom feed' href='http://git.savannah.gnu.org/cgit/fbvbconv-py.git/atom/?h=master' type='application/atom+xml'/>
|
||||
<link rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
|
||||
<link rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
|
||||
<link rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
|
||||
</head>
|
||||
<body>
|
||||
<div id='cgit'><table id='header'>
|
||||
<tr>
|
||||
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit/cgit.png' alt='cgit logo'/></a></td>
|
||||
<td class='main'><a href='/cgit/'>index</a> : <a title='fbvbconv-py.git' href='/cgit/fbvbconv-py.git/'>fbvbconv-py.git</a></td><td class='form'><form method='get'>
|
||||
<select name='h' onchange='this.form.submit();'>
|
||||
<option value='master' selected='selected'>master</option>
|
||||
</select> <input type='submit' value='switch'/></form></td></tr>
|
||||
<tr><td class='sub'>Unnamed repository; edit this file 'description' to name the repository.</td><td class='sub right'></td></tr></table>
|
||||
<table class='tabs'><tr><td>
|
||||
<a class='active' href='/cgit/fbvbconv-py.git/'>summary</a><a href='/cgit/fbvbconv-py.git/refs/'>refs</a><a href='/cgit/fbvbconv-py.git/log/'>log</a><a href='/cgit/fbvbconv-py.git/tree/'>tree</a><a href='/cgit/fbvbconv-py.git/commit/'>commit</a><a href='/cgit/fbvbconv-py.git/diff/'>diff</a></td><td class='form'><form class='right' method='get' action='/cgit/fbvbconv-py.git/log/'>
|
||||
<select name='qt'>
|
||||
<option value='grep'>log msg</option>
|
||||
<option value='author'>author</option>
|
||||
<option value='committer'>committer</option>
|
||||
<option value='range'>range</option>
|
||||
</select>
|
||||
<input class='txt' type='text' size='10' name='q' value=''/>
|
||||
<input type='submit' value='search'/>
|
||||
</form>
|
||||
</td></tr></table>
|
||||
<div class='content'><table summary='repository info' class='list nowrap'><tr class='nohover'><th class='left'>Branch</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left' colspan='2'>Age</th></tr>
|
||||
<tr><td><a href='/cgit/fbvbconv-py.git/log/'>master</a></td><td><a href='/cgit/fbvbconv-py.git/commit/'>initial import</a></td><td>Johannes Stezenbach</td><td colspan='2'><span class='age-years' title='2017-06-02 09:57:38 +0200'>2 years</span></td></tr>
|
||||
<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left'>Age</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left'>Files</th><th class='left'>Lines</th></tr>
|
||||
<tr><td><span title='2017-06-02 09:57:38 +0200'>2017-06-02</span></td><td><a href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>initial import</a><span class='decoration'><a class='deco' href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>HEAD</a><a class='branch-deco' href='/cgit/fbvbconv-py.git/log/'>master</a></span></td><td>Johannes Stezenbach</td><td>3</td><td><span class='deletions'>-0</span>/<span class='insertions'>+889</span></td></tr>
|
||||
<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left' colspan='5'>Clone</th></tr>
|
||||
<tr><td colspan='5'><a rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>git://git.savannah.gnu.org/fbvbconv-py.git</a></td></tr>
|
||||
<tr><td colspan='5'><a rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>https://git.savannah.gnu.org/git/fbvbconv-py.git</a></td></tr>
|
||||
<tr><td colspan='5'><a rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git</a></td></tr>
|
||||
</table></div> <!-- class=content -->
|
||||
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.0-41-gc330</a> at 2019-06-19 10:51:46 +0000</div>
|
||||
</div> <!-- id=cgit -->
|
||||
</body>
|
||||
</html>
|
1
swh/lister/cgit/tests/conftest.py
Normal file
1
swh/lister/cgit/tests/conftest.py
Normal file
|
@ -0,0 +1 @@
|
|||
from swh.lister.core.tests.conftest import * # noqa
|
40
swh/lister/cgit/tests/test_lister.py
Normal file
40
swh/lister/cgit/tests/test_lister.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
# Copyright (C) 2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
|
||||
|
||||
|
||||
def test_find_all_origin_url():
    """All the clone urls of the sample repo page are found."""
    # Local import: the fixture is located next to this test module so the
    # test no longer depends on the directory pytest is started from.
    from pathlib import Path

    fixture = Path(__file__).parent / 'api_response.html'
    # The context manager closes the file, which used to be left open.
    with fixture.open() as f:
        soup = BeautifulSoup(f.read(), features="html.parser")

    expected_output = {'https': 'https://git.savannah.gnu.org/git/'
                                'fbvbconv-py.git',
                       'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
                              'fbvbconv-py.git',
                       'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    output = find_all_origin_url(soup)

    for protocol, url in expected_output.items():
        assert url == output[protocol]
|
||||
|
||||
|
||||
def test_priority_origin_url():
    """https wins over git, git wins over ssh, no url gives None."""
    https_and_git = {
        'https': 'https://kernel.googlesource.com/pub/scm/docs/'
                 'man-pages/man-pages.git',
        'git': 'git://git.kernel.org/pub/scm/docs/man-pages/'
               'man-pages.git',
    }
    best = priority_origin_url(https_and_git)
    assert best == ('https://kernel.googlesource.com/pub/scm/docs/man-pages/'
                    'man-pages.git')

    git_and_ssh = {
        'git': 'git://git.savannah.gnu.org/perl-pesel.git',
        'ssh': 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git',
    }
    best = priority_origin_url(git_and_ssh)
    assert best == 'git://git.savannah.gnu.org/perl-pesel.git'

    # Without any known protocol there is nothing to return.
    assert priority_origin_url({}) is None
|
29
swh/lister/cgit/tests/test_tasks.py
Normal file
29
swh/lister/cgit/tests/test_tasks.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
from unittest.mock import patch
|
||||
|
||||
|
||||
def test_ping(swh_app, celery_session_worker):
    """The ping task of the cgit lister runs and answers 'OK'."""
    task_name = 'swh.lister.cgit.tasks.ping'
    result = swh_app.send_task(task_name)
    assert result

    # Block until the worker is done with the task.
    result.wait()
    assert result.successful()
    assert result.result == 'OK'
|
||||
|
||||
|
||||
@patch('swh.lister.cgit.tasks.CGitLister')
def test_lister(lister, swh_app, celery_session_worker):
    """The lister task builds a CGitLister with the defaults and runs it."""
    # The patched class returns itself, so the calls made on the instance
    # are recorded on the very same mock.
    lister.return_value = lister
    lister.run.return_value = None

    task_name = 'swh.lister.cgit.tasks.CGitListerTask'
    result = swh_app.send_task(task_name)
    assert result
    result.wait()
    assert result.successful()

    expected_kwargs = {
        'base_url': 'https://git.savannah.gnu.org/cgit/',
        'instance': 'savannah-gnu',
    }
    lister.assert_called_once_with(**expected_kwargs)
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()
|
|
@ -12,7 +12,7 @@ from swh.core.cli import CONTEXT_SETTINGS
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
|
||||
'npm', 'phabricator', 'gnu', 'cran']
|
||||
'npm', 'phabricator', 'gnu', 'cran', 'cgit']
|
||||
|
||||
|
||||
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
|
||||
|
@ -125,6 +125,13 @@ def cli(ctx, db_url, listers, drop_tables):
|
|||
from .cran.lister import CRANLister
|
||||
_lister = CRANLister(override_config=override_conf)
|
||||
|
||||
elif lister == 'cgit':
|
||||
from .cgit.models import ModelBase
|
||||
from .cgit.lister import CGitLister
|
||||
_lister = CGitLister(
|
||||
base_url='http://git.savannah.gnu.org/cgit/',
|
||||
override_config=override_conf)
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
'Invalid lister %s: only supported listers are %s' %
|
||||
|
|
|
@ -6,6 +6,7 @@ from swh.scheduler.tests.conftest import * # noqa
|
|||
def celery_includes():
|
||||
return [
|
||||
'swh.lister.bitbucket.tasks',
|
||||
'swh.lister.cgit.tasks',
|
||||
'swh.lister.cran.tasks',
|
||||
'swh.lister.debian.tasks',
|
||||
'swh.lister.github.tasks',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue