swh.lister.cgit: Remove repo page visit step

Remove the need to visit every repository page to extract the
origin url, by introducing a new parameter, url_prefix.
The origin url has the format <prefix>/<repo_name>, where
the prefix is the same for all the repos of a particular
cgit instance.
This commit is contained in:
Archit Agrawal 2019-06-28 18:15:23 +05:30
parent 7e3c79bb1d
commit 0bf24469b7
9 changed files with 215 additions and 223 deletions

View file

@ -3,8 +3,8 @@
# See top-level LICENSE file for more information
import random
import logging
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
from urllib.parse import urlparse
@ -18,38 +18,60 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
MODEL = CGitModel
LISTER_NAME = 'cgit'
PAGE = None
url_prefix_present = True
def __init__(self, base_url, instance=None, override_config=None):
def __init__(self, url, instance=None, url_prefix=None,
override_config=None):
"""Inits Class with PAGE url and origin url prefix.
self.PAGE = base_url
Args:
url (str): URL of the CGit instance.
instance (str): Name of cgit instance.
url_prefix (str): Prefix of the origin_url. Origin link of the
repos of some special instances do not match
the url of the repository page, they have origin
url in the format <url_prefix>/<repo_name>.
"""
self.PAGE = url
if url_prefix is None:
self.url_prefix = url
self.url_prefix_present = False
else:
self.url_prefix = url_prefix
if not self.url_prefix.endswith('/'):
self.url_prefix += '/'
url = urlparse(self.PAGE)
self.url_netloc = find_netloc(url)
if not instance:
instance = url.hostname
self.instance = instance
ListerOnePageApiTransport .__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def list_packages(self, response):
"""List the actual cgit instance origins from the response.
Find the repos in all the pages by parsing over the HTML of
the `base_url`. Find the details for all the repos and return
them in the format of list of dictionaries.
Find repositories metadata by parsing the html page (response's raw
content). If there are links in the html page, retrieve those
repositories metadata from those pages as well. Return the
repositories as list of dictionaries.
Args:
response (Response): http api request response.
Returns:
List of repository origin urls (as dict) included in the response.
"""
repos_details = []
repos = get_repo_list(response)
soup = make_repo_soup(response)
pages = self.get_page(soup)
if len(pages) > 1:
repos.extend(self.get_all_pages(pages))
for repo in repos:
for repo in self.yield_repo_from_responses(response):
repo_name = repo.a.text
repo_url = self.get_url(repo)
origin_url = find_origin_url(repo_url)
origin_url = self.find_origin_url(repo, repo_name)
try:
time = repo.span['title']
@ -58,57 +80,93 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
if origin_url is not None:
repos_details.append({
'name': repo_name,
'time': time,
'origin_url': origin_url,
})
'name': repo_name,
'time': time,
'origin_url': origin_url,
})
random.shuffle(repos_details)
return repos_details
def get_page(self, soup):
"""Find URL of all pages
def yield_repo_from_responses(self, response):
    """Yield the repository rows found on the pages of the cgit instance.

    The rows of the page held by `response` are yielded first. The
    pagination links of that same page are then looked up and, when
    more than one page is listed, the rows of every page but the first
    are yielded as well.

    Args:
        response (Response): server response.

    Yields:
        beautifulsoup object of a repository row.

    """
    first_page_html = response.text
    for repo_row in get_repo_list(first_page_html):
        yield repo_row

    page_urls = self.get_pages(make_soup(first_page_html))
    if len(page_urls) <= 1:
        return
    # The first entry is skipped -- presumably it is the page whose rows
    # were already yielded above (confirm against get_pages).
    yield from self.get_repos_from_pages(page_urls[1:])
def find_origin_url(self, repo, repo_name):
    """Return the origin url of a repository.

    When `url_prefix_present` is set, the origin url is `url_prefix`
    followed by the repository name. Otherwise it is whatever `get_url`
    returns for the repository row.

    Args:
        repo (Beautifulsoup): Beautifulsoup object of the repository
            row present in base url.
        repo_name (str): Repository name.

    Returns:
        string: origin url.

    """
    if not self.url_prefix_present:
        return self.get_url(repo)
    return self.url_prefix + repo_name
def get_pages(self, url_soup):
"""Find URL of all pages.
Finds URL of pages that are present by parsing over the HTML of
pagination present at the end of the page.
Args:
soup (Beautifulsoup): a beautifulsoup object of base URL
url_soup (Beautifulsoup): a beautifulsoup object of base URL
Returns:
list: URL of all the pages present for a cgit instance
list: URL of pages present for a cgit instance
"""
pages = soup.find('div', {"class": "content"}).find_all('li')
pages = url_soup.find('div', {"class": "content"}).find_all('li')
if not pages:
return [self.PAGE]
return [self.get_url(page) for page in pages]
def get_all_pages(self, pages):
"""Find repos from all the pages
def get_repos_from_pages(self, pages):
"""Find repos from all pages.
Make the request for all the pages (except the first) present for a
particular cgit instance and finds the repos that are available
for each and every page.
Request the available repos from the pages. This yields
the available repositories found as beautiful object representation.
Args:
pages ([str]): list of urls of all the pages present for a
particular cgit instance
pages ([str]): list of urls of all pages present for a
particular cgit instance.
Returns:
List of beautifulsoup object of all the repositories (url) row
present in all the pages(except first).
Yields:
List of beautifulsoup object of repository (url) rows
present in pages(except first).
"""
all_repos = []
for page in pages[1:]:
for page in pages:
response = requests.get(page)
repos = get_repo_list(response)
all_repos.extend(repos)
if not response.ok:
logging.warning('Failed to retrieve repositories from page %s',
page)
continue
return all_repos
yield from get_repo_list(response.text)
def get_url(self, repo):
"""Finds url of a repo page.
@ -150,19 +208,19 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
def find_netloc(url):
"""Finds the network location from then base_url
"""Finds the network location from then url.
All the url in the repo are relative to the network location part of base
url, so we need to compute it to reconstruct all the urls.
URL in the repo are relative to the network location part of base
URL, so we need to compute it to reconstruct URLs.
Args:
url (urllib): urllib object of base_url
url (urllib): urllib object of url.
Returns:
string: Scheme and Network location part in the base URL.
Example:
For base_url = https://git.kernel.org/pub/scm/
For url = https://git.kernel.org/pub/scm/
>>> find_netloc(url)
'https://git.kernel.org'
@ -171,106 +229,23 @@ def find_netloc(url):
def get_repo_list(response):
"""Find all the rows with repo for a particualar page on the base url
Finds all the repos on page and retuens a list of all the repos. Each
element of the list is a beautifulsoup object representing a repo.
"""Find repositories (as beautifulsoup object) available within the server
response.
Args:
response (Response): server response
Returns:
List of all the repos on a page.
List all repositories as beautifulsoup object within the response.
"""
repo_soup = make_repo_soup(response)
repo_soup = make_soup(response)
return repo_soup \
.find('div', {"class": "content"}).find_all("tr", {"class": ""})
def make_repo_soup(response):
"""Makes BeautifulSoup object of the response
def make_soup(response):
"""Instantiates a beautiful soup object from the response object.
"""
return BeautifulSoup(response.text, features="html.parser")
def find_origin_url(repo_url):
    """Return the preferred origin url of a repository.

    The repository page is fetched, every origin url found on it is
    collected and the one with the highest priority is returned.

    Args:
        repo_url: URL of the repo.

    Returns:
        string: Origin url for the repo.

    Examples:
        >>> find_origin_url(
        ...     'http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
        'https://git.savannah.gnu.org/git/fbvbconv-py.git'

    """
    page_soup = make_repo_soup(requests.get(repo_url))
    return priority_origin_url(find_all_origin_url(page_soup))
def find_all_origin_url(soup):
    """Collect every origin url listed on a repository page.

    The `tr` rows of the page are scanned until one whose text is exactly
    'Clone' is met; the text of each row after it is recorded as an
    origin url, keyed by whatever precedes its first ':'.

    Args:
        soup: a beautifulsoup object repo representation.

    Returns:
        dictionary: All possible origin urls for a repository (dict with
        key 'protocol', value the associated url).

    Examples:
        If soup is beautifulsoup object of the html code at
        http://git.savannah.gnu.org/cgit/fbvbconv-py.git/

        >>> print(find_all_origin_url(soup))
        { 'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
          'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git',
          'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    """
    urls_by_protocol = defaultdict(dict)
    rows = iter(soup.find_all('tr'))

    # Phase 1: consume rows up to and including the 'Clone' header.
    for row in rows:
        if row.text == 'Clone':
            break

    # Phase 2: every remaining row is taken as an origin url.
    for row in rows:
        url = row.text
        colon_at = url.find(':')
        urls_by_protocol[url[:colon_at]] = url

    return urls_by_protocol
def priority_origin_url(origin_url):
    """Pick the preferred url among the origin urls of a repository.

    Protocols are tried in the order https, http, git, ssh and the url
    of the first one present is returned.

    Args:
        origin_url (Dict): All possible origin urls for a repository
            (key 'protocol', value the associated url)

    Returns:
        Url (str) with the highest priority, or None when none of the
        four protocols is present.

    """
    ranked_protocols = ('https', 'http', 'git', 'ssh')
    candidates = (
        origin_url[protocol]
        for protocol in ranked_protocols
        if protocol in origin_url
    )
    return next(candidates, None)
return BeautifulSoup(response, features="html.parser")

View file

@ -7,9 +7,11 @@ from swh.scheduler.celery_backend.config import app
from .lister import CGitLister
def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
instance='savannah-gnu', **kw):
return CGitLister(base_url=base_url, instance=instance, **kw)
def new_lister(url='https://git.kernel.org/',
               url_prefix=None,
               instance='kernel', **kw):
    """Build the CGitLister used by the cgit lister task.

    Args:
        url (str): URL of the cgit instance to list.
        url_prefix (str): Prefix of the origin urls, forwarded as is to
            the lister (None by default).
        instance (str): Name of the cgit instance. The default matches
            the default url (git.kernel.org).
        **kw: Extra keyword arguments forwarded to CGitLister.

    Returns:
        The CGitLister instance.

    """
    return CGitLister(url=url, instance=instance, url_prefix=url_prefix,
                      **kw)
@app.task(name=__name__ + '.CGitListerTask')

View file

@ -1,47 +0,0 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<title>fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.</title>
<meta name='generator' content='cgit v1.0-41-gc330'/>
<meta name='robots' content='index, nofollow'/>
<link rel='stylesheet' type='text/css' href='/cgit/cgit.css'/>
<link rel='shortcut icon' href='/gitweb/git-favicon.png'/>
<link rel='alternate' title='Atom feed' href='http://git.savannah.gnu.org/cgit/fbvbconv-py.git/atom/?h=master' type='application/atom+xml'/>
<link rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
<link rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
<link rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
</head>
<body>
<div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit/cgit.png' alt='cgit logo'/></a></td>
<td class='main'><a href='/cgit/'>index</a> : <a title='fbvbconv-py.git' href='/cgit/fbvbconv-py.git/'>fbvbconv-py.git</a></td><td class='form'><form method='get'>
<select name='h' onchange='this.form.submit();'>
<option value='master' selected='selected'>master</option>
</select> <input type='submit' value='switch'/></form></td></tr>
<tr><td class='sub'>Unnamed repository; edit this file 'description' to name the repository.</td><td class='sub right'></td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/cgit/fbvbconv-py.git/'>summary</a><a href='/cgit/fbvbconv-py.git/refs/'>refs</a><a href='/cgit/fbvbconv-py.git/log/'>log</a><a href='/cgit/fbvbconv-py.git/tree/'>tree</a><a href='/cgit/fbvbconv-py.git/commit/'>commit</a><a href='/cgit/fbvbconv-py.git/diff/'>diff</a></td><td class='form'><form class='right' method='get' action='/cgit/fbvbconv-py.git/log/'>
<select name='qt'>
<option value='grep'>log msg</option>
<option value='author'>author</option>
<option value='committer'>committer</option>
<option value='range'>range</option>
</select>
<input class='txt' type='text' size='10' name='q' value=''/>
<input type='submit' value='search'/>
</form>
</td></tr></table>
<div class='content'><table summary='repository info' class='list nowrap'><tr class='nohover'><th class='left'>Branch</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left' colspan='2'>Age</th></tr>
<tr><td><a href='/cgit/fbvbconv-py.git/log/'>master</a></td><td><a href='/cgit/fbvbconv-py.git/commit/'>initial import</a></td><td>Johannes Stezenbach</td><td colspan='2'><span class='age-years' title='2017-06-02 09:57:38 +0200'>2 years</span></td></tr>
<tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><th class='left'>Age</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left'>Files</th><th class='left'>Lines</th></tr>
<tr><td><span title='2017-06-02 09:57:38 +0200'>2017-06-02</span></td><td><a href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>initial import</a><span class='decoration'><a class='deco' href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>HEAD</a><a class='branch-deco' href='/cgit/fbvbconv-py.git/log/'>master</a></span></td><td>Johannes Stezenbach</td><td>3</td><td><span class='deletions'>-0</span>/<span class='insertions'>+889</span></td></tr>
<tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><th class='left' colspan='5'>Clone</th></tr>
<tr><td colspan='5'><a rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>git://git.savannah.gnu.org/fbvbconv-py.git</a></td></tr>
<tr><td colspan='5'><a rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>https://git.savannah.gnu.org/git/fbvbconv-py.git</a></td></tr>
<tr><td colspan='5'><a rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git</a></td></tr>
</table></div> <!-- class=content -->
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.0-41-gc330</a> at 2019-06-19 10:51:46 +0000</div>
</div> <!-- id=cgit -->
</body>
</html>

View file

@ -0,0 +1,15 @@
<tr><td class="toplevel-repo"><a href="/openembedded-core/" title="openembedded-core">openembedded-core</a></td><td><a href="/openembedded-core/">OpenEmbedded Core layer</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-hours" title="2019-06-26 13:04:31 +0000">5 hours</span></td><td><a class="button" href="/openembedded-core/">summary</a><a class="button" href="/openembedded-core/log/">log</a><a class="button" href="/openembedded-core/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/openembedded-core-contrib/" title="openembedded-core-contrib">openembedded-core-contrib</a></td><td><a href="/openembedded-core-contrib/">OpenEmbedded Core user contribution trees</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-hours" title="2019-06-26 12:57:22 +0000">5 hours</span></td><td><a class="button" href="/openembedded-core-contrib/">summary</a><a class="button" href="/openembedded-core-contrib/log/">log</a><a class="button" href="/openembedded-core-contrib/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/meta-openembedded/" title="meta-openembedded">meta-openembedded</a></td><td><a href="/meta-openembedded/">Collection of OpenEmbedded layers</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-hours" title="2019-06-25 21:22:33 +0000">21 hours</span></td><td><a class="button" href="/meta-openembedded/">summary</a><a class="button" href="/meta-openembedded/log/">log</a><a class="button" href="/meta-openembedded/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/meta-openembedded-contrib/" title="meta-openembedded-contrib">meta-openembedded-contrib</a></td><td><a href="/meta-openembedded-contrib/">OpenEmbedded layers collection contribution trees</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-hours" title="2019-06-25 21:22:33 +0000">21 hours</span></td><td><a class="button" href="/meta-openembedded-contrib/">summary</a><a class="button" href="/meta-openembedded-contrib/log/">log</a><a class="button" href="/meta-openembedded-contrib/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/bitbake/" title="bitbake">bitbake</a></td><td><a href="/bitbake/">Bitbake Development tree</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-days" title="2019-06-19 17:12:23 +0000">7 days</span></td><td><a class="button" href="/bitbake/">summary</a><a class="button" href="/bitbake/log/">log</a><a class="button" href="/bitbake/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/bitbake-contrib/" title="bitbake-contrib">bitbake-contrib</a></td><td><a href="/bitbake-contrib/">Bitbake user contribution trees</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-days" title="2019-06-18 15:30:38 +0000">8 days</span></td><td><a class="button" href="/bitbake-contrib/">summary</a><a class="button" href="/bitbake-contrib/log/">log</a><a class="button" href="/bitbake-contrib/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/meta-handheld/" title="meta-handheld">meta-handheld</a></td><td><a href="/meta-handheld/">Handheld device meta layer</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-months" title="2018-10-01 21:25:11 +0000">9 months</span></td><td><a class="button" href="/meta-handheld/">summary</a><a class="button" href="/meta-handheld/log/">log</a><a class="button" href="/meta-handheld/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/meta-opie/" title="meta-opie">meta-opie</a></td><td><a href="/meta-opie/">OPIE meta layer</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2016-06-12 03:58:09 +0000">3 years</span></td><td><a class="button" href="/meta-opie/">summary</a><a class="button" href="/meta-opie/log/">log</a><a class="button" href="/meta-opie/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/openembedded/" title="openembedded">openembedded</a></td><td><a href="/openembedded/">Classic OpenEmbedded Development Tree</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2015-05-05 08:44:03 +0000">4 years</span></td><td><a class="button" href="/openembedded/">summary</a><a class="button" href="/openembedded/log/">log</a><a class="button" href="/openembedded/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/openembedded-web-frontpages/" title="openembedded-web-frontpages">openembedded-web-frontpages</a></td><td><a href="/openembedded-web-frontpages/">OpenEmbedded Website Source Code</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2014-08-24 13:39:24 +0000">5 years</span></td><td><a class="button" href="/openembedded-web-frontpages/">summary</a><a class="button" href="/openembedded-web-frontpages/log/">log</a><a class="button" href="/openembedded-web-frontpages/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/openembedded-admin/" title="openembedded-admin">openembedded-admin</a></td><td><a href="/openembedded-admin/">OE Admin tools</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2013-10-21 21:20:18 +0000">6 years</span></td><td><a class="button" href="/openembedded-admin/">summary</a><a class="button" href="/openembedded-admin/log/">log</a><a class="button" href="/openembedded-admin/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/meta-micro/" title="meta-micro">meta-micro</a></td><td><a href="/meta-micro/">Micro distribution meta layer</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2012-09-08 21:51:18 +0000">7 years</span></td><td><a class="button" href="/meta-micro/">summary</a><a class="button" href="/meta-micro/log/">log</a><a class="button" href="/meta-micro/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/eclipsetools/" title="eclipsetools">eclipsetools</a></td><td><a href="/eclipsetools/">Eclipse tools for OpenEmbedded</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2011-11-05 09:35:20 +0000">8 years</span></td><td><a class="button" href="/eclipsetools/">summary</a><a class="button" href="/eclipsetools/log/">log</a><a class="button" href="/eclipsetools/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/oetest/" title="oetest">oetest</a></td><td><a href="/oetest/">Test utilities for OpenEmbedded</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td><span class="age-years" title="2009-08-14 14:10:25 +0000">10 years</span></td><td><a class="button" href="/oetest/">summary</a><a class="button" href="/oetest/log/">log</a><a class="button" href="/oetest/tree/">tree</a></td></tr>
<tr><td class="toplevel-repo"><a href="/oebuildstats/" title="oebuildstats">oebuildstats</a></td><td><a href="/oebuildstats/">OE Build Stats</a></td><td><a href="/?q=OpenEmbedded">OpenEmbedded</a></td><td></td><td><a class="button" href="/oebuildstats/">summary</a><a class="button" href="/oebuildstats/log/">log</a><a class="button" href="/oebuildstats/tree/">tree</a></td></tr>

View file

@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<title>OpenEmbedded Git Repository Browser</title>
<meta name='generator' content='cgit v1.2'/>
<meta name='robots' content='index, nofollow'/>
<link rel='stylesheet' type='text/css' href='/cgit.css'/>
<link rel='shortcut icon' href='/favicon.ico'/>
</head>
<body>
<div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/'><img src='/oe.png' alt='cgit logo'/></a></td>
<td class='main'>OpenEmbedded Git Repository Browser</td></tr>
<tr><td class='sub'>A web frontend for git repositories</td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/'>index</a></td><td class='form'><form method='get' action='/'>
<input type='search' name='q' size='10' value=''/>
<input type='submit' value='search'/>
</form></td></tr></table>
<div class='content'><table summary='repository list' class='list nowrap'><tr class='nohover'><th class='left'><a href='/?s=name'>Name</a></th><th class='left'><a href='/?s=desc'>Description</a></th><th class='left'><a href='/?s=owner'>Owner</a></th><th class='left'><a href='/?s=idle'>Idle</a></th><th class='left'>Links</th></tr>
<tr><td class='toplevel-repo'><a title='openembedded-core' href='/openembedded-core/'>openembedded-core</a></td><td><a href='/openembedded-core/'>OpenEmbedded Core layer</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-hours' title='2019-06-26 13:04:31 +0000'>5 hours</span></td><td><a class='button' href='/openembedded-core/'>summary</a><a class='button' href='/openembedded-core/log/'>log</a><a class='button' href='/openembedded-core/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='openembedded-core-contrib' href='/openembedded-core-contrib/'>openembedded-core-contrib</a></td><td><a href='/openembedded-core-contrib/'>OpenEmbedded Core user contribution trees</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-hours' title='2019-06-26 12:57:22 +0000'>5 hours</span></td><td><a class='button' href='/openembedded-core-contrib/'>summary</a><a class='button' href='/openembedded-core-contrib/log/'>log</a><a class='button' href='/openembedded-core-contrib/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='meta-openembedded' href='/meta-openembedded/'>meta-openembedded</a></td><td><a href='/meta-openembedded/'>Collection of OpenEmbedded layers</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-hours' title='2019-06-25 21:22:33 +0000'>21 hours</span></td><td><a class='button' href='/meta-openembedded/'>summary</a><a class='button' href='/meta-openembedded/log/'>log</a><a class='button' href='/meta-openembedded/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='meta-openembedded-contrib' href='/meta-openembedded-contrib/'>meta-openembedded-contrib</a></td><td><a href='/meta-openembedded-contrib/'>OpenEmbedded layers collection contribution trees</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-hours' title='2019-06-25 21:22:33 +0000'>21 hours</span></td><td><a class='button' href='/meta-openembedded-contrib/'>summary</a><a class='button' href='/meta-openembedded-contrib/log/'>log</a><a class='button' href='/meta-openembedded-contrib/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='bitbake' href='/bitbake/'>bitbake</a></td><td><a href='/bitbake/'>Bitbake Development tree</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-days' title='2019-06-19 17:12:23 +0000'>7 days</span></td><td><a class='button' href='/bitbake/'>summary</a><a class='button' href='/bitbake/log/'>log</a><a class='button' href='/bitbake/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='bitbake-contrib' href='/bitbake-contrib/'>bitbake-contrib</a></td><td><a href='/bitbake-contrib/'>Bitbake user contribution trees</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-days' title='2019-06-18 15:30:38 +0000'>8 days</span></td><td><a class='button' href='/bitbake-contrib/'>summary</a><a class='button' href='/bitbake-contrib/log/'>log</a><a class='button' href='/bitbake-contrib/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='meta-handheld' href='/meta-handheld/'>meta-handheld</a></td><td><a href='/meta-handheld/'>Handheld device meta layer</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-months' title='2018-10-01 21:25:11 +0000'>9 months</span></td><td><a class='button' href='/meta-handheld/'>summary</a><a class='button' href='/meta-handheld/log/'>log</a><a class='button' href='/meta-handheld/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='meta-opie' href='/meta-opie/'>meta-opie</a></td><td><a href='/meta-opie/'>OPIE meta layer</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2016-06-12 03:58:09 +0000'>3 years</span></td><td><a class='button' href='/meta-opie/'>summary</a><a class='button' href='/meta-opie/log/'>log</a><a class='button' href='/meta-opie/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='openembedded' href='/openembedded/'>openembedded</a></td><td><a href='/openembedded/'>Classic OpenEmbedded Development Tree</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2015-05-05 08:44:03 +0000'>4 years</span></td><td><a class='button' href='/openembedded/'>summary</a><a class='button' href='/openembedded/log/'>log</a><a class='button' href='/openembedded/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='openembedded-web-frontpages' href='/openembedded-web-frontpages/'>openembedded-web-frontpages</a></td><td><a href='/openembedded-web-frontpages/'>OpenEmbedded Website Source Code</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2014-08-24 13:39:24 +0000'>5 years</span></td><td><a class='button' href='/openembedded-web-frontpages/'>summary</a><a class='button' href='/openembedded-web-frontpages/log/'>log</a><a class='button' href='/openembedded-web-frontpages/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='openembedded-admin' href='/openembedded-admin/'>openembedded-admin</a></td><td><a href='/openembedded-admin/'>OE Admin tools</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2013-10-21 21:20:18 +0000'>6 years</span></td><td><a class='button' href='/openembedded-admin/'>summary</a><a class='button' href='/openembedded-admin/log/'>log</a><a class='button' href='/openembedded-admin/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='meta-micro' href='/meta-micro/'>meta-micro</a></td><td><a href='/meta-micro/'>Micro distribution meta layer</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2012-09-08 21:51:18 +0000'>7 years</span></td><td><a class='button' href='/meta-micro/'>summary</a><a class='button' href='/meta-micro/log/'>log</a><a class='button' href='/meta-micro/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='eclipsetools' href='/eclipsetools/'>eclipsetools</a></td><td><a href='/eclipsetools/'>Eclipse tools for OpenEmbedded</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2011-11-05 09:35:20 +0000'>8 years</span></td><td><a class='button' href='/eclipsetools/'>summary</a><a class='button' href='/eclipsetools/log/'>log</a><a class='button' href='/eclipsetools/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='oetest' href='/oetest/'>oetest</a></td><td><a href='/oetest/'>Test utilities for OpenEmbedded</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td><span class='age-years' title='2009-08-14 14:10:25 +0000'>10 years</span></td><td><a class='button' href='/oetest/'>summary</a><a class='button' href='/oetest/log/'>log</a><a class='button' href='/oetest/tree/'>tree</a></td></tr>
<tr><td class='toplevel-repo'><a title='oebuildstats' href='/oebuildstats/'>oebuildstats</a></td><td><a href='/oebuildstats/'>OE Build Stats</a></td><td><a href='/?q=OpenEmbedded'>OpenEmbedded</a></td><td></td><td><a class='button' href='/oebuildstats/'>summary</a><a class='button' href='/oebuildstats/log/'>log</a><a class='button' href='/oebuildstats/tree/'>tree</a></td></tr>
</table></div> <!-- class=content -->
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.2</a> (<a href='https://git-scm.com/'>git 2.18.0</a>) at 2019-06-26 18:03:12 +0000</div>
</div> <!-- id=cgit -->
</body>
</html>

View file

@ -3,43 +3,20 @@
# See top-level LICENSE file for more information
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
from swh.lister.cgit.lister import find_netloc
from swh.lister.cgit.lister import find_netloc, get_repo_list
def test_find_all_origin_url():
    """Check the origin urls extracted from a stored repository page."""
    # The context manager closes the fixture file once it has been parsed.
    with open('swh/lister/cgit/tests/api_response.html') as f:
        soup = BeautifulSoup(f.read(), features="html.parser")
    expected_output = {'https': 'https://git.savannah.gnu.org/git/'
                                'fbvbconv-py.git',
                       'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
                              'fbvbconv-py.git',
                       'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    output = find_all_origin_url(soup)

    for protocol, url in expected_output.items():
        assert url == output[protocol]
def test_priority_origin_url():
    """Check which url is preferred for several sets of origin urls."""
    # https is available: it wins over git.
    https_and_git = {
        'https': 'https://kernel.googlesource.com/pub/scm/docs/'
                 'man-pages/man-pages.git',
        'git': 'git://git.kernel.org/pub/scm/docs/man-pages/'
               'man-pages.git',
    }
    picked = priority_origin_url(https_and_git)
    assert (picked ==
            'https://kernel.googlesource.com/pub/scm/docs/man-pages/'
            'man-pages.git')

    # Neither https nor http: git wins over ssh.
    git_and_ssh = {
        'git': 'git://git.savannah.gnu.org/perl-pesel.git',
        'ssh': 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git',
    }
    picked = priority_origin_url(git_and_ssh)
    assert picked == 'git://git.savannah.gnu.org/perl-pesel.git'

    # Nothing to pick from.
    no_urls = {}
    assert priority_origin_url(no_urls) is None
def test_get_repo_list():
    """Check the repository rows parsed from a stored cgit index page.

    Each parsed row, rendered with str(), is compared to the line at the
    same position in the expected listing file.
    """
    # Context managers close both fixture files once they have been read.
    with open('swh/lister/cgit/tests/response.html') as f:
        repos = get_repo_list(f.read())
    with open('swh/lister/cgit/tests/repo_list.txt') as f:
        # Strip only the line terminator, so a last line without a
        # trailing newline keeps its final character.
        expected_repos = [line.rstrip('\n') for line in f]

    assert len(repos) == len(expected_repos)
    for repo, expected_repo in zip(repos, expected_repos):
        assert str(repo) == expected_repo
def test_find_netloc():

View file

@ -11,19 +11,43 @@ def test_ping(swh_app, celery_session_worker):
@patch('swh.lister.cgit.tasks.CGitLister')
def test_lister(lister, swh_app, celery_session_worker):
def test_lister_no_url_prefix(lister, swh_app, celery_session_worker):
# setup the mocked CGitLister
lister.return_value = lister
lister.run.return_value = None
res = swh_app.send_task(
'swh.lister.cgit.tasks.CGitListerTask')
'swh.lister.cgit.tasks.CGitListerTask',
kwargs=dict(url='https://git.kernel.org/', instance='kernel'))
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with(
base_url='https://git.savannah.gnu.org/cgit/',
instance='savannah-gnu')
url='https://git.kernel.org/',
url_prefix=None,
instance='kernel')
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with()
@patch('swh.lister.cgit.tasks.CGitLister')
def test_lister_with_url_prefix(lister, swh_app, celery_session_worker):
    """Run the cgit lister task with an explicit origin url prefix.

    The CGitLister class is mocked; the test checks that the task
    succeeds, that the lister is built with the task's keyword arguments
    and that it is run once.
    """
    # setup the mocked CGitLister: instantiating it returns the mock itself
    lister.return_value = lister
    lister.run.return_value = None

    task_kwargs = dict(url='https://cgit.kde.org/',
                       url_prefix='https://anongit.kde.org/',
                       instance='kde')

    res = swh_app.send_task(
        'swh.lister.cgit.tasks.CGitListerTask', kwargs=task_kwargs)
    assert res
    res.wait()
    assert res.successful()

    lister.assert_called_once_with(**task_kwargs)
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()

View file

@ -129,7 +129,8 @@ def cli(ctx, db_url, listers, drop_tables):
from .cgit.models import ModelBase
from .cgit.lister import CGitLister
_lister = CGitLister(
base_url='http://git.savannah.gnu.org/cgit/',
url='http://git.savannah.gnu.org/cgit/',
url_prefix='http://git.savannah.gnu.org/git/',
override_config=override_conf)
else: