From 7e3c79bb1d18e49ac446159066de33470528b809 Mon Sep 17 00:00:00 2001
From: Archit Agrawal
Date: Wed, 26 Jun 2019 15:41:32 +0530
Subject: [PATCH] swh.lister.cgit: Add pagination support

Some cgit instances are paginated. Modify the lister to find all the
pages and list the repos from all of them.
---
 README.md                            |   1 +
 swh/lister/cgit/lister.py            | 152 ++++++++++++++++++++++-----
 swh/lister/cgit/models.py            |   1 +
 swh/lister/cgit/tests/test_lister.py |  10 ++
 4 files changed, 136 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index acc86e2..f5c29d1 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ following Python modules:
 - `swh.lister.npm`
 - `swh.lister.phabricator`
 - `swh.lister.cran`
+- `swh.lister.cgit`
 
 Dependencies
 ------------
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
index a16f922..4f5db9b 100644
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -6,7 +6,7 @@ import random
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
-import urllib.parse
+from urllib.parse import urlparse
 
 from .models import CGitModel
 
@@ -17,22 +17,16 @@ from swh.lister.core.lister_transports import ListerOnePageApiTransport
 class CGitLister(ListerOnePageApiTransport, SimpleLister):
     MODEL = CGitModel
     LISTER_NAME = 'cgit'
-    PAGE = ''
+    PAGE = None
 
     def __init__(self, base_url, instance=None, override_config=None):
-        if not base_url.endswith('/'):
-            base_url = base_url+'/'
-        self.PAGE = base_url
-        # This part removes any suffix from the base url and stores it in
-        # next_url. For example for base_url = https://git.kernel.org/pub/scm/
-        # it will convert it into https://git.kernel.org and then attach
-        # the suffix
-        (part1, part2, next_url) = self.PAGE.split('/', 2)
-        self.next_url = part1 + '//' + next_url
+        self.PAGE = base_url
+        url = urlparse(self.PAGE)
+        self.url_netloc = find_netloc(url)
 
         if not instance:
-            instance = urllib.parse.urlparse(base_url).hostname
+            instance = url.hostname
         self.instance = instance
         ListerOnePageApiTransport .__init__(self)
         SimpleLister.__init__(self, override_config=override_config)
@@ -40,11 +34,18 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
     def list_packages(self, response):
         """List the actual cgit instance origins from the response.
 
+        Find the repos on all the pages by parsing the HTML of the
+        `base_url`, collect the details of every repo, and return
+        them as a list of dictionaries.
+
         """
         repos_details = []
-        soup = BeautifulSoup(response.text, features="html.parser") \
-            .find('div', {"class": "content"})
-        repos = soup.find_all("tr", {"class": ""})
+        repos = get_repo_list(response)
+        soup = make_repo_soup(response)
+        pages = self.get_page(soup)
+        if len(pages) > 1:
+            repos.extend(self.get_all_pages(pages))
+
         for repo in repos:
             repo_name = repo.a.text
             repo_url = self.get_url(repo)
@@ -60,11 +61,55 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
                 'name': repo_name,
                 'time': time,
                 'origin_url': origin_url,
-                })
+            })
 
         random.shuffle(repos_details)
         return repos_details
 
+    def get_page(self, soup):
+        """Find URLs of all pages
+
+        Finds the URLs of all the pages by parsing the pagination
+        HTML present at the end of the base page.
+
+        Args:
+            soup (Beautifulsoup): a beautifulsoup object of the base URL
+
+        Returns:
+            list: URLs of all the pages present for a cgit instance
+
+        """
+        pages = soup.find('div', {"class": "content"}).find_all('li')
+
+        if not pages:
+            return [self.PAGE]
+
+        return [self.get_url(page) for page in pages]
+
+    def get_all_pages(self, pages):
+        """Find repos from all the pages
+
+        Requests all the pages (except the first) present for a
+        particular cgit instance and finds the repos listed on
+        every page.
+
+        Args:
+            pages ([str]): list of urls of all the pages present for a
+                particular cgit instance
+
+        Returns:
+            List of beautifulsoup objects of the repository rows present
+            in all the pages (except the first).
+
+        """
+        all_repos = []
+        for page in pages[1:]:
+            response = requests.get(page)
+            repos = get_repo_list(response)
+            all_repos.extend(repos)
+
+        return all_repos
+
     def get_url(self, repo):
         """Finds url of a repo page.
 
@@ -72,14 +117,15 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
         Finds the url of a repo page by parsing over the html of the row of
         that repo present in the base url.
 
         Args:
-            repo: a beautifulsoup object of the html code of the repo row
-                present in base url.
+            repo (Beautifulsoup): a beautifulsoup object of the repository
+                row present in the base url.
 
         Returns:
             string: The url of a repo.
+
         """
         suffix = repo.a['href']
-        return self.next_url + suffix
+        return self.url_netloc + suffix
 
     def get_model_from_repo(self, repo):
         """Transform from repository representation to model.
 
@@ -93,13 +139,60 @@ class CGitLister(ListerOnePageApiTransport, SimpleLister):
             'origin_url': repo['origin_url'],
             'origin_type': 'git',
             'time_updated': repo['time'],
+            'instance': self.instance,
         }
 
-    def transport_response_simplified(self, response):
+    def transport_response_simplified(self, repos_details):
         """Transform response to list for model manipulation.
 
         """
-        return [self.get_model_from_repo(repo) for repo in response]
+        return [self.get_model_from_repo(repo) for repo in repos_details]
+
+
+def find_netloc(url):
+    """Finds the network location from the base_url
+
+    All the urls in the repo page are relative to the network location part
+    of the base url, so we need to compute it to reconstruct the urls.
+
+    Args:
+        url (urllib.parse.ParseResult): parsed base_url
+
+    Returns:
+        string: Scheme and network location part of the base URL.
+
+    Example:
+        For base_url = https://git.kernel.org/pub/scm/
+        >>> find_netloc(url)
+        'https://git.kernel.org'
+
+    """
+    return '%s://%s' % (url.scheme, url.netloc)
+
+
+def get_repo_list(response):
+    """Find all the rows with repos for a particular page of the base url
+
+    Finds all the repos on the page and returns a list of them. Each
+    element of the list is a beautifulsoup object representing a repo.
+
+    Args:
+        response (Response): server response
+
+    Returns:
+        List of all the repos on a page.
+
+    """
+    repo_soup = make_repo_soup(response)
+    return repo_soup \
+        .find('div', {"class": "content"}).find_all("tr", {"class": ""})
+
+
+def make_repo_soup(response):
+    """Makes a BeautifulSoup object from the response
+
+    """
+    return BeautifulSoup(response.text, features="html.parser")
 
 
 def find_origin_url(repo_url):
@@ -123,22 +216,24 @@ def find_origin_url(repo_url):
 
     """
     response = requests.get(repo_url)
-    soup = BeautifulSoup(response.text, features="html.parser")
+    repo_soup = make_repo_soup(response)
 
-    origin_urls = find_all_origin_url(soup)
+    origin_urls = find_all_origin_url(repo_soup)
 
     return priority_origin_url(origin_urls)
 
 
 def find_all_origin_url(soup):
-    """
+    """Finds all possible origin urls for a repo.
+
     Finds all the origin url for a particular repo by parsing over the html
     of repo page.
 
     Args:
-        soup: a beautifulsoup object of the html code of the repo.
+        soup: a beautifulsoup object of the repo representation.
 
     Returns:
-        dictionary: All possible origin urls with their protocol as key.
+        dictionary: All possible origin urls for a repository (dict with
+            key 'protocol', value the associated url).
 
     Examples:
         If soup is beautifulsoup object of the html code at
@@ -169,10 +264,11 @@ def priority_origin_url(origin_url):
     Priority order is https>http>git>ssh.
 
     Args:
-        origin_urls: A dictionary of origin links with their protocol as key.
+        origin_url (Dict): All possible origin urls for a repository
+            (key 'protocol', value the associated url)
 
     Returns:
-        string: URL with the highest priority.
+        URL (str) with the highest priority.
 
     """
     for protocol in ['https', 'http', 'git', 'ssh']:
diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py
index 8ecf40f..4e16798 100644
--- a/swh/lister/cgit/models.py
+++ b/swh/lister/cgit/models.py
@@ -15,3 +15,4 @@ class CGitModel(ModelBase):
 
     uid = Column(String, primary_key=True)
     time_updated = Column(String)
+    instance = Column(String, index=True)
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
index 600758a..e3c3610 100644
--- a/swh/lister/cgit/tests/test_lister.py
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -4,8 +4,10 @@
 
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 
 from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
+from swh.lister.cgit.lister import find_netloc
 
 
 def test_find_all_origin_url():
@@ -38,3 +40,11 @@ def test_priority_origin_url():
     assert (priority_origin_url(second_input) ==
             'git://git.savannah.gnu.org/perl-pesel.git')
     assert priority_origin_url(third_input) is None
+
+
+def test_find_netloc():
+    first_url = urlparse('http://git.savannah.gnu.org/cgit/')
+    second_url = urlparse('https://cgit.kde.org/')
+
+    assert find_netloc(first_url) == 'http://git.savannah.gnu.org'
+    assert find_netloc(second_url) == 'https://cgit.kde.org'
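
For readers who want to try the crawl outside the lister framework, here is a
minimal standalone sketch of the pagination walk this patch implements. It
assumes the usual cgit markup the patch targets (a div with class "content"
wrapping the repo table, pagination links in <li> tags); the helper name
list_all_repo_rows is illustrative and not part of the patch:

    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup


    def list_all_repo_rows(base_url):
        """Collect the repository rows (<tr> tags) from every index page
        of a cgit instance, mirroring get_page()/get_all_pages() above."""
        url = urlparse(base_url)
        netloc = '%s://%s' % (url.scheme, url.netloc)  # as find_netloc() does

        soup = BeautifulSoup(requests.get(base_url).text,
                             features="html.parser")
        content = soup.find('div', {"class": "content"})

        # cgit renders pagination as <li><a href="..."> entries; a
        # single-page instance has none, so fall back to the base url.
        page_links = content.find_all('li')
        pages = ([netloc + li.a['href'] for li in page_links]
                 if page_links else [base_url])

        rows = content.find_all("tr", {"class": ""})  # first page's rows
        for page in pages[1:]:  # remaining pages, if any
            page_soup = BeautifulSoup(requests.get(page).text,
                                      features="html.parser")
            rows.extend(page_soup.find('div', {"class": "content"})
                                 .find_all("tr", {"class": ""}))
        return rows

Under these assumptions, list_all_repo_rows('https://git.kernel.org/pub/scm/')
should return the same rows that list_packages() iterates over once the patch
is applied.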