diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@
- `swh.lister.npm`
- `swh.lister.phabricator`
- `swh.lister.cran`
+- `swh.lister.cgit`
@@ -203,6 +204,20 @@
+## lister-cgit
+Once configured, you can execute a cgit lister using the following instructions
+in a `python3` script:
+import logging
+from swh.lister.cgit.tasks import cgit_lister
+ url_prefix='http://git.savannah.gnu.org/git/')
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/lister.py
@@ -0,0 +1,209 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import random
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urlparse
+from .models import CGitModel
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+class CGitLister(ListerOnePageApiTransport, SimpleLister):
+    """Lister for the repositories published by one cgit instance.
+
+    The index page of the instance (``PAGE``) is parsed for repository
+    rows; when the index is paginated, the other pages are requested and
+    parsed as well.
+    """
+    MODEL = CGitModel
+    LISTER_NAME = 'cgit'
+    PAGE = None
+
+    def __init__(self, url, instance=None, url_prefix=None,
+                 override_config=None):
+        """Inits Class with PAGE url and origin url prefix.
+
+        Args:
+            url (str): URL of the CGit instance.
+            instance (str): Name of cgit instance. When empty, the
+                hostname of `url` is used.
+            url_prefix (str): Prefix of the origin_url. When None, `url`
+                is used. A trailing '/' is appended if missing.
+            override_config: forwarded as-is to `SimpleLister.__init__`.
+
+        """
+        self.PAGE = url
+
+        if url_prefix is None:
+            url_prefix = url
+        if not url_prefix.endswith('/'):
+            url_prefix += '/'
+        self.url_prefix = url_prefix
+
+        # Parse the instance URL once; both the netloc and the default
+        # instance name are derived from it.
+        parsed_url = urlparse(url)
+        self.url_netloc = find_netloc(parsed_url)
+
+        if not instance:
+            instance = parsed_url.hostname
+        self.instance = instance
+
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def list_packages(self, response):
+        """List the actual cgit instance origins from the response.
+
+        Find the repos in all the pages by parsing over the HTML of
+        the `url`. Find the details for all the repos and return
+        them in the format of list of dictionaries.
+
+        Args:
+            response: response for the index page; only its `text`
+                attribute is read.
+
+        Returns:
+            list of dict with the keys 'name', 'time' and 'origin_url',
+            one per repository row, in shuffled order.
+
+        """
+        repos = get_repo_list(response.text)
+        soup = make_repo_soup(response.text)
+        pages = self.get_page(soup)
+        if len(pages) > 1:
+            repos.extend(self.get_all_pages(pages))
+
+        repos_details = []
+        for repo in repos:
+            repo_name = repo.a.text
+            # The update time is read from the `title` attribute of the
+            # row's span; it stays None when the span or the attribute
+            # is missing.
+            span = repo.span
+            time_updated = span.get('title') if span is not None else None
+            repos_details.append({
+                'name': repo_name,
+                'time': time_updated,
+                'origin_url': self.url_prefix + repo_name,
+            })
+
+        random.shuffle(repos_details)
+        return repos_details
+
+    def get_page(self, url_soup):
+        """Find URL of all pages
+
+        Finds URL of all the pages that are present by parsing over the
+        HTML of pagination present at the end of the page.
+
+        Args:
+            url_soup (Beautifulsoup): a beautifulsoup object of base URL
+
+        Returns:
+            list: URL of all the pages present for a cgit instance. When
+            the content div holds no `li` element, the list only
+            contains `PAGE`.
+
+        """
+        pages = url_soup.find('div', {"class": "content"}).find_all('li')
+        if not pages:
+            return [self.PAGE]
+        return [self.get_url(page) for page in pages]
+
+    def get_all_pages(self, pages):
+        """Find repos from all the pages
+
+        Make the request for all the pages (except the first) present
+        for a particular cgit instance and finds the repos that are
+        available for each and every page.
+
+        Args:
+            pages ([str]): list of urls of all the pages present for a
+                particular cgit instance
+
+        Returns:
+            List of beautifulsoup object of all the repositories (url)
+            row present in all the pages (except first).
+
+        """
+        all_repos = []
+        for page in pages[1:]:
+            # NOTE(review): this request carries no timeout and its
+            # status code is not checked before parsing.
+            response = requests.get(page)
+            all_repos.extend(get_repo_list(response.text))
+        return all_repos
+
+    def get_url(self, repo):
+        """Finds url of a repo page.
+
+        Finds the url of a repo page by parsing over the html of the row
+        of that repo present in the base url.
+
+        Args:
+            repo (Beautifulsoup): a beautifulsoup object of the
+                repository row present in base url.
+
+        Returns:
+            string: The url of a repo.
+
+        """
+        from urllib.parse import urljoin
+
+        # Resolve the link against the instance URL: for an absolute-path
+        # href this equals `url_netloc + href`, and relative or fully
+        # qualified hrefs are resolved correctly as well.
+        return urljoin(self.PAGE, repo.a['href'])
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model.
+
+        Args:
+            repo (dict): one entry of the list built by `list_packages`
+                (keys 'name', 'time' and 'origin_url').
+
+        Returns:
+            dict of the values stored for that repository; the uid is
+            `PAGE` followed by the repository name.
+
+        """
+        return {
+            'uid': self.PAGE + repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['origin_url'],
+            'origin_url': repo['origin_url'],
+            'origin_type': 'git',
+            'time_updated': repo['time'],
+            'instance': self.instance,
+        }
+
+    def transport_response_simplified(self, repos_details):
+        """Transform response to list for model manipulation.
+
+        Args:
+            repos_details ([dict]): repositories as returned by
+                `list_packages`.
+
+        Returns:
+            list of dict, one `get_model_from_repo` result per entry.
+
+        """
+        return [self.get_model_from_repo(repo) for repo in repos_details]
+def find_netloc(url):
+    """Finds the network location from the url
+
+    All the url in the repo are relative to the network location part of
+    base url, so we need to compute it to reconstruct all the urls.
+
+    Args:
+        url: either the url as a string, or the object returned by
+            `urllib.parse.urlparse` for it (anything exposing `scheme`
+            and `netloc` attributes).
+
+    Returns:
+        string: Scheme and Network location part in the base URL.
+
+    Example:
+        >>> find_netloc('https://git.kernel.org/pub/scm/')
+        'https://git.kernel.org'
+
+    """
+    # Accept a plain string too, so the function can be called exactly as
+    # shown in the example above.
+    if isinstance(url, str):
+        url = urlparse(url)
+    return '%s://%s' % (url.scheme, url.netloc)
+def get_repo_list(response):
+    """Find all the repository rows of one page of the base url
+
+    Parses the page and returns the list of its repository rows. Each
+    element of the list is a beautifulsoup object representing a repo.
+
+    Args:
+        response (str): HTML of the page (the callers in this module
+            pass the `text` of the server response)
+
+    Returns:
+        List of all the repos on a page: the `tr` elements selected with
+        the `{"class": ""}` filter inside the `div` of class `content`.
+
+    """
+    page_soup = make_repo_soup(response)
+    content_div = page_soup.find('div', {"class": "content"})
+    return content_div.find_all("tr", {"class": ""})
+def make_repo_soup(response):
+    """Makes BeautifulSoup object of the response
+
+    Args:
+        response (str): HTML markup to parse
+
+    Returns:
+        BeautifulSoup object built with the "html.parser" parser.
+
+    """
+    soup = BeautifulSoup(response, features="html.parser")
+    return soup
diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/models.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from sqlalchemy import Column, String
+from ..core.models import ModelBase
+class CGitModel(ModelBase):
+ """a CGit repository representation
+
+ Only the columns specific to cgit are declared here; the other values
+ the lister produces for a repository (name, origin_url, ...) are
+ presumably columns of ModelBase -- confirm in ..core.models.
+ """
+ __tablename__ = 'cgit_repo'
+ # Primary key. The lister sets it to the instance page URL followed by
+ # the repository name.
+ uid = Column(String, primary_key=True)
+ # Stored as a plain string. The lister fills it with the `title`
+ # attribute of the repository row's span, or None when it is missing.
+ time_updated = Column(String)
+ # Name of the cgit instance the repository was listed from (indexed).
+ instance = Column(String, index=True)
diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tasks.py
@@ -0,0 +1,25 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.scheduler.celery_backend.config import app
+from .lister import CGitLister
+def new_lister(url='https://git.savannah.gnu.org/cgit/',
+               url_prefix='https://git.savannah.gnu.org/git/',
+               instance='savannah-gnu', **kw):
+    """Instantiate a CGitLister.
+
+    The defaults point at the cgit instance of git.savannah.gnu.org.
+
+    Args:
+        url (str): URL of the cgit instance
+        url_prefix (str): prefix of the origin urls
+        instance (str): name of the cgit instance
+        **kw: extra keyword arguments forwarded to CGitLister
+
+    Returns:
+        the CGitLister instance
+
+    """
+    lister = CGitLister(
+        url=url, url_prefix=url_prefix, instance=instance, **kw)
+    return lister
+@app.task(name=__name__ + '.CGitListerTask')
+def cgit_lister(**lister_args):
+    """Task building a cgit lister and running it.
+
+    Args:
+        **lister_args: keyword arguments forwarded to `new_lister`
+
+    """
+    new_lister(**lister_args).run()
+@app.task(name=__name__ + '.ping')
+def ping():
+    """Task that only returns the string 'OK'."""
+    reply = 'OK'
+    return reply
diff --git a/swh/lister/cgit/tests/__init__.py b/swh/lister/cgit/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/cgit/tests/repo_list.txt b/swh/lister/cgit/tests/repo_list.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/repo_list.txt
@@ -0,0 +1,15 @@