diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -203,6 +203,19 @@
cran_lister()
```
+## lister-cgit
+
+Once configured, you can execute a cgit lister using the following instructions
+in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.cgit.tasks import cgit_lister
+
+logging.basicConfig(level=logging.DEBUG)
+cgit_lister(base_url='http://git.savannah.gnu.org/cgit/')
+```
+
Licensing
---------
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
setuptools
xmltodict
iso8601
+beautifulsoup4
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/lister.py
@@ -0,0 +1,180 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+from bs4 import BeautifulSoup
+from collections import defaultdict
+import requests
+import urllib.parse
+
+from .models import CGitModel
+
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class CGitLister(ListerOnePageApiTransport, SimpleLister):
+    """Lister for the repositories advertised by a cgit instance.
+
+    The index page response is parsed for repository rows; each repository
+    page is then fetched (see `find_origin_url`) to discover the url the
+    repository can be cloned from.
+
+    Args:
+        base_url (str): url of the cgit index page. A trailing '/' is
+            appended when it is missing.
+        instance (str): name of the listed instance. Defaults to the
+            hostname of `base_url`.
+        override_config (dict): forwarded as-is to `SimpleLister`.
+
+    """
+    MODEL = CGitModel
+    LISTER_NAME = 'cgit'
+    PAGE = ''
+
+    def __init__(self, base_url, instance=None, override_config=None):
+        if not base_url.endswith('/'):
+            base_url = base_url + '/'
+        self.PAGE = base_url
+
+        # Keep only 'scheme://host[:port]' in next_url. For example
+        # base_url = https://git.kernel.org/pub/scm/ gives
+        # next_url = https://git.kernel.org (the previous str.split based
+        # code left the path in place, which doubled it in repo urls).
+        parsed_url = urllib.parse.urlparse(base_url)
+        self.next_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)
+
+        if not instance:
+            instance = parsed_url.hostname
+        self.instance = instance
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def list_packages(self, response):
+        """List the actual cgit instance origins from the response.
+
+        Args:
+            response: object whose `text` attribute holds the html of the
+                cgit index page.
+
+        Returns:
+            list: one dict per repository for which an origin url was
+            found, with keys 'name', 'time' (the `title` of the row's
+            first <span>, or None) and 'origin_url'. The list is shuffled
+            before being returned.
+
+        """
+        repos_details = []
+        content = BeautifulSoup(response.text, features="html.parser") \
+            .find('div', {"class": "content"})
+        if content is None:
+            # Not the expected cgit layout: nothing to list.
+            return repos_details
+
+        for repo in content.find_all("tr", {"class": ""}):
+            if repo.a is None:
+                # A row without a link cannot point to a repository page.
+                continue
+
+            origin_url = find_origin_url(self.get_url(repo))
+            if origin_url is None:
+                continue
+
+            # The timestamp is optional: no <span>, or no title on it,
+            # both yield None.
+            span = repo.span
+            updated = span.get('title') if span is not None else None
+
+            repos_details.append({
+                'name': repo.a.text,
+                'time': updated,
+                'origin_url': origin_url,
+            })
+
+        random.shuffle(repos_details)
+        return repos_details
+
+    def get_url(self, repo):
+        """Finds url of a repo page.
+
+        Finds the url of a repo page by parsing over the html of the row of
+        that repo present in the base url.
+
+        Args:
+            repo: a beautifulsoup object of the html code of the repo row
+                  present in base url.
+
+        Returns:
+            string: The url of a repo.
+        """
+        # urljoin resolves absolute-path hrefs ('/cgit/foo.git/') against
+        # the host and relative ones against the index page itself.
+        return urllib.parse.urljoin(self.PAGE, repo.a['href'])
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model.
+
+        Args:
+            repo (dict): one entry of the list built by `list_packages`.
+
+        Returns:
+            dict: the values used to fill a `CGitModel` row; the uid is the
+            index page url followed by the repository name.
+
+        """
+        return {
+            'uid': self.PAGE + repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['origin_url'],
+            'origin_url': repo['origin_url'],
+            'origin_type': 'git',
+            'time_updated': repo['time'],
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation.
+
+        Args:
+            response: iterable of repository dicts, as returned by
+                `list_packages`.
+
+        Returns:
+            list: one model dict per repository.
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+
+def find_origin_url(repo_url):
+    """Return the preferred clone url advertised on a repo page.
+
+    The page at `repo_url` is downloaded and parsed; every clone url found
+    on it is collected and the one with the highest priority is kept.
+
+    Args:
+        repo_url: URL of the repo.
+
+    Returns:
+        string: Origin url for the repo, or None when the page advertises
+        no url with a supported protocol.
+
+    Examples:
+
+        >>> find_origin_url(
+        ...     'http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
+        'https://git.savannah.gnu.org/git/fbvbconv-py.git'
+
+    """
+    page_html = requests.get(repo_url).text
+    candidates = find_all_origin_url(
+        BeautifulSoup(page_html, features="html.parser"))
+    return priority_origin_url(candidates)
+
+
+def find_all_origin_url(soup):
+    """Finds all the origin urls advertised on a repo page.
+
+    Every <tr> located after the row whose text is 'Clone' is read as one
+    clone url and stored under the text found before its first ':'. Rows
+    that do not look like '<protocol>:...' are ignored. When a protocol
+    shows up more than once, the last url wins.
+
+    Args:
+        soup: a beautifulsoup object of the html code of the repo.
+
+    Returns:
+        dict: All possible origin urls with their protocol as key. A plain
+        dict, so looking up a missing protocol raises KeyError instead of
+        silently inserting an empty value.
+
+    Examples:
+        If soup is beautifulsoup object of the html code at
+        http://git.savannah.gnu.org/cgit/fbvbconv-py.git/
+
+        >>> print(find_all_origin_url(soup))
+        { 'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
+          'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git',
+          'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}
+    """
+    origin_urls = {}
+    found_clone_word = False
+
+    for row in soup.find_all('tr'):
+        text = row.text.strip()
+        if found_clone_word:
+            protocol, separator, _ = text.partition(':')
+            # Without a ':' there is no protocol to key the url on.
+            if separator and protocol:
+                origin_urls[protocol] = text
+        elif text == 'Clone':
+            found_clone_word = True
+
+    return origin_urls
+
+
+def priority_origin_url(origin_url):
+    """Pick the most preferred url among the clone urls of a repo.
+
+    Protocols are tried in the order https, http, git, ssh and the url of
+    the first one present is returned.
+
+    Args:
+        origin_url: A dictionary of origin links with their protocol as key.
+
+    Returns:
+        string: URL with the highest priority, or None when none of the
+        four protocols is present.
+
+    """
+    preferred = ('https', 'http', 'git', 'ssh')
+    return next(
+        (origin_url[scheme] for scheme in preferred if scheme in origin_url),
+        None)
diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class CGitModel(ModelBase):
+ """a CGit repository representation
+
+ Row of the 'cgit_repo' table. Only the two columns below are declared
+ here; anything else comes from ModelBase.
+
+ """
+ __tablename__ = 'cgit_repo'
+
+ # Primary key. The cgit lister of this package fills it with the index
+ # page url followed by the repository name.
+ uid = Column(String, primary_key=True)
+ # Kept as a plain string. The cgit lister fills it with the `title` of
+ # the repository row's <span>, or None when there is none.
+ time_updated = Column(String)
diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tasks.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import CGitLister
+
+
+def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
+               instance='savannah-gnu', **kw):
+    """Build a CGitLister, by default for the savannah-gnu cgit instance.
+
+    Any extra keyword argument is handed over to the CGitLister
+    constructor unchanged.
+
+    """
+    lister_kwargs = dict(kw, base_url=base_url, instance=instance)
+    return CGitLister(**lister_kwargs)
+
+
+@app.task(name=__name__ + '.CGitListerTask')
+def cgit_lister(**lister_args):
+    """Celery task: build a lister from `lister_args` and run it once."""
+    new_lister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ """Task that does no work and always returns the string 'OK'."""
+ return 'OK'
diff --git a/swh/lister/cgit/tests/__init__.py b/swh/lister/cgit/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/tests/api_response.html b/swh/lister/cgit/tests/api_response.html
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/api_response.html
@@ -0,0 +1,47 @@
+
+
+
+fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from bs4 import BeautifulSoup
+
+from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
+
+
+def test_find_all_origin_url():
+    """Check the clone urls extracted from the saved repository page."""
+    import os
+
+    # Locate the fixture next to this test module so the test does not
+    # depend on the directory pytest is started from.
+    fixture = os.path.join(os.path.dirname(__file__), 'api_response.html')
+    # The context manager closes the file even when parsing fails.
+    with open(fixture) as f:
+        soup = BeautifulSoup(f.read(), features="html.parser")
+    expected_output = {'https': 'https://git.savannah.gnu.org/git/'
+                                'fbvbconv-py.git',
+                       'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
+                              'fbvbconv-py.git',
+                       'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}
+
+    output = find_all_origin_url(soup)
+
+    for protocol, url in expected_output.items():
+        assert url == output[protocol]
+
+
+def test_priority_origin_url():
+    """Check which url is preferred for a few sets of clone urls."""
+    kernel_https = ('https://kernel.googlesource.com/pub/scm/docs/'
+                    'man-pages/man-pages.git')
+    kernel_git = ('git://git.kernel.org/pub/scm/docs/man-pages/'
+                  'man-pages.git')
+    savannah_git = 'git://git.savannah.gnu.org/perl-pesel.git'
+    savannah_ssh = 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git'
+
+    # https is preferred over git
+    assert priority_origin_url(
+        {'https': kernel_https, 'git': kernel_git}) == kernel_https
+    # git is preferred over ssh
+    assert priority_origin_url(
+        {'git': savannah_git, 'ssh': savannah_ssh}) == savannah_git
+    # no url at all
+    assert priority_origin_url({}) is None
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -0,0 +1,29 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ """Send the cgit ping task by name and check that it answers 'OK'."""
+ res = swh_app.send_task(
+ 'swh.lister.cgit.tasks.ping')
+ assert res
+ # Block until the task has finished before inspecting its outcome.
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.cgit.tasks.CGitLister')
+def test_lister(lister, swh_app, celery_session_worker):
+ """Run the CGitListerTask with no argument against a mocked CGitLister.
+
+ Checks that the lister is built with the default base url and instance
+ name, and that it is run exactly once.
+ """
+ # setup the mocked CGitLister
+ # Instantiating the mocked class returns the mock itself, so calls made
+ # on the instance can be asserted on `lister` directly.
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.cgit.tasks.CGitListerTask')
+ assert res
+ # Block until the task has finished before inspecting the mock.
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with(
+ base_url='https://git.savannah.gnu.org/cgit/',
+ instance='savannah-gnu')
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator', 'gnu', 'cran']
+ 'npm', 'phabricator', 'gnu', 'cran', 'cgit']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -125,6 +125,13 @@
from .cran.lister import CRANLister
_lister = CRANLister(override_config=override_conf)
+ elif lister == 'cgit':
+ from .cgit.models import ModelBase
+ from .cgit.lister import CGitLister
+ _lister = CGitLister(
+ base_url='http://git.savannah.gnu.org/cgit/',
+ override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -6,6 +6,7 @@
def celery_includes():
return [
'swh.lister.bitbucket.tasks',
+ 'swh.lister.cgit.tasks',
'swh.lister.cran.tasks',
'swh.lister.debian.tasks',
'swh.lister.github.tasks',