Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7437755
D1610.id5454.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
19 KB
Subscribers
None
D1610.id5454.diff
View Options
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@
- `swh.lister.npm`
- `swh.lister.phabricator`
- `swh.lister.cran`
+- `swh.lister.cgit`
Dependencies
------------
@@ -203,6 +204,19 @@
cran_lister()
```
+## lister-cgit
+
+Once configured, you can execute a cgit lister using the following instructions
+in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.cgit.tasks import cgit_lister
+
+logging.basicConfig(level=logging.DEBUG)
+cgit_lister(base_url='http://git.savannah.gnu.org/cgit/')
+```
+
Licensing
---------
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
setuptools
xmltodict
iso8601
+beautifulsoup4
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/lister.py
@@ -0,0 +1,276 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+from bs4 import BeautifulSoup
+from collections import defaultdict
+import requests
+from urllib.parse import urlparse
+
+from .models import CGitModel
+
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class CGitLister(ListerOnePageApiTransport, SimpleLister):
+ MODEL = CGitModel
+ LISTER_NAME = 'cgit'
+ PAGE = None
+
+ def __init__(self, base_url, instance=None, override_config=None):
+
+ self.PAGE = base_url
+ url = urlparse(self.PAGE)
+ self.url_netloc = find_netloc(url)
+
+ if not instance:
+ instance = url.hostname
+ self.instance = instance
+ ListerOnePageApiTransport .__init__(self)
+ SimpleLister.__init__(self, override_config=override_config)
+
+ def list_packages(self, response):
+ """List the actual cgit instance origins from the response.
+
+ Find the repos in all the pages by parsing over the HTML of
+ the `base_url`. Find the details for all the repos and return
+ them as a list of dictionaries.
+
+ """
+ repos_details = []
+ repos = get_repo_list(response)
+ soup = make_repo_soup(response)
+ pages = self.get_page(soup)
+ if len(pages) > 1:
+ repos.extend(self.get_all_pages(pages))
+
+ for repo in repos:
+ repo_name = repo.a.text
+ repo_url = self.get_url(repo)
+ origin_url = find_origin_url(repo_url)
+
+ try:
+ time = repo.span['title']
+ except Exception:
+ time = None
+
+ if origin_url is not None:
+ repos_details.append({
+ 'name': repo_name,
+ 'time': time,
+ 'origin_url': origin_url,
+ })
+
+ random.shuffle(repos_details)
+ return repos_details
+
+ def get_page(self, soup):
+ """Find URL of all pages
+
+ Finds URL of all the pages that are present by parsing over the HTML of
+ pagination present at the end of the page.
+
+ Args:
+ soup: a beautifulsoup object of the html code present at base URL
+
+ Returns:
+ list: URL of all the pages present for a cgit instance
+
+ """
+ pages = soup.find('div', {"class": "content"}).find_all('li')
+
+ if not pages:
+ return [self.PAGE]
+
+ return [self.get_url(page) for page in pages]
+
+ def get_all_pages(self, pages):
+ """Find repos from all the pages
+
+ Makes a request for every page (except the first) present for a
+ particular cgit instance and finds the repos that are available
+ on each of those pages.
+
+ Args:
+ pages: list of urls of all the pages present for a particular cgit
+ instance
+
+ Returns:
+ list: beautifulsoup objects of the html code of all the repo
+ rows present in all the pages (except the first).
+
+ """
+ all_repos = []
+ for page in pages[1:]:
+ response = requests.get(page)
+ repos = get_repo_list(response)
+ all_repos.extend(repos)
+
+ return all_repos
+
+ def get_url(self, repo):
+ """Finds url of a repo page.
+
+ Finds the url of a repo page by parsing over the html of the row of
+ that repo present in the base url.
+
+ Args:
+ repo: a beautifulsoup object of the html code of the repo row
+ present in base url.
+
+ Returns:
+ string: The url of a repo.
+
+ """
+ suffix = repo.a['href']
+ return self.url_netloc + suffix
+
+ def get_model_from_repo(self, repo):
+ """Transform from repository representation to model.
+
+ """
+ return {
+ 'uid': self.PAGE + repo['name'],
+ 'name': repo['name'],
+ 'full_name': repo['name'],
+ 'html_url': repo['origin_url'],
+ 'origin_url': repo['origin_url'],
+ 'origin_type': 'git',
+ 'time_updated': repo['time'],
+ 'instance': self.instance,
+ }
+
+ def transport_response_simplified(self, repos_details):
+ """Transform response to list for model manipulation.
+
+ """
+ return [self.get_model_from_repo(repo) for repo in repos_details]
+
+
+def find_netloc(url):
+ """Finds the network location from the base_url
+
+ All the url in the repo are relative to the network location part of base
+ url, so we need to compute it to reconstruct all the urls.
+
+ Args:
+ url: result of urllib.parse.urlparse on the base_url
+
+ Returns:
+ string: Scheme and Network location part in the base URL.
+
+ Example:
+ For base_url = https://git.kernel.org/pub/scm/
+ >>> find_netloc(url)
+ 'https://git.kernel.org'
+
+ """
+ return '%s://%s' % (url.scheme, url.netloc)
+
+
+def get_repo_list(response):
+ """Find all the repo rows for a particular page of the base url
+
+ Finds all the repos on a page and returns a list of all the repos. Each
+ element of the list is a beautifulsoup object representing a repo.
+
+ Args:
+ response: server response
+
+ Returns:
+ list of all the repos on a page.
+
+ """
+ repo_soup = make_repo_soup(response)
+ return repo_soup \
+ .find('div', {"class": "content"}).find_all("tr", {"class": ""})
+
+
+def make_repo_soup(response):
+ """Makes BeautifulSoup object of the response
+
+ """
+ return BeautifulSoup(response.text, features="html.parser")
+
+
+def find_origin_url(repo_url):
+ """Finds origin url for a repo.
+
+ Finds the origin url for a particular repo by parsing over the page of
+ that repo.
+
+ Args:
+ repo_url: URL of the repo.
+
+ Returns:
+ string: Origin url for the repo.
+
+ Examples:
+
+ >>> find_origin_url(
+ 'http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
+ 'https://git.savannah.gnu.org/git/fbvbconv-py.git'
+
+ """
+
+ response = requests.get(repo_url)
+ repo_soup = make_repo_soup(response)
+
+ origin_urls = find_all_origin_url(repo_soup)
+ return priority_origin_url(origin_urls)
+
+
+def find_all_origin_url(soup):
+ """Finds all possible origin url for a repo.
+
+ Finds all the origin url for a particular repo by parsing over the html of
+ repo page.
+
+ Args:
+ soup: a beautifulsoup object of the html code of a repo page.
+
+ Returns:
+ dictionary: All possible origin urls for a repository (dict with
+ key 'protocol', value the associated url).
+
+ Examples:
+ If soup is beautifulsoup object of the html code at
+ http://git.savannah.gnu.org/cgit/fbvbconv-py.git/
+
+ >>> print(find_all_origin_url(soup))
+ { 'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
+ 'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git',
+ 'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}
+ """
+ origin_urls = defaultdict(dict)
+ found_clone_word = False
+
+ for i in soup.find_all('tr'):
+ if found_clone_word:
+ link = i.text
+ protocol = link[:link.find(':')]
+ origin_urls[protocol] = link
+ if i.text == 'Clone':
+ found_clone_word = True
+
+ return origin_urls
+
+
+def priority_origin_url(origin_url):
+ """Finds the highest priority link for a particular repo.
+
+ Priority order is https>http>git>ssh.
+
+ Args:
+ origin_url (Dict): All possible origin urls for a repository
+ (key 'protocol', value the associated url)
+
+ Returns:
+ Url (str) with the highest priority.
+
+ """
+ for protocol in ['https', 'http', 'git', 'ssh']:
+ if protocol in origin_url:
+ return origin_url[protocol]
diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/models.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class CGitModel(ModelBase):
+ """a CGit repository representation
+
+ """
+ __tablename__ = 'cgit_repo'
+
+ uid = Column(String, primary_key=True)
+ time_updated = Column(String)
+ instance = Column(String, index=True)
diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tasks.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import CGitLister
+
+
+def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
+ instance='savannah-gnu', **kw):
+ return CGitLister(base_url=base_url, instance=instance, **kw)
+
+
+@app.task(name=__name__ + '.CGitListerTask')
+def cgit_lister(**lister_args):
+ lister = new_lister(**lister_args)
+ lister.run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ return 'OK'
diff --git a/swh/lister/cgit/tests/__init__.py b/swh/lister/cgit/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/tests/api_response.html b/swh/lister/cgit/tests/api_response.html
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/api_response.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html lang='en'>
+<head>
+<title>fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.</title>
+<meta name='generator' content='cgit v1.0-41-gc330'/>
+<meta name='robots' content='index, nofollow'/>
+<link rel='stylesheet' type='text/css' href='/cgit/cgit.css'/>
+<link rel='shortcut icon' href='/gitweb/git-favicon.png'/>
+<link rel='alternate' title='Atom feed' href='http://git.savannah.gnu.org/cgit/fbvbconv-py.git/atom/?h=master' type='application/atom+xml'/>
+<link rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
+<link rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
+<link rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
+</head>
+<body>
+<div id='cgit'><table id='header'>
+<tr>
+<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit/cgit.png' alt='cgit logo'/></a></td>
+<td class='main'><a href='/cgit/'>index</a> : <a title='fbvbconv-py.git' href='/cgit/fbvbconv-py.git/'>fbvbconv-py.git</a></td><td class='form'><form method='get'>
+<select name='h' onchange='this.form.submit();'>
+<option value='master' selected='selected'>master</option>
+</select> <input type='submit' value='switch'/></form></td></tr>
+<tr><td class='sub'>Unnamed repository; edit this file 'description' to name the repository.</td><td class='sub right'></td></tr></table>
+<table class='tabs'><tr><td>
+<a class='active' href='/cgit/fbvbconv-py.git/'>summary</a><a href='/cgit/fbvbconv-py.git/refs/'>refs</a><a href='/cgit/fbvbconv-py.git/log/'>log</a><a href='/cgit/fbvbconv-py.git/tree/'>tree</a><a href='/cgit/fbvbconv-py.git/commit/'>commit</a><a href='/cgit/fbvbconv-py.git/diff/'>diff</a></td><td class='form'><form class='right' method='get' action='/cgit/fbvbconv-py.git/log/'>
+<select name='qt'>
+<option value='grep'>log msg</option>
+<option value='author'>author</option>
+<option value='committer'>committer</option>
+<option value='range'>range</option>
+</select>
+<input class='txt' type='text' size='10' name='q' value=''/>
+<input type='submit' value='search'/>
+</form>
+</td></tr></table>
+<div class='content'><table summary='repository info' class='list nowrap'><tr class='nohover'><th class='left'>Branch</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left' colspan='2'>Age</th></tr>
+<tr><td><a href='/cgit/fbvbconv-py.git/log/'>master</a></td><td><a href='/cgit/fbvbconv-py.git/commit/'>initial import</a></td><td>Johannes Stezenbach</td><td colspan='2'><span class='age-years' title='2017-06-02 09:57:38 +0200'>2 years</span></td></tr>
+<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left'>Age</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left'>Files</th><th class='left'>Lines</th></tr>
+<tr><td><span title='2017-06-02 09:57:38 +0200'>2017-06-02</span></td><td><a href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>initial import</a><span class='decoration'><a class='deco' href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>HEAD</a><a class='branch-deco' href='/cgit/fbvbconv-py.git/log/'>master</a></span></td><td>Johannes Stezenbach</td><td>3</td><td><span class='deletions'>-0</span>/<span class='insertions'>+889</span></td></tr>
+<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left' colspan='5'>Clone</th></tr>
+<tr><td colspan='5'><a rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>git://git.savannah.gnu.org/fbvbconv-py.git</a></td></tr>
+<tr><td colspan='5'><a rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>https://git.savannah.gnu.org/git/fbvbconv-py.git</a></td></tr>
+<tr><td colspan='5'><a rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git</a></td></tr>
+</table></div> <!-- class=content -->
+<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.0-41-gc330</a> at 2019-06-19 10:51:46 +0000</div>
+</div> <!-- id=cgit -->
+</body>
+</html>
diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+
+from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
+from swh.lister.cgit.lister import find_netloc
+
+
+def test_find_all_origin_url():
+ f = open('swh/lister/cgit/tests/api_response.html')
+ soup = BeautifulSoup(f.read(), features="html.parser")
+ expected_output = {'https': 'https://git.savannah.gnu.org/git/'
+ 'fbvbconv-py.git',
+ 'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
+ 'fbvbconv-py.git',
+ 'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}
+
+ output = find_all_origin_url(soup)
+
+ for protocol, url in expected_output.items():
+ assert url == output[protocol]
+
+
+def test_priority_origin_url():
+ first_input = {'https': 'https://kernel.googlesource.com/pub/scm/docs/'
+ 'man-pages/man-pages.git',
+ 'git': 'git://git.kernel.org/pub/scm/docs/man-pages/'
+ 'man-pages.git'}
+ second_input = {'git': 'git://git.savannah.gnu.org/perl-pesel.git',
+ 'ssh': 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git'}
+ third_input = {}
+
+ assert (priority_origin_url(first_input) ==
+ 'https://kernel.googlesource.com/pub/scm/docs/man-pages/'
+ 'man-pages.git')
+ assert (priority_origin_url(second_input) ==
+ 'git://git.savannah.gnu.org/perl-pesel.git')
+ assert priority_origin_url(third_input) is None
+
+
+def test_find_netloc():
+ first_url = urlparse('http://git.savannah.gnu.org/cgit/')
+ second_url = urlparse('https://cgit.kde.org/')
+
+ assert find_netloc(first_url) == 'http://git.savannah.gnu.org'
+ assert find_netloc(second_url) == 'https://cgit.kde.org'
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -0,0 +1,29 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ res = swh_app.send_task(
+ 'swh.lister.cgit.tasks.ping')
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.cgit.tasks.CGitLister')
+def test_lister(lister, swh_app, celery_session_worker):
+ # setup the mocked CGitLister
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.cgit.tasks.CGitListerTask')
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with(
+ base_url='https://git.savannah.gnu.org/cgit/',
+ instance='savannah-gnu')
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator', 'gnu', 'cran']
+ 'npm', 'phabricator', 'gnu', 'cran', 'cgit']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -125,6 +125,13 @@
from .cran.lister import CRANLister
_lister = CRANLister(override_config=override_conf)
+ elif lister == 'cgit':
+ from .cgit.models import ModelBase
+ from .cgit.lister import CGitLister
+ _lister = CGitLister(
+ base_url='http://git.savannah.gnu.org/cgit/',
+ override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -6,6 +6,7 @@
def celery_includes():
return [
'swh.lister.bitbucket.tasks',
+ 'swh.lister.cgit.tasks',
'swh.lister.cran.tasks',
'swh.lister.debian.tasks',
'swh.lister.github.tasks',
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 14, 4:48 AM (13 h, 43 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234075
Attached To
D1610: swh.lister.cgit
Event Timeline
Log In to Comment