diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@
 - `swh.lister.npm`
 - `swh.lister.phabricator`
 - `swh.lister.cran`
+- `swh.lister.cgit`
 
 Dependencies
 ------------
@@ -203,6 +204,28 @@
 cran_lister()
 ```
 
+## lister-cgit
+
+Once configured, you can execute a cgit lister using the following instructions
+in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.cgit.tasks import cgit_lister
+
+logging.basicConfig(level=logging.DEBUG)
+cgit_lister(url='https://cgit.kde.org/', instance='kde',
+            url_prefix='https://anongit.kde.org/')
+```
+
+```lang=python
+import logging
+from swh.lister.cgit.tasks import cgit_lister
+
+logging.basicConfig(level=logging.DEBUG)
+cgit_lister(url='https://git.kernel.org/')
+```
+
 Licensing
 ---------
 
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
 setuptools
 xmltodict
 iso8601
+beautifulsoup4
diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/lister.py
@@ -0,0 +1,248 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import logging
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urlparse
+
+from .models import CGitModel
+
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+logger = logging.getLogger(__name__)
+
+
+class CGitLister(ListerOnePageApiTransport, SimpleLister):
+    MODEL = CGitModel
+    LISTER_NAME = 'cgit'
+    PAGE = None
+    url_prefix_present = True
+
+    def __init__(self, url, instance=None, url_prefix=None,
+                 override_config=None):
+        """Inits Class with PAGE url and origin url prefix.
+
+        Args:
+            url (str): URL of the CGit instance.
+            instance (str): Name of cgit instance. Defaults to url hostname.
+            url_prefix (str): Prefix of the origin_url. Origin link of the
+                              repos of some special instances do not match
+                              the url of the repository page, they have origin
+                              url in the format <url_prefix>/<repo_name>.
+            override_config: Passed through unchanged to SimpleLister.
+
+        """
+
+        self.PAGE = url
+        if url_prefix is None:
+            self.url_prefix = url
+            self.url_prefix_present = False
+        else:
+            self.url_prefix = url_prefix
+
+        if not self.url_prefix.endswith('/'):
+            self.url_prefix += '/'
+        url = urlparse(self.PAGE)
+        self.url_netloc = find_netloc(url)
+
+        if not instance:
+            instance = url.hostname
+        self.instance = instance
+
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def list_packages(self, response):
+        """List the actual cgit instance origins from the response.
+
+        Find repositories metadata by parsing the html page (response's raw
+        content). If there are links in the html page, retrieve those
+        repositories metadata from those pages as well. Return the
+        repositories as list of dictionaries.
+
+        Args:
+            response (Response): http api request response.
+
+        Returns:
+            List of repository origin urls (as dict) included in the response.
+
+        """
+        repos_details = []
+        repos = get_repo_list(response.text)
+        url_soup = make_soup(response.text)
+        pages = self.get_pages(url_soup)
+        if len(pages) > 1:
+            repos.extend(list(self.get_repos_from_pages(pages[1:])))
+
+        for repo in repos:
+            repo_name = repo.a.text
+            origin_url = self.find_origin_url(repo, repo_name)
+
+            try:
+                time = repo.span['title']
+            except (TypeError, KeyError):
+                # row without a <span title="..."> age cell
+                time = None
+
+            if origin_url is not None:
+                repos_details.append({
+                    'name': repo_name,
+                    'time': time,
+                    'origin_url': origin_url,
+                })
+
+        random.shuffle(repos_details)
+        return repos_details
+
+    def find_origin_url(self, repo, repo_name):
+        """Finds the origin url for a repository
+
+        Args:
+            repo (Beautifulsoup): Beautifulsoup object of the repository
+                                  row present in base url.
+            repo_name (str): Repository name.
+
+        Returns:
+            string: origin url.
+
+        """
+
+        if self.url_prefix_present:
+            return self.url_prefix + repo_name
+
+        return self.get_url(repo)
+
+    def get_pages(self, url_soup):
+        """Find URL of all pages.
+
+        Finds URL of pages that are present by parsing over the HTML of
+        pagination present at the end of the page.
+
+        Args:
+            url_soup (Beautifulsoup): a beautifulsoup object of base URL
+
+        Returns:
+            list: URL of pages of the cgit instance ([PAGE] if no pagination)
+
+        """
+        content = url_soup.find('div', {"class": "content"})
+        pages = content.find_all('li') if content is not None else []
+
+        if not pages:
+            return [self.PAGE]
+
+        return [self.get_url(page) for page in pages]
+
+    def get_repos_from_pages(self, pages):
+        """Find repos from all pages.
+
+        Request the available repos from the pages. This yields
+        the available repositories found as beautiful object representation.
+
+        Args:
+            pages ([str]): list of urls of all pages present for a
+                           particular cgit instance.
+
+        Yields:
+            List of beautifulsoup object of repository (url) rows
+            present in pages(except first).
+
+        """
+
+        for page in pages:
+            response = requests.get(page)
+            if not response.ok:  # deal with error as warning without impeding
+                # the listing to finish
+                logger.warning('Failed to retrieve repositories from page %s',
+                               page)
+                continue
+
+            yield from get_repo_list(response.text)
+
+    def get_url(self, repo):
+        """Finds url of a repo page.
+
+        Finds the url of a repo page by parsing over the html of the row of
+        that repo present in the base url.
+
+        Args:
+            repo (Beautifulsoup): a beautifulsoup object of the repository
+                                  row present in base url.
+
+        Returns:
+            string: The url of a repo.
+
+        """
+        suffix = repo.a['href']
+        return self.url_netloc + suffix
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model.
+
+        """
+        return {
+            'uid': self.PAGE + repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['origin_url'],
+            'origin_url': repo['origin_url'],
+            'origin_type': 'git',
+            'time_updated': repo['time'],
+            'instance': self.instance,
+        }
+
+    def transport_response_simplified(self, repos_details):
+        """Transform response to list for model manipulation.
+
+        """
+        return [self.get_model_from_repo(repo) for repo in repos_details]
+
+
+def find_netloc(url):
+    """Finds the network location from the url.
+
+    URL in the repo are relative to the network location part of base
+    URL, so we need to compute it to reconstruct URLs.
+
+    Args:
+        url (urllib.parse.ParseResult): parsed base url.
+
+    Returns:
+        string: Scheme and Network location part in the base URL.
+
+    Example:
+    For url = https://git.kernel.org/pub/scm/
+        >>> find_netloc(url)
+        'https://git.kernel.org'
+
+    """
+    return '%s://%s' % (url.scheme, url.netloc)
+
+
+def get_repo_list(response):
+    """Find repositories (as beautifulsoup object) available within the server
+    response.
+
+    Args:
+        response (str): html content of the server response
+
+    Returns:
+        List of repositories (beautifulsoup rows); empty if nothing is found.
+
+    """
+    content = make_soup(response).find('div', {"class": "content"})
+    if content is None:
+        logger.warning('No "content" div found in cgit page')
+        return []
+    return content.find_all("tr", {"class": ""})
+
+
+def make_soup(response):
+    """Instantiates a beautiful soup object from the response html text.
+
+    """
+    return BeautifulSoup(response, features="html.parser")
diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/models.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class CGitModel(ModelBase):
+    """a CGit repository representation
+
+    """
+    __tablename__ = 'cgit_repo'
+
+    uid = Column(String, primary_key=True)
+    time_updated = Column(String)
+    instance = Column(String, index=True)
diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import CGitLister
+
+
+def new_lister(url='https://git.kernel.org/',
+               url_prefix=None,
+               instance='kernel', **kw):
+    """Build a CGitLister; the defaults target git.kernel.org.
+
+    Pass ``instance`` explicitly when listing another cgit server,
+    otherwise its origins are recorded under the 'kernel' instance.
+
+    """
+    return CGitLister(url=url, instance=instance, url_prefix=url_prefix,
+                      **kw)
+
+
+@app.task(name=__name__ + '.CGitListerTask')
+def cgit_lister(**lister_args):
+    lister = new_lister(**lister_args)
+    lister.run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    return 'OK'
diff --git a/swh/lister/cgit/tests/__init__.py b/swh/lister/cgit/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/cgit/tests/repo_list.txt b/swh/lister/cgit/tests/repo_list.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/repo_list.txt
@@ -0,0 +1,15 @@
+openembedded-coreOpenEmbedded Core layerOpenEmbedded5 hourssummarylogtree
+openembedded-core-contribOpenEmbedded Core user contribution treesOpenEmbedded5 hourssummarylogtree
+meta-openembeddedCollection of OpenEmbedded layersOpenEmbedded21 hourssummarylogtree
+meta-openembedded-contribOpenEmbedded layers collection contribution treesOpenEmbedded21 hourssummarylogtree
+bitbakeBitbake Development treeOpenEmbedded7 dayssummarylogtree
+bitbake-contribBitbake user contribution treesOpenEmbedded8 dayssummarylogtree
+meta-handheldHandheld device meta layerOpenEmbedded9 monthssummarylogtree
+meta-opieOPIE meta layerOpenEmbedded3 yearssummarylogtree
+openembeddedClassic OpenEmbedded Development TreeOpenEmbedded4 yearssummarylogtree
+openembedded-web-frontpagesOpenEmbedded Website Source CodeOpenEmbedded5 yearssummarylogtree
+openembedded-adminOE Admin toolsOpenEmbedded6 yearssummarylogtree
+meta-microMicro distribution meta layerOpenEmbedded7 yearssummarylogtree
+eclipsetoolsEclipse tools for OpenEmbeddedOpenEmbedded8 yearssummarylogtree
+oetestTest utilities for OpenEmbeddedOpenEmbedded10 yearssummarylogtree
+oebuildstatsOE Build StatsOpenEmbeddedsummarylogtree
diff --git a/swh/lister/cgit/tests/response.html b/swh/lister/cgit/tests/response.html
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/response.html
@@ -0,0 +1,41 @@
+
+
+
+OpenEmbedded Git Repository Browser
+
+
+
+
+
+
+
+
+
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from urllib.parse import urlparse
+
+from swh.lister.cgit.lister import find_netloc, get_repo_list
+
+# fixtures live next to this module, whatever the current directory is
+DATADIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def test_get_repo_list():
+    """Rows parsed from response.html match repo_list.txt line by line."""
+    with open(os.path.join(DATADIR, 'response.html')) as f:
+        repos = get_repo_list(f.read())
+    with open(os.path.join(DATADIR, 'repo_list.txt')) as f:
+        expected_repos = [line.rstrip('\n') for line in f]
+
+    assert [str(repo) for repo in repos] == expected_repos
+
+
+def test_find_netloc():
+    """find_netloc keeps only the scheme and host of a parsed url."""
+    first_url = urlparse('http://git.savannah.gnu.org/cgit/')
+    second_url = urlparse('https://cgit.kde.org/')
+
+    assert find_netloc(first_url) == 'http://git.savannah.gnu.org'
+    assert find_netloc(second_url) == 'https://cgit.kde.org'
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -0,0 +1,53 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.cgit.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.cgit.tasks.CGitLister')
+def test_lister_no_url_prefix(lister, swh_app, celery_session_worker):
+    # setup the mocked CGitLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.cgit.tasks.CGitListerTask',
+        kwargs=dict(url='https://git.kernel.org/', instance='kernel'))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with(
+        url='https://git.kernel.org/',
+        url_prefix=None,
+        instance='kernel')
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()
+
+
+@patch('swh.lister.cgit.tasks.CGitLister')
+def test_lister_with_url_prefix(lister, swh_app, celery_session_worker):
+    # setup the mocked CGitLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.cgit.tasks.CGitListerTask',
+        kwargs=dict(url='https://cgit.kde.org/',
+                    url_prefix='https://anongit.kde.org/', instance='kde'))
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with(
+        url='https://cgit.kde.org/',
+        url_prefix='https://anongit.kde.org/',
+        instance='kde')
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger(__name__)
 
 SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
-                     'npm', 'phabricator', 'gnu', 'cran']
+                     'npm', 'phabricator', 'gnu', 'cran', 'cgit']
 
 
 @click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -125,6 +125,14 @@
         from .cran.lister import CRANLister
         _lister = CRANLister(override_config=override_conf)
 
+    elif lister == 'cgit':
+        from .cgit.models import ModelBase
+        from .cgit.lister import CGitLister
+        _lister = CGitLister(
+            url='http://git.savannah.gnu.org/cgit/',
+            url_prefix='http://git.savannah.gnu.org/git/',
+            override_config=override_conf)
+
     else:
         raise ValueError(
             'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -6,6 +6,7 @@
 def celery_includes():
     return [
         'swh.lister.bitbucket.tasks',
+        'swh.lister.cgit.tasks',
         'swh.lister.cran.tasks',
         'swh.lister.debian.tasks',
         'swh.lister.github.tasks',