diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,8 @@
 logger = logging.getLogger(__name__)
 
 SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
-                     'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist']
+                     'npm', 'phabricator', 'gnu', 'cran', 'cgit', 'packagist',
+                     'json']
 
 
 # Base urls for most listers
@@ -124,6 +125,11 @@
         from .gnu.lister import GNULister
         _lister = GNULister(override_config=override_conf)
 
+    elif lister_name == 'json':
+        from .json.models import ModelBase
+        from .json.lister import JSONLister
+        _lister = JSONLister(url=api_baseurl, override_config=override_conf)
+
     elif lister_name == 'cran':
         from .cran.models import ModelBase
         from .cran.lister import CRANLister
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,6 +12,7 @@
         'swh.lister.github.tasks',
         'swh.lister.gitlab.tasks',
         'swh.lister.gnu.tasks',
+        'swh.lister.json.tasks',
         'swh.lister.npm.tasks',
         'swh.lister.packagist.tasks',
         'swh.lister.phabricator.tasks',
diff --git a/swh/lister/json/__init__.py b/swh/lister/json/__init__.py
new file mode 100644
diff --git a/swh/lister/json/lister.py b/swh/lister/json/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/lister.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import requests
+
+from .models import JSONModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+class JSONLister(SimpleLister):
+    """List the tarballs described by a JSON file fetched from `url`"""
+    MODEL = JSONModel
+    LISTER_NAME = 'json'
+    instance = 'json'
+
+    def __init__(self, url, override_config=None):
+        self.url = url
+        super().__init__(override_config=override_config)
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """
+        Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+        """
+        return utils.create_task_dict(
+            'load-tar', kwargs.get('policy', 'oneshot'),
+            kwargs.get('name'),
+            origin_url, tarballs=[{'archive': origin_url, 'date': 0}])
+
+    def safely_issue_request(self, identifier):
+        '''
+        Make network request to download the JSON file.
+
+        Args:
+            identifier: resource identifier (unused)
+        Returns:
+            The decoded JSON document
+        Raises:
+            requests.HTTPError: the server answered with an error status
+        '''
+        response = requests.get(self.url,
+                                allow_redirects=True)
+        # Fail on 4xx/5xx rather than trying to parse an error page as JSON
+        response.raise_for_status()
+        # TODO: support gzip content as well
+        return json.loads(response.content.decode('utf-8'))
+
+    def list_packages(self, response):
+        """List packages from the response
+
+        Only the entries whose source type is "url" are kept.
+        """
+        return [r for r in response if r["source"]["type"] == "url"]
+
+    def get_model_from_repo(self, source):
+        """Transform from source representation to model
+        """
+        src = source['source']
+        # The integrity attribute is a hash of the content in the SRI format
+        # See https://www.w3.org/TR/SRI
+        # When it is missing, the url is used as identifier instead
+        uid = src.get('integrity', src['url'])
+
+        return {
+            'uid': uid,
+            'name': source['name'],
+            'full_name': source['name'],
+            'html_url': src['url'],
+            'origin_url': src['url'],
+            'origin_type': 'tar',
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
diff --git a/swh/lister/json/models.py b/swh/lister/json/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String, Integer
+
+from ..core.models import ModelBase
+
+
+class JSONModel(ModelBase):
+    """a JSON packages list representation
+
+    """
+    __tablename__ = 'json'
+
+    uid = Column(String, primary_key=True)
diff --git a/swh/lister/json/tasks.py b/swh/lister/json/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import JSONLister
+
+
+@app.task(name=__name__ + '.JSONListerTask')
+def json_lister(**lister_args):
+    """Instantiate a JSONLister with `lister_args` and run it"""
+    JSONLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    """Liveness check; always returns 'OK'"""
+    return 'OK'
diff --git a/swh/lister/json/tests/__init__.py b/swh/lister/json/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/json/tests/conftest.py b/swh/lister/json/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/json/tests/test_lister.py b/swh/lister/json/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/tests/test_lister.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+from swh.lister.json.lister import JSONLister
+
+lister = JSONLister(url='http://fake')
+
+packages = [{'name': 'hello-2.10.tar.gz',
+             'source': {
+                 'type': 'url',
+                 'integrity':
+                 'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=',
+                 'url': 'https://ftpmirror.gnu.org//hello/hello-2.10.tar.gz'
+             }}]
+
+expected_model = {
+    'uid': 'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=',
+    'name': 'hello-2.10.tar.gz',
+    'full_name': 'hello-2.10.tar.gz',
+    'html_url': 'https://ftpmirror.gnu.org//hello/hello-2.10.tar.gz',
+    'origin_url': 'https://ftpmirror.gnu.org//hello/hello-2.10.tar.gz',
+    'origin_type': 'tar',
+    }
+
+
+class JSONListerTester(unittest.TestCase):
+    def test_transport_response_simplified(self):
+        """Test model created by the lister
+
+        """
+        model = lister.transport_response_simplified(packages)
+        # Compare whole dicts so that a key missing from the model fails too
+        self.assertEqual(model, [expected_model])
diff --git a/swh/lister/json/tests/test_tasks.py b/swh/lister/json/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/json/tests/test_tasks.py
@@ -0,0 +1,29 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.json.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.json.tasks.JSONLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    # setup the mocked JSONLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.json.tasks.JSONListerTask',
+        kwargs={'url': 'http://fake'})
+    assert res
+    res.wait()
+    assert res.successful()
+
+    # JSONLister requires `url`; make sure the task forwards it
+    lister.assert_called_once_with(url='http://fake')
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()