diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
 - `swh.lister.pypi`
 - `swh.lister.npm`
 - `swh.lister.phabricator`
+- `swh.lister.gnu`
 
 Dependencies
 ------------
@@ -177,6 +178,18 @@
 incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
 ```
 
+## lister-gnu
+
+Once configured, you can execute a GNU lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.gnu.tasks import gnu_lister
+
+logging.basicConfig(level=logging.DEBUG)
+gnu_lister()
+```
+
 Licensing
 ---------
 
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger(__name__)
 
 SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
-                     'npm', 'phabricator']
+                     'npm', 'phabricator', 'gnu']
 
 
 @click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -115,6 +115,11 @@
                                     api_token='',
                                     override_config=override_conf)
 
+    elif lister == 'gnu':
+        from .gnu.models import ModelBase
+        from .gnu.lister import GNULister
+        _lister = GNULister(override_config=override_conf)
+
     else:
         raise ValueError(
             'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
         'swh.lister.npm.tasks',
         'swh.lister.pypi.tasks',
         'swh.lister.phabricator.tasks',
+        'swh.lister.gnu.tasks'
     ]
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/lister.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import gzip
+import json
+import requests
+
+from .models import GNUModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+# Archive suffixes recognised as release tarballs; kept as a tuple so that it
+# can be handed directly to str.endswith().
+TARBALL_EXTENSIONS = ('.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz2',
+                      '.tar.xz', '.tar.lz', '.tar.lzma', '.tar.Z', '.zip')
+
+
+class GNULister(SimpleLister):
+    MODEL = GNUModel
+    LISTER_NAME = 'gnu'
+    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
+    BASE_URL = 'https://ftp.gnu.org'
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """
+        Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+        """
+        return utils.create_task_dict(
+            'load-%s' % origin_type, 'recurring', kwargs.get('name'),
+            origin_url, list_of_tarball=kwargs.get('list_of_tarball'))
+
+    def get_file(self):
+        '''
+        Download and decompress the tree.json.gz file and return its
+        content parsed from JSON
+
+        Returns
+            File content parsed from JSON
+
+        Raises
+            requests.HTTPError: if the server answers with an error status
+        '''
+        response = requests.get(self.TREE_URL,
+                                allow_redirects=True)
+        # Report a failed download as an HTTP error instead of letting
+        # gzip choke on the body of an error page.
+        response.raise_for_status()
+        uncompressed_content = gzip.decompress(response.content)
+        return json.loads(uncompressed_content.decode('utf-8'))
+
+    def safely_issue_request(self, identifier):
+        '''
+        Make the network request that downloads the file describing
+        the file structure of the GNU website.
+
+        Args:
+            identifier: resource identifier (not used by this lister)
+        Returns:
+            the decoded file structure, as returned by get_file
+        '''
+        response = self.get_file()
+        return response
+
+    def list_packages(self, response):
+        """
+        List the actual gnu origins with their names, tarballs and
+        time last updated from the response.
+
+        Args:
+            response : File structure of the website
+            in JSON format
+
+        Returns:
+            a list, in random order, of all the packages with their names,
+            url of their root directory, date of their most recent tarball
+            and the tarballs present for the particular package.
+            [
+                {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+                 'time_modified': 1071078759,
+                 'list_of_tarball':
+                    [
+                        {'archive':
+                         'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+                         'date': '1071002600'},
+                        {'archive':
+                         'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+                         'date': '1071078759'}
+                    ]
+                 },
+                {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
+                 'time_modified': 1480991830,
+                 'list_of_tarball':
+                    [
+                        {'archive':
+                         'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+                         'date': '1461357336'},
+                        {'archive':
+                         'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+                         'date': '1480991830'}
+                    ]
+                 }
+            ]
+        """
+        response = clean_up_response(response)
+        packages = []
+        for directory in response:
+            content = directory['contents']
+            for repo in content:
+                if repo['type'] == 'directory':
+                    package_url = '%s/%s/%s/' % (self.BASE_URL,
+                                                 directory['name'],
+                                                 repo['name'])
+                    list_of_tarball = find_all_tarball(
+                        repo['contents'], package_url)
+                    if list_of_tarball:
+                        repo_details = {
+                            'name': repo['name'],
+                            'url': package_url,
+                            # Read back by get_model_from_repo: a package
+                            # was last updated when its most recent
+                            # tarball was published.
+                            'time_modified': max(
+                                int(tarball['date'])
+                                for tarball in list_of_tarball),
+                            'list_of_tarball': list_of_tarball
+                        }
+                        packages.append(repo_details)
+        random.shuffle(packages)
+        return packages
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model
+
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['url'],
+            'origin_url': repo['url'],
+            'time_last_updated': repo['time_modified'],
+            'origin_type': 'gnu',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+    def transport_request(self):
+        pass
+
+    def transport_response_to_string(self):
+        pass
+
+    def transport_quota_check(self):
+        pass
+
+
+def find_all_tarball(package_file_structure, url):
+    '''
+    Recursively lists all the tarballs present in the folder and subfolders
+
+    Args
+        package_file_structure : File structure of the package root directory
+        url : URL of the corresponding package
+
+    Returns
+        List of all the tarball urls and the time of their last update
+        example-
+        For a package called 3dldf
+
+        [
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+             'date': '1071002600'}
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+             'date': '1071078759'}
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
+             'date': '1074278633'}
+            ...
+        ]
+    '''
+    list_of_tarball = []
+    for single_file in package_file_structure:
+
+        if single_file['type'] == 'file':
+            if single_file['name'].endswith(TARBALL_EXTENSIONS):
+                list_of_tarball.append({
+                    "archive": url + single_file['name'],
+                    "date": single_file['time']
+                })
+        # It will recursively check for tarballs in all sub-folders
+        elif single_file['type'] == 'directory':
+            list_of_tarballs_in_dir = find_all_tarball(
+                single_file['contents'],
+                url + single_file['name'] + '/')
+            list_of_tarball.extend(list_of_tarballs_in_dir)
+
+    return list_of_tarball
+
+
+def clean_up_response(response):
+    '''
+    Cleans up the JSON response by keeping only those directories which
+    have tarballs
+    '''
+    final_response = []
+    file_system = response[0]['contents']
+    for directory in file_system:
+        if directory['name'] in ('gnu', 'mirrors', 'old-gnu'):
+            final_response.append(directory)
+    return final_response
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String, Integer
+
+from ..core.models import ModelBase
+
+
+class GNUModel(ModelBase):
+    """a GNU repository representation
+
+    """
+    __tablename__ = 'gnu_repo'
+
+    uid = Column(String, primary_key=True)
+    time_last_updated = Column(Integer)
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import GNULister
+
+
+@app.task(name=__name__ + '.GNUListerTask')
+def gnu_lister(**lister_args):
+    GNULister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    return 'OK'
diff --git a/swh/lister/gnu/tests/__init__.py b/swh/lister/gnu/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.gnu.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.gnu.tasks.GNULister')
+def test_lister(lister, swh_app, celery_session_worker):
+    # setup the mocked GNULister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.gnu.tasks.GNUListerTask')
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()