Page MenuHomeSoftware Heritage

D1482.id4989.diff
No OneTemporary

D1482.id4989.diff

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
- `swh.lister.pypi`
- `swh.lister.npm`
- `swh.lister.phabricator`
+- `swh.lister.gnu`
Dependencies
------------
@@ -177,6 +178,18 @@
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
+## lister-gnu
+
+Once configured, you can execute a GNU lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.gnu.tasks import gnu_lister
+
+logging.basicConfig(level=logging.DEBUG)
+gnu_lister()
+```
+
Licensing
---------
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,8 @@
entry_points='''
[console_scripts]
swh-lister=swh.lister.cli:cli
+ [swh.cli.subcommands]
+ lister=swh.lister.cli:lister
''',
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -6,14 +6,23 @@
import logging
import click
+from swh.core.cli import CONTEXT_SETTINGS
+
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator']
+ 'npm', 'phabricator', 'gnu']
+
+# Root command group for the lister tools; setup.py registers it under the
+# ``swh.cli.subcommands`` entry point as ``lister``. The docstring below is
+# the help text click displays for the group.
+@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def lister(ctx):
+ '''Software Heritage Lister tools.'''
+ pass
-@click.command()
+
+@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
@click.option(
'--db-url', '-d', default='postgres:///lister-gitlab.com',
help='SQLAlchemy DB URL; see '
@@ -22,8 +31,9 @@
type=click.Choice(SUPPORTED_LISTERS + ['all']))
@click.option('--drop-tables', '-D', is_flag=True, default=False,
help='Drop tables before creating the database schema')
-def cli(db_url, listers, drop_tables):
- """Initialize db model according to lister.
+@click.pass_context
+def cli(ctx, db_url, listers, drop_tables):
+ """Initialize the database model for given listers.
"""
override_conf = {
@@ -105,6 +115,11 @@
api_token='',
override_config=override_conf)
+ elif lister == 'gnu':
+ from .gnu.models import ModelBase
+ from .gnu.lister import GNULister
+ _lister = GNULister(override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py
--- a/swh/lister/core/lister_base.py
+++ b/swh/lister/core/lister_base.py
@@ -414,7 +414,7 @@
Returns:
the same information in a different form
"""
- _type = 'origin-update-%s' % origin_type
+ _type = 'load-%s' % origin_type
_policy = 'recurring'
return utils.create_task_dict(_type, _policy, origin_url)
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
+ 'swh.lister.gnu.tasks'
]
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/lister.py
@@ -0,0 +1,200 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import gzip
+import json
+import requests
+
+from .models import GNUModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+class GNULister(SimpleLister):
+    """Lister for the GNU project's download area (https://ftp.gnu.org).
+
+    The directory layout of the site is read from the gzipped JSON file at
+    ``TREE_URL``; packages and their tarballs are derived from that single
+    document.
+    """
+    MODEL = GNUModel
+    LISTER_NAME = 'gnu'
+    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
+    BASE_URL = 'https://ftp.gnu.org'
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """Return the scheduler task dict for one origin.
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type: origin type, used to build the ``load-<type>``
+                task type
+            origin_url: url of the package root directory
+            **kwargs: ``name`` and ``list_of_tarball`` are forwarded to the
+                task; a missing key is forwarded as None
+
+        Returns:
+            the dict built by ``utils.create_task_dict``
+        """
+        return utils.create_task_dict(
+            'load-%s' % origin_type, 'recurring', kwargs.get('name'),
+            origin_url, list_of_tarball=kwargs.get('list_of_tarball'))
+
+    def get_file(self):
+        """Download and unzip the tree.json.gz file and return its content.
+
+        Returns:
+            the decoded JSON document
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status
+        """
+        response = requests.get(self.TREE_URL,
+                                allow_redirects=True)
+        # Fail with the HTTP error itself instead of an obscure gzip error
+        # raised while trying to decompress an error page.
+        response.raise_for_status()
+        uncompressed_content = gzip.decompress(response.content)
+        return json.loads(uncompressed_content.decode('utf-8'))
+
+    def safely_issue_request(self, identifier):
+        """Fetch the file describing the file structure of the GNU website.
+
+        Args:
+            identifier: resource identifier; unused here, the same
+                ``TREE_URL`` document is fetched whatever its value
+
+        Returns:
+            the decoded JSON document returned by ``get_file``
+        """
+        return self.get_file()
+
+    def list_packages(self, response):
+        """List the actual gnu origins with their names, the time they were
+        last updated and their tarballs.
+
+        Args:
+            response: file structure of the website in JSON format
+
+        Returns:
+            a list, in random order, of all the packages owning at least
+            one tarball, with their names, the url of their root directory,
+            their modification time and their tarballs, e.g.:
+
+            [
+                {'name': '3dldf',
+                 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+                 'time_modified': <'time' value of the directory entry>,
+                 'list_of_tarball': [
+                     {'archive':
+                      'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+                      'date': '1071002600'},
+                     {'archive':
+                      'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+                      'date': '1071078759'},
+                 ]},
+                {'name': '8sync',
+                 'url': 'https://ftp.gnu.org/gnu/8sync/',
+                 'time_modified': <'time' value of the directory entry>,
+                 'list_of_tarball': [
+                     {'archive':
+                      'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+                      'date': '1461357336'},
+                     {'archive':
+                      'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+                      'date': '1480991830'},
+                 ]},
+            ]
+        """
+        response = clean_up_response(response)
+        packages = []
+        for directory in response:
+            for repo in directory['contents']:
+                if repo['type'] != 'directory':
+                    continue
+                package_url = '%s/%s/%s/' % (self.BASE_URL,
+                                             directory['name'],
+                                             repo['name'])
+                list_of_tarball = find_all_tarball(
+                    repo['contents'], package_url)
+                if not list_of_tarball:
+                    # Directories without any tarball are not packages.
+                    continue
+                packages.append({
+                    'name': repo['name'],
+                    'url': package_url,
+                    # get_model_from_repo reads this key; it used to be
+                    # missing, which made the model conversion raise
+                    # KeyError.
+                    # NOTE(review): assumes directory entries carry the
+                    # same 'time' field as file entries -- confirm against
+                    # tree.json. None is stored when it is absent.
+                    'time_modified': repo.get('time'),
+                    'list_of_tarball': list_of_tarball,
+                })
+        random.shuffle(packages)
+        return packages
+
+    def get_model_from_repo(self, repo):
+        """Transform from repository representation to model
+
+        Args:
+            repo: one package dict; presumably an item of the list built
+                by ``list_packages`` (verify against SimpleLister)
+
+        Returns:
+            dict of column values for a ``GNUModel`` row
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['url'],
+            'origin_url': repo['url'],
+            # The key is misspelled ("upated") on purpose: it has to match
+            # the column name declared on GNUModel.
+            'time_last_upated': repo['time_modified'],
+            'origin_type': 'gnu',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """Transform response to list for model manipulation
+
+        Args:
+            response: iterable of package dicts
+
+        Returns:
+            list with one model dict per package
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+    def transport_request(self):
+        """Do nothing; always returns None."""
+        pass
+
+    def transport_response_to_string(self):
+        """Do nothing; always returns None."""
+        pass
+
+    def transport_quota_check(self):
+        """Do nothing; always returns None."""
+        pass
+
+
+# File name endings (lower-cased) recognised as source archives.
+_ARCHIVE_SUFFIXES = (
+    '.zip',
+    '.tar', '.tgz', '.tbz', '.tbz2', '.txz',
+    '.tar.gz', '.tar.bz', '.tar.bz2', '.tar.xz', '.tar.lz', '.tar.lzma',
+    '.tar.z',
+)
+
+
+def find_all_tarball(package_file_structure, url):
+    '''
+    Recursively list all the tarballs present in the folder and subfolders
+
+    A file is kept when its name ends with a known archive extension
+    (compared case-insensitively). Comparing whole extensions avoids both
+    missing archives such as ``.tar.bz2`` and picking up unrelated files
+    whose name merely contains "tar" or ends with "zip".
+
+    Args
+        package_file_structure : File structure of the package root directory
+        url : URL of the corresponding package, with a trailing slash
+
+    Returns
+        List of all the tarball urls with their time of last update
+        example-
+        For a package called 3dldf
+
+        [
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+             'date': '1071002600'},
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+             'date': '1071078759'},
+            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
+             'date': '1074278633'},
+            ...
+        ]
+    '''
+    list_of_tarball = []
+    for single_file in package_file_structure:
+        entry_type = single_file['type']
+        if entry_type == 'file':
+            if single_file['name'].lower().endswith(_ARCHIVE_SUFFIXES):
+                list_of_tarball.append({
+                    "archive": url + single_file['name'],
+                    "date": single_file['time']
+                })
+        elif entry_type == 'directory':
+            # Recurse into sub-folders; a directory entry without a
+            # 'contents' key is treated as empty.
+            list_of_tarball.extend(find_all_tarball(
+                single_file.get('contents', []),
+                url + single_file['name'] + '/'))
+
+    return list_of_tarball
+
+
+def clean_up_response(response):
+    '''
+    Reduce the JSON response to the top-level directories that hold
+    tarballs: the entries of ``response[0]['contents']`` named 'gnu',
+    'mirrors' or 'old-gnu', in their original order.
+    '''
+    wanted = ('gnu', 'mirrors', 'old-gnu')
+    top_level = response[0]['contents']
+    return [entry for entry in top_level if entry['name'] in wanted]
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String, Integer
+
+from ..core.models import ModelBase
+
+
+class GNUModel(ModelBase):
+ """a GNU repository representation
+
+ Rows live in the ``gnu_repo`` table and are keyed by ``uid``.
+ """
+ __tablename__ = 'gnu_repo'
+
+ # Primary key; GNULister.get_model_from_repo fills it with the package name.
+ uid = Column(String, primary_key=True)
+ # NOTE(review): "upated" is a misspelling of "updated". The same spelling
+ # is used as a dict key in GNULister.get_model_from_repo, so the column and
+ # that key have to be renamed together.
+ time_last_upated = Column(Integer)
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import GNULister
+
+
+@app.task(name=__name__ + '.GNUListerTask')
+def gnu_lister(**lister_args):
+    """Celery task: build a GNULister from ``lister_args`` and run it."""
+    lister = GNULister(**lister_args)
+    lister.run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ """Celery task that only returns 'OK'; the test suite calls it to check
+ that tasks of this module can be dispatched."""
+ return 'OK'
diff --git a/swh/lister/gnu/tests/__init__.py b/swh/lister/gnu/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ # Send the ping task by name, wait for it and check that it succeeded
+ # with the result 'OK'.
+ res = swh_app.send_task(
+ 'swh.lister.gnu.tasks.ping')
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.gnu.tasks.GNULister')
+def test_lister(lister, swh_app, celery_session_worker):
+ # Check that the GNUListerTask task instantiates the (mocked) lister
+ # without arguments and calls run() on it exactly once.
+ # setup the mocked GNULister
+ # The mock returns itself when called, so the assertions below can be
+ # made on both the "class" call and the "instance" methods.
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.gnu.tasks.GNUListerTask')
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with()
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -28,7 +28,7 @@
needed for the ingestion task creation.
"""
- _type = 'origin-update-%s' % origin_type
+ _type = 'load-%s' % origin_type
_policy = 'recurring'
project_name = kwargs.get('name')
project_metadata_url = kwargs.get('html_url')
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -3,6 +3,7 @@
[testenv:py3]
deps =
+ swh.core[http] >= 0.0.61
.[testing]
pytest-cov
commands =

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 1:48 AM (1 d, 23 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233943

Event Timeline