Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123182
D1482.id4989.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
13 KB
Subscribers
None
D1482.id4989.diff
View Options
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
- `swh.lister.pypi`
- `swh.lister.npm`
- `swh.lister.phabricator`
+- `swh.lister.gnu`
Dependencies
------------
@@ -177,6 +178,18 @@
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
+## lister-gnu
+
+Once configured, you can execute a GNU lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.gnu.tasks import gnu_lister
+
+logging.basicConfig(level=logging.DEBUG)
+gnu_lister()
+```
+
Licensing
---------
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,8 @@
entry_points='''
[console_scripts]
swh-lister=swh.lister.cli:cli
+ [swh.cli.subcommands]
+ lister=swh.lister.cli:lister
''',
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -6,14 +6,23 @@
import logging
import click
+from swh.core.cli import CONTEXT_SETTINGS
+
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator']
+ 'npm', 'phabricator', 'gnu']
+
+@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def lister(ctx):
+ '''Software Heritage Lister tools.'''
+ pass
-@click.command()
+
+@lister.command(name='db-init', context_settings=CONTEXT_SETTINGS)
@click.option(
'--db-url', '-d', default='postgres:///lister-gitlab.com',
help='SQLAlchemy DB URL; see '
@@ -22,8 +31,9 @@
type=click.Choice(SUPPORTED_LISTERS + ['all']))
@click.option('--drop-tables', '-D', is_flag=True, default=False,
help='Drop tables before creating the database schema')
-def cli(db_url, listers, drop_tables):
- """Initialize db model according to lister.
+@click.pass_context
+def cli(ctx, db_url, listers, drop_tables):
+ """Initialize the database model for given listers.
"""
override_conf = {
@@ -105,6 +115,11 @@
api_token='',
override_config=override_conf)
+ elif lister == 'gnu':
+ from .gnu.models import ModelBase
+ from .gnu.lister import GNULister
+ _lister = GNULister(override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py
--- a/swh/lister/core/lister_base.py
+++ b/swh/lister/core/lister_base.py
@@ -414,7 +414,7 @@
Returns:
the same information in a different form
"""
- _type = 'origin-update-%s' % origin_type
+ _type = 'load-%s' % origin_type
_policy = 'recurring'
return utils.create_task_dict(_type, _policy, origin_url)
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
+ 'swh.lister.gnu.tasks'
]
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/lister.py
@@ -0,0 +1,200 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import gzip
+import json
+import requests
+
+from .models import GNUModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+
+
+class GNULister(SimpleLister):
+ MODEL = GNUModel
+ LISTER_NAME = 'gnu'
+ TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
+ BASE_URL = 'https://ftp.gnu.org'
+
+ def task_dict(self, origin_type, origin_url, **kwargs):
+ """
+ Return task format dict
+
+ This is overridden from the lister_base as more information is
+ needed for the ingestion task creation.
+ """
+ return utils.create_task_dict(
+ 'load-%s' % origin_type, 'recurring', kwargs.get('name'),
+ origin_url, list_of_tarball=kwargs.get('list_of_tarball'))
+
+ def get_file(self):
+ '''
+ Download and decompress the tree.json.gz file and return its content
+ parsed from JSON
+
+ Returns
+ File content parsed from JSON
+ '''
+ response = requests.get(self.TREE_URL,
+ allow_redirects=True)
+ uncompressed_content = gzip.decompress(response.content)
+ return json.loads(uncompressed_content.decode('utf-8'))
+
+ def safely_issue_request(self, identifier):
+ '''
+ Make a network request to download the file which
+ describes the file structure of the GNU website.
+
+ Args:
+ identifier: resource identifier
+ Returns:
+ server response
+ '''
+ response = self.get_file()
+ return response
+
+ def list_packages(self, response):
+ """
+ List the actual gnu origins with their names and
+ time last updated from the response.
+
+ Args:
+ response : File structure of the website
+ in JSON format
+
+ Returns:
+ a list of all the packages with their names, url of their root
+ directory and the tarballs present for the particular package.
+ [
+ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+ 'list_of_tarball':
+ [
+ {'archive':
+ 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'date': '1071002600'},
+ {'archive':
+ 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'date': '1071078759'}
+ ]
+ },
+ {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
+ 'list_of_tarball':
+ [
+ {'archive':
+ 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'date': '1461357336'},
+ {'archive':
+ 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'date': '1480991830'}
+ ]}
+ ]
+ """
+ response = clean_up_response(response)
+ packages = []
+ for directory in response:
+ content = directory['contents']
+ for repo in content:
+ if repo['type'] == 'directory':
+ package_url = '%s/%s/%s/' % (self.BASE_URL,
+ directory['name'],
+ repo['name'])
+ list_of_tarball = find_all_tarball(
+ repo['contents'], package_url)
+ if list_of_tarball != []:
+ repo_details = {
+ 'name': repo['name'],
+ 'url': package_url,
+ 'list_of_tarball': list_of_tarball
+ }
+ packages.append(repo_details)
+ random.shuffle(packages)
+ return packages
+
+ def get_model_from_repo(self, repo):
+ """Transform from repository representation to model
+
+ """
+ return {
+ 'uid': repo['name'],
+ 'name': repo['name'],
+ 'full_name': repo['name'],
+ 'html_url': repo['url'],
+ 'origin_url': repo['url'],
+ 'time_last_upated': repo['time_modified'],
+ 'origin_type': 'gnu',
+ 'description': None,
+ }
+
+ def transport_response_simplified(self, response):
+ """Transform response to list for model manipulation
+
+ """
+ return [self.get_model_from_repo(repo) for repo in response]
+
+ def transport_request(self):
+ pass
+
+ def transport_response_to_string(self):
+ pass
+
+ def transport_quota_check(self):
+ pass
+
+
+def find_all_tarball(package_file_structure, url):
+ '''
+ Recursively list all the tarballs present in the folder and its subfolders
+
+ Args
+ package_file_structure : File structure of the package root directory
+ url : URL of the corresponding package
+
+ Returns
+ List of all the tarball urls and the time of their last update
+ example-
+ For a package called 3dldf
+
+ [
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'date': '1071002600'},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'date': '1071078759'},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
+ 'date': '1074278633'},
+ ...
+ ]
+ '''
+ list_of_tarball = []
+ for single_file in package_file_structure:
+
+ if single_file['type'] == 'file':
+ if(single_file['name'][-6:-3] == "tar" or
+ single_file['name'][-3:] == "zip"):
+ list_of_tarball.append({
+ "archive": url + single_file['name'],
+ "date": single_file['time']
+ })
+ # It will recursively check for tarballs in all sub-folders
+ elif single_file['type'] == 'directory':
+ list_of_tarballs_in_dir = find_all_tarball(
+ single_file['contents'],
+ url + single_file['name'] + '/')
+ list_of_tarball.extend(list_of_tarballs_in_dir)
+
+ return list_of_tarball
+
+
+def clean_up_response(response):
+ '''
+ Clean up the JSON response by keeping only the top-level directories
+ ('gnu', 'mirrors', 'old-gnu') which have tarballs
+ '''
+ final_response = []
+ file_system = response[0]['contents']
+ for directory in file_system:
+ if directory['name'] in ('gnu', 'mirrors', 'old-gnu'):
+ final_response.append(directory)
+ return final_response
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String, Integer
+
+from ..core.models import ModelBase
+
+
+class GNUModel(ModelBase):
+ """a GNU repository representation
+
+ """
+ __tablename__ = 'gnu_repo'
+
+ uid = Column(String, primary_key=True)
+ time_last_upated = Column(Integer)
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import GNULister
+
+
+@app.task(name=__name__ + '.GNUListerTask')
+def gnu_lister(**lister_args):
+ GNULister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ return 'OK'
diff --git a/swh/lister/gnu/tests/__init__.py b/swh/lister/gnu/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ res = swh_app.send_task(
+ 'swh.lister.gnu.tasks.ping')
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.gnu.tasks.GNULister')
+def test_lister(lister, swh_app, celery_session_worker):
+ # setup the mocked GNULister
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.gnu.tasks.GNUListerTask')
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with()
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -28,7 +28,7 @@
needed for the ingestion task creation.
"""
- _type = 'origin-update-%s' % origin_type
+ _type = 'load-%s' % origin_type
_policy = 'recurring'
project_name = kwargs.get('name')
project_metadata_url = kwargs.get('html_url')
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -3,6 +3,7 @@
[testenv:py3]
deps =
+ swh.core[http] >= 0.0.61
.[testing]
pytest-cov
commands =
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 1:48 AM (2 d, 5 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233943
Attached To
D1482: GNU Lister
Event Timeline
Log In to Comment