Page Menu | Home | Software Heritage

D2629.diff
No One | Temporary

D2629.diff

diff --git a/swh/dataset/cli.py b/swh/dataset/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/dataset/cli.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import functools
+import uuid
+
+from swh.core import config
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.dataset import graph
+from swh.journal.cli import get_journal_client
+
+
+@click.group(name='dataset', context_settings=CONTEXT_SETTINGS)
+@click.option('--config-file', '-C', default=None,
+              type=click.Path(exists=True, dir_okay=False,),
+              help="Configuration file.")
+@click.pass_context
+def cli(ctx, config_file):
+    '''Software Heritage Dataset Tools'''
+    # Ensure ctx.obj is a dict, then store the configuration read from
+    # config_file on it under the 'config' key.
+    ctx.ensure_object(dict)
+
+    conf = config.read(config_file)
+    ctx.obj['config'] = conf
+
+
+@cli.command('export-graph')
+@click.argument('export-path', type=click.File('w'))
+@click.option('--export-id', '-e', help="Unique ID of the export run.")
+@click.pass_context
+def export_graph(ctx, export_path, export_id):
+    '''Export the graph edges of journal objects to EXPORT_PATH.'''
+    # No ID given: draw a random one, so that the group_id built below
+    # differs from one run to the next.
+    if not export_id:
+        export_id = str(uuid.uuid4())
+    client = get_journal_client(
+        ctx,
+        object_types=[
+            'origin_visit',
+            'snapshot',
+            'release',
+            'revision',
+            'directory',
+        ],
+        group_id=('swh-dataset-export-' + export_id)
+    )
+    # Bind the opened export file as the output of graph.process_messages
+    # before handing that function to the journal client.
+    process_messages = functools.partial(graph.process_messages,
+                                         output=export_path)
+    client.process(process_messages)
diff --git a/swh/dataset/graph.py b/swh/dataset/graph.py
new file mode 100644
--- /dev/null
+++ b/swh/dataset/graph.py
@@ -0,0 +1,61 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.identifiers import origin_identifier
+from swh.model.hashutil import hash_to_hex
+
+
+def process_messages(messages, output):
+    '''Write the graph edges found in a batch of messages to output.
+
+    Every edge is printed as one line: the hex form of the source and of
+    the destination identifier, separated by a space.
+
+    Args:
+        messages: dict mapping an object type ('origin_visit', 'snapshot',
+            'release', 'revision', 'directory') to a list of objects of
+            that type; a missing type is treated as an empty list.
+        output: file object the edges are printed to.
+
+    Raises:
+        KeyError: if an alias branch names a branch absent from its
+            snapshot, or an object lacks one of the keys read here.
+    '''
+    def write(src, dst):
+        # An edge with a missing endpoint is not written.
+        if src is None or dst is None:
+            return
+        print(hash_to_hex(src), hash_to_hex(dst), file=output)
+
+    for visit in messages.get('origin_visit', []):
+        write(origin_identifier({'url': visit['origin']}), visit['snapshot'])
+
+    for snapshot in messages.get('snapshot', []):
+        branches = snapshot['branches']
+        for branch in branches.values():
+            # Follow alias chains down to the branch they end on. A None
+            # branch (presumably a dangling one -- confirm against the
+            # snapshot model) and an alias cycle both yield no edge,
+            # instead of a TypeError or an endless loop.
+            seen = set()
+            while branch is not None and branch['target_type'] == 'alias':
+                if branch['target'] in seen:
+                    branch = None
+                    break
+                seen.add(branch['target'])
+                branch = branches[branch['target']]
+            if branch is not None:
+                write(snapshot['id'], branch['target'])
+
+    for release in messages.get('release', []):
+        write(release['id'], release['target'])
+
+    for revision in messages.get('revision', []):
+        for parent in revision['parents']:
+            write(revision['id'], parent)
+
+    for directory in messages.get('directory', []):
+        for entry in directory['entries']:
+            write(directory['id'], entry['target'])

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 4:13 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218248

Event Timeline