Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124271
D2629.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
3 KB
Subscribers
None
D2629.diff
View Options
diff --git a/swh/dataset/cli.py b/swh/dataset/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/dataset/cli.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import functools
+import uuid
+
+from swh.core import config
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.dataset import graph
+from swh.journal.cli import get_journal_client
+
+
+@click.group(name='dataset', context_settings=CONTEXT_SETTINGS)
+@click.option('--config-file', '-C', default=None,
+              type=click.Path(exists=True, dir_okay=False,),
+              help="Configuration file.")
+@click.pass_context
+def cli(ctx, config_file):
+    '''Software Heritage Dataset Tools'''
+    # Make sure a dict is available on the click context before storing
+    # anything in it.
+    ctx.ensure_object(dict)
+
+    # Parse the configuration file (config_file is None when the option is
+    # omitted) and stash the result under the 'config' key, where commands
+    # that receive ctx can reach it.
+    ctx.obj['config'] = config.read(config_file)
+
+
+@cli.command('export-graph')
+@click.argument('export-path', type=click.File('w'))
+@click.option('--export-id', '-e', help="Unique ID of the export run.")
+@click.pass_context
+def export_graph(ctx, export_path, export_id):
+    '''Export the graph edges read from the journal to EXPORT_PATH.
+
+    Journal messages for origin visits, snapshots, releases, revisions and
+    directories are consumed and handed to graph.process_messages, which
+    writes one "source destination" line per edge to EXPORT_PATH.
+
+    If --export-id is not given, a random UUID is generated. The ID is
+    appended to 'swh-dataset-export-' to build the journal client group ID.
+    '''
+    if not export_id:
+        # No ID supplied on the command line: generate a fresh one.
+        export_id = str(uuid.uuid4())
+
+    # Object types whose messages are turned into graph edges.
+    object_types = [
+        'origin_visit',
+        'snapshot',
+        'release',
+        'revision',
+        'directory',
+    ]
+    client = get_journal_client(
+        ctx,
+        object_types=object_types,
+        group_id=('swh-dataset-export-' + export_id)
+    )
+
+    # Bind the opened output file so the client only has to supply the
+    # batch of messages on each call.
+    process_messages = functools.partial(graph.process_messages,
+                                         output=export_path)
+    client.process(process_messages)
diff --git a/swh/dataset/graph.py b/swh/dataset/graph.py
new file mode 100644
--- /dev/null
+++ b/swh/dataset/graph.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.identifiers import origin_identifier
+from swh.model.hashutil import hash_to_hex
+
+
+def process_messages(messages, output):
+    '''Write the graph edges found in a batch of journal messages.
+
+    Each edge is printed to ``output`` as one line holding the hexadecimal
+    source and destination identifiers separated by a space. Edges with a
+    missing (None) source or destination are skipped.
+
+    Args:
+        messages: mapping from object type name ('origin_visit',
+            'snapshot', 'release', 'revision', 'directory') to an iterable
+            of object dicts; absent types are treated as empty.
+        output: writable text file object that receives the edges.
+    '''
+    def write(src, dst):
+        # Objects may lack a target (e.g. a visit without snapshot).
+        if src is None or dst is None:
+            return
+        print(hash_to_hex(src), hash_to_hex(dst), file=output)
+
+    for visit in messages.get('origin_visit', []):
+        write(origin_identifier({'url': visit['origin']}), visit['snapshot'])
+
+    for snapshot in messages.get('snapshot', []):
+        branches = snapshot['branches']
+        for branch in branches.values():
+            # Follow alias chains down to a concrete branch. A branch value
+            # may be None (presumably a dangling branch -- confirm against
+            # the data model), an alias may name a branch that is absent
+            # from the snapshot, and aliases may form a cycle; none of
+            # these yields an edge.
+            seen = set()
+            while branch is not None and branch['target_type'] == 'alias':
+                alias = branch['target']
+                if alias in seen:
+                    branch = None
+                    break
+                seen.add(alias)
+                branch = branches.get(alias)
+            if branch is None:
+                continue
+            write(snapshot['id'], branch['target'])
+
+    for release in messages.get('release', []):
+        write(release['id'], release['target'])
+
+    for revision in messages.get('revision', []):
+        for parent in revision['parents']:
+            write(revision['id'], parent)
+
+    for directory in messages.get('directory', []):
+        for entry in directory['entries']:
+            write(directory['id'], entry['target'])
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 4:13 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218248
Attached To
D2629: dataset: add graph export based on kafka
Event Timeline
Log In to Comment