Page MenuHomeSoftware Heritage

No OneTemporary


diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,5 +11,11 @@
ignore_missing_imports = True
+ignore_missing_imports = True
+ignore_missing_imports = True
# [mypy-add_your_lib_here.*]
# ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
# should match names. For the full spec or
# dependency lines, see
diff --git a/swh/dataset/ b/swh/dataset/
new file mode 100644
--- /dev/null
+++ b/swh/dataset/
@@ -0,0 +1,44 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import click
+import uuid
+from swh.core import config
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.dataset.graph import export_edges, sort_graph_nodes
# NOTE(review): the head of the first decorator and the context-passing
# decorator were lost in SOURCE; `click.group(name=...)` and
# `click.pass_context` are reconstructed from the surviving arguments and the
# `ctx` parameter -- confirm against the real file.
@click.group(name='dataset', context_settings=CONTEXT_SETTINGS)
@click.option('--config-file', '-C', default=None,
              type=click.Path(exists=True, dir_okay=False),
              help="Configuration file.")
@click.pass_context
def cli(ctx, config_file):
    '''Software Heritage Dataset Tools'''
    # Sub-commands read their settings from ctx.obj['config'].
    ctx.ensure_object(dict)
    # NOTE(review): the right-hand side of this assignment was truncated in
    # SOURCE; loading the file through `swh.core.config.read` is presumed --
    # confirm.
    conf = config.read(config_file)
    ctx.obj['config'] = conf
# NOTE(review): the command registration and `click.pass_context` decorators
# were lost in SOURCE; they are reconstructed from the `ctx` parameter and the
# function name -- confirm the command name against the real file.
@cli.command('export-graph')
@click.argument('export-path', type=click.Path())
@click.option('--export-id', '-e', help="Unique ID of the export run.")
@click.option('--processes', '-p', default=1,
              help="Number of parallel processes")
@click.pass_context
def export_graph(ctx, export_path, export_id, processes):
    """Export the graph edges to EXPORT_PATH, then build the sorted node list.
    """
    config = ctx.obj['config']

    # Without an explicit ID every invocation gets a fresh random one.
    if not export_id:
        export_id = str(uuid.uuid4())

    print()
    print('== Edges export phase ==')
    export_edges(config, export_path, export_id, processes)

    print()
    print('== Sort phase ==')
    sort_graph_nodes(export_path, config)
diff --git a/swh/dataset/ b/swh/dataset/
new file mode 100644
--- /dev/null
+++ b/swh/dataset/
@@ -0,0 +1,171 @@
+import concurrent.futures
+import multiprocessing
+import tqdm
+import time
+from concurrent.futures import FIRST_EXCEPTION, ProcessPoolExecutor
+from confluent_kafka import TopicPartition
+from swh.journal.client import JournalClient
class JournalClientOffsetRanges(JournalClient):
    """
    Journal client that reads an explicit list of partitions, stops reading
    each one once a given end offset is reached, and reports its progress
    through a queue.

    Args:
        offset_ranges: mapping of partition id to a pair whose second item is
            the end offset of that partition (only that item is read here)
        assignment: list of partition ids this client has to consume
        progress_queue: queue receiving ``{partition_id: offset}`` progress
            items, then ``None`` once this client is finished
        refresh_every: a progress item is emitted when the number of
            deserialized messages is a multiple of this value
    """
    def __init__(self, *args, offset_ranges=None, assignment=None,
                 progress_queue=None, refresh_every=200, **kwargs):
        # NOTE(review): the attributes are set before delegating to the
        # parent constructor, presumably because it ends up calling
        # subscribe() -- confirm before reordering.
        self.offset_ranges = offset_ranges
        self.progress_queue = progress_queue
        self.refresh_every = refresh_every
        self.assignment = assignment
        self.count = 0
        super().__init__(*args, **kwargs)

    def subscribe(self):
        """Assign the remaining partitions of the topic to the consumer."""
        topic_name = self.subscription[0]
        # NOTE(review): the comment explaining this delay was truncated in
        # SOURCE; keep the sleep until its purpose is confirmed.
        time.sleep(0.1)
        self.consumer.assign([
            TopicPartition(topic_name, pid) for pid in self.assignment
        ])

    def process(self, *args, **kwargs):
        """
        Consume the assigned partitions until all of them reached their end
        offset, then signal completion on the progress queue.

        Positional and keyword arguments are forwarded to the parent
        ``process()``.
        """
        self.count = 0
        try:
            # Handle already committed partition offsets
            topic_name = self.subscription[0]
            committed = self.consumer.committed([
                TopicPartition(topic_name, pid) for pid in self.assignment
            ])
            for tp in committed:
                self.handle_offset(tp.partition, tp.offset)

            if not self.assignment:
                raise EOFError

            # Process the messages
            super().process(*args, **kwargs)
        except EOFError:
            # Raised once every assigned partition has been fully read.
            pass
        finally:
            # Always emit the end-of-work sentinel, including on a normal
            # return or an unexpected error, so that the consumer of the
            # queue is never left waiting for this client.
            self.progress_queue.put(None)

    def handle_offset(self, partition_id, offset):
        """
        Report progress for a partition and drop it from the assignment once
        its end offset has been reached.
        """
        if offset < 0:  # Uninitialized partition offset
            return

        if self.count % self.refresh_every == 0:
            self.progress_queue.put({partition_id: offset})

        if offset >= self.offset_ranges[partition_id][1] - 1:
            self.assignment = [pid for pid in self.assignment
                               if pid != partition_id]
            # Re-assign so the consumer stops reading the finished partition.
            self.subscribe()

    def deserialize_message(self, message):
        """
        Track the offset of the message, then defer to the parent
        implementation.

        Raises:
            EOFError: when no partition is left to consume
        """
        self.handle_offset(message.partition(), message.offset())
        self.count += 1

        if not self.assignment:
            raise EOFError

        return super().deserialize_message(message)
class ParallelExporter:
    """
    Base class running several journal clients in parallel processes, each
    reading a share of the partitions of one object type, next to one extra
    process displaying the overall progress.

    Args:
        config: configuration mapping; its ``'journal'`` entry is expanded
            into the journal client keyword arguments
        export_id: identifier of the export, used to build the consumer
            group id
        obj_type: object type to export
        processes: number of export worker processes
    """
    def __init__(self, config, export_id, obj_type, processes=1):
        self.config = config
        self.export_id = 'swh-dataset-export-{}'.format(export_id)
        self.obj_type = obj_type
        self.processes = processes
        self.offsets = None

    def get_offsets(self):
        """
        Return the ``{partition_id: (low, high)}`` watermark offsets of the
        topic, querying them on first use and caching the result.
        """
        if self.offsets is None:
            client = JournalClient(
                **self.config['journal'],
                object_types=[self.obj_type],
                group_id=self.export_id,
            )
            topic_name = client.subscription[0]
            topics = client.consumer.list_topics(topic_name).topics
            partitions = topics[topic_name].partitions

            # Fill a local dict and publish it only once complete, so that a
            # failure half-way through does not leave a partial cache behind.
            offsets = {}
            for partition_id in tqdm.tqdm(partitions.keys(),
                                          desc="  - Partition offsets"):
                tp = TopicPartition(topic_name, partition_id)
                (lo, hi) = client.consumer.get_watermark_offsets(tp)
                offsets[partition_id] = (lo, hi)
            self.offsets = offsets
        return self.offsets

    def run(self, *args):
        """
        Run the export: spread the partitions over the worker processes and
        wait for them, re-raising the first worker exception if any.

        Positional arguments are forwarded to ``export_worker()``.
        """
        self.get_offsets()
        to_assign = list(self.offsets.keys())

        manager = multiprocessing.Manager()
        q = manager.Queue()

        # One extra slot in the pool hosts the progress worker.
        with ProcessPoolExecutor(self.processes + 1) as pool:
            futures = []
            for i in range(self.processes):
                futures.append(pool.submit(
                    self.export_worker,
                    *args,
                    # Round-robin split of the partitions between workers.
                    assignment=to_assign[i::self.processes],
                    queue=q
                ))
            futures.append(pool.submit(self.progress_worker, queue=q))

            concurrent.futures.wait(futures, return_when=FIRST_EXCEPTION)
            for f in futures:
                # Only inspect finished futures: asking a pending or running
                # one for its exception would block until it completes.
                if not f.done():
                    continue
                exc = f.exception()
                if exc:
                    pool.shutdown(wait=False)
                    print(exc)
                    raise exc

    def progress_worker(self, *args, queue=None):
        """
        Display a progress bar fed by the items the export workers put on
        ``queue``; return once every worker has sent its ``None`` sentinel.
        """
        # Latest offset reported for each partition.
        d = {}
        active_workers = self.processes
        offset_diff = sum((hi - lo) for lo, hi in self.offsets.values())
        with tqdm.tqdm(total=offset_diff, desc="  - Journal export") as pbar:
            while active_workers:
                item = queue.get()
                if item is None:
                    active_workers -= 1
                    continue
                d.update(item)
                progress = sum(n - self.offsets[p][0] for p, n in d.items())
                pbar.set_postfix(active_workers=active_workers,
                                 total_workers=self.processes)
                # tqdm only takes increments: convert the absolute progress.
                pbar.update(progress - pbar.n)

    def process(self, callback, assignment=None, queue=None):
        """
        Feed ``callback`` with the journal messages of the partitions listed
        in ``assignment``, reporting progress on ``queue``.
        """
        client = JournalClientOffsetRanges(
            **self.config['journal'],
            object_types=[self.obj_type],
            group_id=self.export_id,
            # NOTE(review): looks like a leftover debugging setting --
            # confirm whether it should stay enabled.
            debug='cgrp,broker',
            offset_ranges=self.offsets,
            assignment=assignment,
            progress_queue=queue,
        )
        client.process(callback)

    def export_worker(self, *args, **kwargs):
        """
        Override this with a custom implementation of a worker function.

        A worker function should call `self.process(fn, **kwargs)` with `fn`
        being a callback that will be called in the same fashion as with
        `JournalClient.process()`.

        A simple exporter to print all the objects in the log would look like
        this:

        ```
        class PrintExporter(ParallelExporter):
            def export_worker(self, **kwargs):
                self.process(print, **kwargs)
        ```
        """
        raise NotImplementedError
diff --git a/swh/dataset/ b/swh/dataset/
new file mode 100644
--- /dev/null
+++ b/swh/dataset/
@@ -0,0 +1,121 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import functools
+import os
+import pathlib
+import shlex
+import subprocess
+import tempfile
+import uuid
+from swh.dataset.exporter import ParallelExporter
+from swh.dataset.utils import ZSTWriter
+from swh.model.identifiers import origin_identifier, persistent_identifier
def process_messages(messages, writer, config):
    """
    Write the graph edges found in a batch of journal messages.

    Each edge is written as one ``'<source pid> <destination pid>\\n'`` line.

    Args:
        messages: mapping of object type (``'origin_visit'``, ``'snapshot'``,
            ``'release'``, ``'revision'``, ``'directory'``) to a list of
            objects of that type
        writer: object whose ``write()`` method receives the edge lines
        config: mapping; when its ``'remove_pull_requests'`` entry is true,
            branches under ``refs/pull`` and ``refs/merge-requests`` are
            left out
    """
    pull_request_prefixes = (b'refs/pull', b'refs/merge-requests')
    entry_type_mapping = {
        'file': 'content',
        'dir': 'directory',
        'rev': 'revision'
    }

    def write(src, dst):
        src_type, src_id = src
        dst_type, dst_id = dst
        # An edge with a missing endpoint cannot be expressed: skip it.
        if src_id is None or dst_id is None:
            return
        src_pid = persistent_identifier(object_type=src_type, object_id=src_id)
        dst_pid = persistent_identifier(object_type=dst_type, object_id=dst_id)
        writer.write('{} {}\n'.format(src_pid, dst_pid))

    for visit in messages.get('origin_visit', []):
        write(('origin', origin_identifier({'url': visit['origin']['url']})),
              ('snapshot', visit['snapshot']))

    remove_pull_requests = config.get('remove_pull_requests')
    for snapshot in messages.get('snapshot', []):
        branches = snapshot['branches']
        for branch_name, branch in branches.items():
            # Follow alias branches down to the branch they designate. A
            # dangling alias resolves to None and an alias cycle is cut as
            # soon as a name repeats; both are skipped below.
            seen = {branch_name}
            while branch and branch.get('target_type') == 'alias':
                branch_name = branch['target']
                if branch_name in seen:
                    branch = None
                    break
                seen.add(branch_name)
                branch = branches.get(branch_name)
            if branch is None or not branch_name:
                continue
            # The prefixes are only filtered out when the option is enabled.
            if (remove_pull_requests
                    and branch_name.startswith(pull_request_prefixes)):
                continue
            write(('snapshot', snapshot['id']),
                  (branch['target_type'], branch['target']))

    for release in messages.get('release', []):
        write(('release', release['id']),
              (release['target_type'], release['target']))

    for revision in messages.get('revision', []):
        write(('revision', revision['id']),
              ('directory', revision['directory']))
        for parent in revision['parents']:
            write(('revision', revision['id']),
                  ('revision', parent))

    for directory in messages.get('directory', []):
        for entry in directory['entries']:
            write(('directory', directory['id']),
                  (entry_type_mapping[entry['type']], entry['target']))
class GraphEdgeExporter(ParallelExporter):
    """
    Exporter whose workers each write the edges they read to their own
    zstd-compressed file inside the export directory.
    """
    def export_worker(self, export_path, **kwargs):
        """
        Create ``export_path`` if needed, then export edges into a new
        uniquely named ``graph-<uuid>.edges.csv.zst`` file in it.

        Keyword arguments are forwarded to ``self.process()``.
        """
        output_dir = pathlib.Path(export_path)
        output_dir.mkdir(exist_ok=True, parents=True)
        # A random name per worker keeps parallel workers on separate files.
        file_name = 'graph-{}.edges.csv.zst'.format(str(uuid.uuid4()))
        with ZSTWriter(output_dir / file_name) as writer:
            handler = functools.partial(
                process_messages, writer=writer, config=self.config,
            )
            self.process(handler, **kwargs)
def export_edges(config, export_path, export_id, processes):
    """
    Export the graph edges of every supported object type, one type after
    the other.

    Args:
        config: configuration mapping handed to the exporter
        export_path: directory receiving the edge files
        export_id: identifier of the export run
        processes: number of parallel processes used for each object type
    """
    object_types = [
        'origin_visit',
        'snapshot',
        'release',
        'revision',
        'directory',
    ]
    for obj_type in object_types:
        print('{} edges:'.format(obj_type))
        exporter = GraphEdgeExporter(config, export_id, obj_type, processes)
        # The exporter does nothing until it is run; `export_path` is
        # forwarded to each worker.
        exporter.run(export_path)
def sort_graph_nodes(export_path, config):
    """
    Build ``graph.nodes.csv.zst``, the sorted list of distinct nodes
    appearing in the ``*.edges.csv.zst`` files of the export directory.

    The work is delegated to a shell pipeline relying on the ``zstdcat``,
    ``tr``, ``sort`` and ``zstdmt`` commands.

    Args:
        export_path: directory containing the edge files; also receives the
            node file
        config: mapping with the optional entries ``'sort_buffer_size'``
            (value of ``sort -S``, default ``'4G'``) and
            ``'disk_buffer_dir'`` (parent of the temporary directory given
            to ``sort -T``, default ``export_path``)
    """
    # Use bytes for the sorting algorithm (faster than being locale-specific)
    env = {
        **os.environ.copy(),
        'LC_ALL': 'C',
        'LC_COLLATE': 'C',
        'LANG': 'C',
    }
    sort_buffer_size = config.get('sort_buffer_size', '4G')
    disk_buffer_dir = config.get('disk_buffer_dir', export_path)
    with tempfile.TemporaryDirectory(prefix='.graph_node_sort_',
                                     dir=disk_buffer_dir) as buffer_path:
        # Every interpolated value is shell-quoted; the glob is appended
        # after the quoted directory so that the shell still expands it.
        # NOTE(review): the exit status of the pipeline is not checked.
        subprocess.run(
            ("zstdcat {export_path}/*.edges.csv.zst | "
             "tr ' ' '\\n' | "
             "sort -u -S{sort_buffer_size} -T{buffer_path} | "
             "zstdmt > {export_path}/graph.nodes.csv.zst")
            .format(
                export_path=shlex.quote(str(export_path)),
                buffer_path=shlex.quote(str(buffer_path)),
                sort_buffer_size=shlex.quote(str(sort_buffer_size)),
            ),
            shell=True,
            env=env,
        )
diff --git a/swh/dataset/ b/swh/dataset/
new file mode 100644
--- /dev/null
+++ b/swh/dataset/
@@ -0,0 +1,22 @@
+import subprocess
class ZSTWriter:
    """
    Context manager piping everything written to it through a ``zstd``
    subprocess that stores the compressed result in a file.

    Args:
        path: output file handed to ``zstd -o``
        mode: ``'wb'`` to write bytes; any other value writes text
    """
    def __init__(self, path, mode='w'):
        self.path = path
        self.mode = mode

    def __enter__(self):
        is_text = not (self.mode == 'wb')
        self.process = subprocess.Popen(
            ['zstd', '-q', '-o', self.path],
            text=is_text, stdin=subprocess.PIPE
        )
        return self

    def __exit__(self, exc_type, exc_value, tb):
        # Closing stdin tells zstd the input is complete. Wait for the
        # process even when closing fails, so that it is always reaped.
        try:
            self.process.stdin.close()
        finally:
            self.process.wait()

    def write(self, buf):
        """Send ``buf`` to the standard input of the compressor."""
        self.process.stdin.write(buf)

File Metadata

Mime Type
Dec 20 2024, 5:13 AM (11 w, 3 d ago)
Storage Engine
Storage Format
Raw Data
Storage Handle

Event Timeline