# Patch summary (free-text preamble placed before the first "diff --git" header).
#
# * swh/dataset/exporter.py: adds Exporter.get_unique_file_id(), which returns
#   str(config["test_unique_file_id"]) when that key is present in self.config
#   and str(uuid.uuid4()) otherwise.  The uuid4() default is evaluated on every
#   call because it is passed as the dict.get() fallback argument.
# * swh/dataset/exporters/edges.py, swh/dataset/exporters/orc.py: build the
#   "graph-<id>" output file names from self.get_unique_file_id() instead of
#   calling uuid.uuid4() directly, and drop the now-unused "import uuid".
# * swh/dataset/utils.py: adds "-f" to the zstd command line used for writing.
#   Presumably this lets a fixed test id overwrite an existing output file --
#   TODO confirm against the zstd manual.
#
# NOTE(review): line breaks and context-line indentation were reconstructed
# from the hunk line counts and from Python block structure; verify them
# against the target tree before applying this patch.
diff --git a/swh/dataset/exporter.py b/swh/dataset/exporter.py
--- a/swh/dataset/exporter.py
+++ b/swh/dataset/exporter.py
@@ -7,6 +7,7 @@
 import pathlib
 from types import TracebackType
 from typing import Any, Dict, Optional, Type
+import uuid
 
 
 class Exporter:
@@ -53,6 +54,14 @@
         """
         raise NotImplementedError
 
+    def get_unique_file_id(self) -> str:
+        """
+        Return a unique random file id for the current process.
+
+        If config['test_unique_file_id'] is set, it will be used instead.
+        """
+        return str(self.config.get("test_unique_file_id", uuid.uuid4()))
+
 
 class ExporterDispatch(Exporter):
     """
diff --git a/swh/dataset/exporters/edges.py b/swh/dataset/exporters/edges.py
--- a/swh/dataset/exporters/edges.py
+++ b/swh/dataset/exporters/edges.py
@@ -10,7 +10,6 @@
 import subprocess
 import tempfile
 from typing import Tuple
-import uuid
 
 from swh.dataset.exporter import ExporterDispatch
 from swh.dataset.utils import ZSTFile, remove_pull_requests
@@ -42,7 +41,7 @@
         if obj_type not in self.writers:
             dataset_path = self.export_path / obj_type.name.lower()
             dataset_path.mkdir(exist_ok=True)
-            unique_id = str(uuid.uuid4())
+            unique_id = self.get_unique_file_id()
             nodes_file = dataset_path / ("graph-{}.nodes.csv.zst".format(unique_id))
             edges_file = dataset_path / ("graph-{}.edges.csv.zst".format(unique_id))
             node_writer = self.exit_stack.enter_context(ZSTFile(str(nodes_file), "w"))
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -6,7 +6,6 @@
 from datetime import datetime
 import math
 from typing import Any, Optional, Tuple, Type, cast
-import uuid
 
 from pyorc import (
     BigInt,
@@ -117,7 +116,7 @@
         if table_name not in self.writers:
             object_type_dir = self.export_path / table_name
             object_type_dir.mkdir(exist_ok=True)
-            unique_id = str(uuid.uuid4())
+            unique_id = self.get_unique_file_id()
             export_file = object_type_dir / ("graph-{}.orc".format(unique_id))
             export_obj = self.exit_stack.enter_context(export_file.open("wb"))
             self.writers[table_name] = self.exit_stack.enter_context(
diff --git a/swh/dataset/utils.py b/swh/dataset/utils.py
--- a/swh/dataset/utils.py
+++ b/swh/dataset/utils.py
@@ -29,7 +29,7 @@
         is_text = not (self.mode in ("rb", "wb"))
         writing = self.mode in ("w", "wb")
         if writing:
-            cmd = ["zstd", "-q", "-o", self.path]
+            cmd = ["zstd", "-f", "-q", "-o", self.path]
         else:
             cmd = ["zstdcat", self.path]
         self.process = subprocess.Popen(