Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9348251
D7322.id27005.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Subscribers
None
D7322.id27005.diff
View Options
diff --git a/swh/dataset/exporter.py b/swh/dataset/exporter.py
--- a/swh/dataset/exporter.py
+++ b/swh/dataset/exporter.py
@@ -7,6 +7,7 @@
import pathlib
from types import TracebackType
from typing import Any, Dict, Optional, Type
+import uuid
class Exporter:
@@ -53,6 +54,14 @@
"""
raise NotImplementedError
+ def get_unique_file_id(self) -> str:
+ """
+ Return a unique random file id for the current process.
+
+ If config['test_unique_file_id'] is set, it will be used instead.
+ """
+ return str(self.config.get("test_unique_file_id", uuid.uuid4()))
+
class ExporterDispatch(Exporter):
"""
diff --git a/swh/dataset/exporters/edges.py b/swh/dataset/exporters/edges.py
--- a/swh/dataset/exporters/edges.py
+++ b/swh/dataset/exporters/edges.py
@@ -10,7 +10,6 @@
import subprocess
import tempfile
from typing import Tuple
-import uuid
from swh.dataset.exporter import ExporterDispatch
from swh.dataset.utils import ZSTFile, remove_pull_requests
@@ -42,7 +41,7 @@
if obj_type not in self.writers:
dataset_path = self.export_path / obj_type.name.lower()
dataset_path.mkdir(exist_ok=True)
- unique_id = str(uuid.uuid4())
+ unique_id = self.get_unique_file_id()
nodes_file = dataset_path / ("graph-{}.nodes.csv.zst".format(unique_id))
edges_file = dataset_path / ("graph-{}.edges.csv.zst".format(unique_id))
node_writer = self.exit_stack.enter_context(ZSTFile(str(nodes_file), "w"))
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -6,7 +6,6 @@
from datetime import datetime
import math
from typing import Any, Optional, Tuple, Type, cast
-import uuid
from pyorc import (
BigInt,
@@ -117,7 +116,7 @@
if table_name not in self.writers:
object_type_dir = self.export_path / table_name
object_type_dir.mkdir(exist_ok=True)
- unique_id = str(uuid.uuid4())
+ unique_id = self.get_unique_file_id()
export_file = object_type_dir / ("graph-{}.orc".format(unique_id))
export_obj = self.exit_stack.enter_context(export_file.open("wb"))
self.writers[table_name] = self.exit_stack.enter_context(
diff --git a/swh/dataset/utils.py b/swh/dataset/utils.py
--- a/swh/dataset/utils.py
+++ b/swh/dataset/utils.py
@@ -29,7 +29,7 @@
is_text = not (self.mode in ("rb", "wb"))
writing = self.mode in ("w", "wb")
if writing:
- cmd = ["zstd", "-q", "-o", self.path]
+ cmd = ["zstd", "-f", "-q", "-o", self.path]
else:
cmd = ["zstdcat", self.path]
self.process = subprocess.Popen(
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Jul 3 2025, 6:20 PM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220819
Attached To
D7322: Exporters: add option to write in a deterministic location
Event Timeline
Log In to Comment