Page Menu | Home | Software Heritage

D7322.id27005.diff
No One | Temporary

D7322.id27005.diff

diff --git a/swh/dataset/exporter.py b/swh/dataset/exporter.py
--- a/swh/dataset/exporter.py
+++ b/swh/dataset/exporter.py
@@ -7,6 +7,7 @@
import pathlib
from types import TracebackType
from typing import Any, Dict, Optional, Type
+import uuid
class Exporter:
@@ -53,6 +54,14 @@
"""
raise NotImplementedError
+ def get_unique_file_id(self) -> str:
+ """
+ Return a unique random file id for the current process.
+
+ If config['test_unique_file_id'] is set, it will be used instead.
+ """
+ return str(self.config.get("test_unique_file_id", uuid.uuid4()))
+
class ExporterDispatch(Exporter):
"""
diff --git a/swh/dataset/exporters/edges.py b/swh/dataset/exporters/edges.py
--- a/swh/dataset/exporters/edges.py
+++ b/swh/dataset/exporters/edges.py
@@ -10,7 +10,6 @@
import subprocess
import tempfile
from typing import Tuple
-import uuid
from swh.dataset.exporter import ExporterDispatch
from swh.dataset.utils import ZSTFile, remove_pull_requests
@@ -42,7 +41,7 @@
if obj_type not in self.writers:
dataset_path = self.export_path / obj_type.name.lower()
dataset_path.mkdir(exist_ok=True)
- unique_id = str(uuid.uuid4())
+ unique_id = self.get_unique_file_id()
nodes_file = dataset_path / ("graph-{}.nodes.csv.zst".format(unique_id))
edges_file = dataset_path / ("graph-{}.edges.csv.zst".format(unique_id))
node_writer = self.exit_stack.enter_context(ZSTFile(str(nodes_file), "w"))
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -6,7 +6,6 @@
from datetime import datetime
import math
from typing import Any, Optional, Tuple, Type, cast
-import uuid
from pyorc import (
BigInt,
@@ -117,7 +116,7 @@
if table_name not in self.writers:
object_type_dir = self.export_path / table_name
object_type_dir.mkdir(exist_ok=True)
- unique_id = str(uuid.uuid4())
+ unique_id = self.get_unique_file_id()
export_file = object_type_dir / ("graph-{}.orc".format(unique_id))
export_obj = self.exit_stack.enter_context(export_file.open("wb"))
self.writers[table_name] = self.exit_stack.enter_context(
diff --git a/swh/dataset/utils.py b/swh/dataset/utils.py
--- a/swh/dataset/utils.py
+++ b/swh/dataset/utils.py
@@ -29,7 +29,7 @@
is_text = not (self.mode in ("rb", "wb"))
writing = self.mode in ("w", "wb")
if writing:
- cmd = ["zstd", "-q", "-o", self.path]
+ cmd = ["zstd", "-f", "-q", "-o", self.path]
else:
cmd = ["zstdcat", self.path]
self.process = subprocess.Popen(

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 6:20 PM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220819

Event Timeline