Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9749663
D6066.id21965.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D6066.id21965.diff
View Options
diff --git a/swh/dataset/exporters/edges.py b/swh/dataset/exporters/edges.py
--- a/swh/dataset/exporters/edges.py
+++ b/swh/dataset/exporters/edges.py
@@ -9,20 +9,20 @@
import shlex
import subprocess
import tempfile
+from typing import Tuple
import uuid
from swh.dataset.exporter import ExporterDispatch
from swh.dataset.utils import ZSTFile, remove_pull_requests
-from swh.model.hashutil import hash_to_bytes
-from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID, origin_identifier
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.identifiers import ExtendedObjectType, origin_identifier
def swhid(object_type, object_id):
- return str(
- ExtendedSWHID(
- object_type=ExtendedObjectType[object_type.upper()], object_id=object_id
- )
- )
+ # We use string interpolation here instead of using ExtendedSWHID to format,
+ # as building temporary ExtendedSWHID objects has a non-negligible impact
+ # on performance.
+ return f"swh:1:{object_type.value}:{hash_to_hex(object_id)}"
class GraphEdgesExporter(ExporterDispatch):
@@ -37,9 +37,9 @@
super().__init__(*args, **kwargs)
self.writers = {}
- def get_writers_for(self, obj_type: str):
+ def get_writers_for(self, obj_type: ExtendedObjectType):
if obj_type not in self.writers:
- dataset_path = self.export_path / obj_type
+ dataset_path = self.export_path / obj_type.name.lower()
dataset_path.mkdir(exist_ok=True)
unique_id = str(uuid.uuid4())
nodes_file = dataset_path / ("graph-{}.nodes.csv.zst".format(unique_id))
@@ -49,13 +49,13 @@
self.writers[obj_type] = (node_writer, edge_writer)
return self.writers[obj_type]
- def get_node_writer_for(self, obj_type: str):
+ def get_node_writer_for(self, obj_type: ExtendedObjectType):
return self.get_writers_for(obj_type)[0]
- def get_edge_writer_for(self, obj_type: str):
+ def get_edge_writer_for(self, obj_type: ExtendedObjectType):
return self.get_writers_for(obj_type)[1]
- def write_node(self, node):
+ def write_node(self, node: Tuple[ExtendedObjectType, bytes]):
node_type, node_id = node
if node_id is None:
return
@@ -63,7 +63,13 @@
node_writer = self.get_node_writer_for(node_type)
node_writer.write("{}\n".format(node_swhid))
- def write_edge(self, src, dst, *, labels=None):
+ def write_edge(
+ self,
+ src: Tuple[ExtendedObjectType, bytes],
+ dst: Tuple[ExtendedObjectType, bytes],
+ *,
+ labels=None,
+ ):
src_type, src_id = src
dst_type, dst_id = dst
if src_id is None or dst_id is None:
@@ -76,17 +82,20 @@
def process_origin(self, origin):
origin_id = hash_to_bytes(origin_identifier({"url": origin["url"]}))
- self.write_node(("origin", origin_id))
+ self.write_node((ExtendedObjectType.ORIGIN, origin_id))
def process_origin_visit_status(self, visit_status):
origin_id = hash_to_bytes(origin_identifier({"url": visit_status["origin"]}))
- self.write_edge(("origin", origin_id), ("snapshot", visit_status["snapshot"]))
+ self.write_edge(
+ (ExtendedObjectType.ORIGIN, origin_id),
+ (ExtendedObjectType.SNAPSHOT, visit_status["snapshot"]),
+ )
def process_snapshot(self, snapshot):
if self.config.get("remove_pull_requests"):
remove_pull_requests(snapshot)
- self.write_node(("snapshot", snapshot["id"]))
+ self.write_node((ExtendedObjectType.SNAPSHOT, snapshot["id"]))
for branch_name, branch in snapshot["branches"].items():
original_branch_name = branch_name
while branch and branch.get("target_type") == "alias":
@@ -95,41 +104,46 @@
if branch is None or not branch_name:
continue
self.write_edge(
- ("snapshot", snapshot["id"]),
- (branch["target_type"], branch["target"]),
+ (ExtendedObjectType.SNAPSHOT, snapshot["id"]),
+ (ExtendedObjectType[branch["target_type"].upper()], branch["target"]),
labels=[base64.b64encode(original_branch_name).decode(),],
)
def process_release(self, release):
- self.write_node(("release", release["id"]))
+ self.write_node((ExtendedObjectType.RELEASE, release["id"]))
self.write_edge(
- ("release", release["id"]), (release["target_type"], release["target"])
+ (ExtendedObjectType.RELEASE, release["id"]),
+ (ExtendedObjectType[release["target_type"].upper()], release["target"]),
)
def process_revision(self, revision):
- self.write_node(("revision", revision["id"]))
+ self.write_node((ExtendedObjectType.REVISION, revision["id"]))
self.write_edge(
- ("revision", revision["id"]), ("directory", revision["directory"])
+ (ExtendedObjectType.REVISION, revision["id"]),
+ (ExtendedObjectType.DIRECTORY, revision["directory"]),
)
for parent in revision["parents"]:
- self.write_edge(("revision", revision["id"]), ("revision", parent))
+ self.write_edge(
+ (ExtendedObjectType.REVISION, revision["id"]),
+ (ExtendedObjectType.REVISION, parent),
+ )
def process_directory(self, directory):
- self.write_node(("directory", directory["id"]))
+ self.write_node((ExtendedObjectType.DIRECTORY, directory["id"]))
for entry in directory["entries"]:
entry_type_mapping = {
- "file": "content",
- "dir": "directory",
- "rev": "revision",
+ "file": ExtendedObjectType.CONTENT,
+ "dir": ExtendedObjectType.DIRECTORY,
+ "rev": ExtendedObjectType.REVISION,
}
self.write_edge(
- ("directory", directory["id"]),
+ (ExtendedObjectType.DIRECTORY, directory["id"]),
(entry_type_mapping[entry["type"]], entry["target"]),
labels=[base64.b64encode(entry["name"]).decode(), str(entry["perms"])],
)
def process_content(self, content):
- self.write_node(("content", content["sha1_git"]))
+ self.write_node((ExtendedObjectType.CONTENT, content["sha1_git"]))
def sort_graph_nodes(export_path, config):
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 24, 6:05 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233766
Attached To
D6066: exporters/edges: Make swhid() format directly instead of instantiating ExtendedSWHID
Event Timeline
Log In to Comment