Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/exporters/edges.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | import base64 | ||||
import os | import os | ||||
import os.path | import os.path | ||||
import shlex | import shlex | ||||
import subprocess | import subprocess | ||||
import tempfile | import tempfile | ||||
import uuid | import uuid | ||||
from swh.dataset.exporter import ExporterDispatch | from swh.dataset.exporter import ExporterDispatch | ||||
from swh.dataset.utils import ZSTFile, remove_pull_requests | from swh.dataset.utils import ZSTFile, remove_pull_requests | ||||
from swh.model.identifiers import origin_identifier, swhid | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID, origin_identifier | |||||
def swhid(object_type, object_id): | |||||
return str( | |||||
ExtendedSWHID( | |||||
object_type=ExtendedObjectType[object_type.upper()], object_id=object_id | |||||
) | |||||
) | |||||
class GraphEdgesExporter(ExporterDispatch): | class GraphEdgesExporter(ExporterDispatch): | ||||
""" | """ | ||||
Implementation of an exporter which writes all the graph edges | Implementation of an exporter which writes all the graph edges | ||||
of a specific type to a Zstandard-compressed CSV file. | of a specific type to a Zstandard-compressed CSV file. | ||||
Each row of the CSV is in the format: `<SRC SWHID> <DST SWHID> | Each row of the CSV is in the format: `<SRC SWHID> <DST SWHID> | ||||
Show All 36 Lines | def write_edge(self, src, dst, *, labels=None): | ||||
return | return | ||||
src_swhid = swhid(object_type=src_type, object_id=src_id) | src_swhid = swhid(object_type=src_type, object_id=src_id) | ||||
dst_swhid = swhid(object_type=dst_type, object_id=dst_id) | dst_swhid = swhid(object_type=dst_type, object_id=dst_id) | ||||
edge_line = " ".join([src_swhid, dst_swhid] + (labels if labels else [])) | edge_line = " ".join([src_swhid, dst_swhid] + (labels if labels else [])) | ||||
edge_writer = self.get_edge_writer_for(src_type) | edge_writer = self.get_edge_writer_for(src_type) | ||||
edge_writer.write("{}\n".format(edge_line)) | edge_writer.write("{}\n".format(edge_line)) | ||||
def process_origin(self, origin): | def process_origin(self, origin): | ||||
origin_id = origin_identifier({"url": origin["url"]}) | origin_id = hash_to_bytes(origin_identifier({"url": origin["url"]})) | ||||
self.write_node(("origin", origin_id)) | self.write_node(("origin", origin_id)) | ||||
def process_origin_visit_status(self, visit_status): | def process_origin_visit_status(self, visit_status): | ||||
origin_id = origin_identifier({"url": visit_status["origin"]}) | origin_id = hash_to_bytes(origin_identifier({"url": visit_status["origin"]})) | ||||
self.write_edge(("origin", origin_id), ("snapshot", visit_status["snapshot"])) | self.write_edge(("origin", origin_id), ("snapshot", visit_status["snapshot"])) | ||||
def process_snapshot(self, snapshot): | def process_snapshot(self, snapshot): | ||||
if self.config.get("remove_pull_requests"): | if self.config.get("remove_pull_requests"): | ||||
remove_pull_requests(snapshot) | remove_pull_requests(snapshot) | ||||
self.write_node(("snapshot", snapshot["id"])) | self.write_node(("snapshot", snapshot["id"])) | ||||
for branch_name, branch in snapshot["branches"].items(): | for branch_name, branch in snapshot["branches"].items(): | ||||
▲ Show 20 Lines • Show All 125 Lines • Show Last 20 Lines |