Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/exporters/orc.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import uuid | import uuid | ||||
from pyorc import BigInt, Binary, Int, SmallInt, String, Struct, Timestamp, Writer | from pyorc import ( | ||||
BigInt, | |||||
Binary, | |||||
CompressionKind, | |||||
Int, | |||||
SmallInt, | |||||
String, | |||||
Struct, | |||||
Timestamp, | |||||
Writer, | |||||
) | |||||
from swh.dataset.exporter import ExporterDispatch | from swh.dataset.exporter import ExporterDispatch | ||||
from swh.dataset.relational import TABLES | from swh.dataset.relational import TABLES | ||||
from swh.dataset.utils import remove_pull_requests | from swh.dataset.utils import remove_pull_requests | ||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
ORC_TYPE_MAP = { | ORC_TYPE_MAP = { | ||||
"string": String, | "string": String, | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | class ORCExporter(ExporterDispatch): | ||||
def get_writer_for(self, table_name: str): | def get_writer_for(self, table_name: str): | ||||
if table_name not in self.writers: | if table_name not in self.writers: | ||||
object_type_dir = self.export_path / table_name | object_type_dir = self.export_path / table_name | ||||
object_type_dir.mkdir(exist_ok=True) | object_type_dir.mkdir(exist_ok=True) | ||||
unique_id = str(uuid.uuid4()) | unique_id = str(uuid.uuid4()) | ||||
export_file = object_type_dir / ("graph-{}.orc".format(unique_id)) | export_file = object_type_dir / ("graph-{}.orc".format(unique_id)) | ||||
export_obj = self.exit_stack.enter_context(export_file.open("wb")) | export_obj = self.exit_stack.enter_context(export_file.open("wb")) | ||||
self.writers[table_name] = self.exit_stack.enter_context( | self.writers[table_name] = self.exit_stack.enter_context( | ||||
Writer(export_obj, EXPORT_SCHEMA[table_name]) | Writer( | ||||
export_obj, | |||||
EXPORT_SCHEMA[table_name], | |||||
compression=CompressionKind.ZSTD, | |||||
) | |||||
) | ) | ||||
return self.writers[table_name] | return self.writers[table_name] | ||||
def process_origin(self, origin): | def process_origin(self, origin): | ||||
origin_writer = self.get_writer_for("origin") | origin_writer = self.get_writer_for("origin") | ||||
origin_writer.write((origin["url"],)) | origin_writer.write((origin["url"],)) | ||||
def process_origin_visit(self, visit): | def process_origin_visit(self, visit): | ||||
▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines |