Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/exporters/orc.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import uuid | import uuid | ||||
from pyorc import BigInt, Binary, Int, SmallInt, String, Struct, Timestamp, Writer | from pyorc import BigInt, Binary, Int, SmallInt, String, Struct, Timestamp, Writer | ||||
from swh.dataset.exporter import ExporterDispatch | from swh.dataset.exporter import ExporterDispatch | ||||
from swh.dataset.relational import TABLES | |||||
from swh.dataset.utils import remove_pull_requests | from swh.dataset.utils import remove_pull_requests | ||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
# fmt: off | ORC_TYPE_MAP = { | ||||
"string": String, | |||||
"smallint": SmallInt, | |||||
"int": Int, | |||||
"bigint": BigInt, | |||||
"timestamp": Timestamp, | |||||
"binary": Binary, | |||||
} | |||||
EXPORT_SCHEMA = { | EXPORT_SCHEMA = { | ||||
'origin': Struct( | table_name: Struct( | ||||
url=String() | **{ | ||||
), | column_name: ORC_TYPE_MAP[column_type]() | ||||
'origin_visit': Struct( | for column_name, column_type in columns | ||||
origin=String(), | } | ||||
visit=BigInt(), | ) | ||||
date=Timestamp(), | for table_name, columns in TABLES.items() | ||||
type=String(), | |||||
), | |||||
'origin_visit_status': Struct( | |||||
origin=String(), | |||||
visit=BigInt(), | |||||
date=Timestamp(), | |||||
status=String(), | |||||
snapshot=String(), | |||||
), | |||||
'snapshot': Struct( | |||||
id=String(), | |||||
), | |||||
'snapshot_branch': Struct( | |||||
snapshot_id=String(), | |||||
name=Binary(), | |||||
target=String(), | |||||
target_type=String(), | |||||
), | |||||
'release': Struct( | |||||
id=String(), | |||||
name=Binary(), | |||||
message=Binary(), | |||||
target=String(), | |||||
target_type=String(), | |||||
author=Binary(), | |||||
date=Timestamp(), | |||||
date_offset=SmallInt(), | |||||
), | |||||
'revision': Struct( | |||||
id=String(), | |||||
message=Binary(), | |||||
author=Binary(), | |||||
date=Timestamp(), | |||||
date_offset=SmallInt(), | |||||
committer=Binary(), | |||||
committer_date=Timestamp(), | |||||
committer_offset=SmallInt(), | |||||
directory=String(), | |||||
), | |||||
'directory': Struct( | |||||
id=String(), | |||||
), | |||||
'directory_entry': Struct( | |||||
directory_id=String(), | |||||
name=Binary(), | |||||
type=String(), | |||||
target=String(), | |||||
perms=Int(), | |||||
), | |||||
'content': Struct( | |||||
sha1=String(), | |||||
sha1_git=String(), | |||||
sha256=String(), | |||||
blake2s256=String(), | |||||
length=BigInt(), | |||||
status=String(), | |||||
), | |||||
'skipped_content': Struct( | |||||
sha1=String(), | |||||
sha1_git=String(), | |||||
sha256=String(), | |||||
blake2s256=String(), | |||||
length=BigInt(), | |||||
status=String(), | |||||
reason=String(), | |||||
), | |||||
} | } | ||||
# fmt: on | |||||
def hash_to_hex_or_none(hash): | def hash_to_hex_or_none(hash): | ||||
return hash_to_hex(hash) if hash is not None else None | return hash_to_hex(hash) if hash is not None else None | ||||
def swh_date_to_datetime(obj): | def swh_date_to_datetime(obj): | ||||
if obj is None or obj["timestamp"] is None: | if obj is None or obj["timestamp"] is None: | ||||
▲ Show 20 Lines • Show All 152 Lines • Show Last 20 Lines |