diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -9,86 +9,30 @@
 from pyorc import BigInt, Binary, Int, SmallInt, String, Struct, Timestamp, Writer
 
 from swh.dataset.exporter import ExporterDispatch
+from swh.dataset.relational import TABLES
 from swh.dataset.utils import remove_pull_requests
 from swh.model.hashutil import hash_to_hex
 
-# fmt: off
+# Column type names used in TABLES, mapped to the matching pyorc type classes.
+ORC_TYPE_MAP = {
+    "string": String,
+    "smallint": SmallInt,
+    "int": Int,
+    "bigint": BigInt,
+    "timestamp": Timestamp,
+    "binary": Binary,
+}
+
+# One pyorc Struct per table, with one field per (column name, type) pair.
 EXPORT_SCHEMA = {
-    'origin': Struct(
-        url=String()
-    ),
-    'origin_visit': Struct(
-        origin=String(),
-        visit=BigInt(),
-        date=Timestamp(),
-        type=String(),
-    ),
-    'origin_visit_status': Struct(
-        origin=String(),
-        visit=BigInt(),
-        date=Timestamp(),
-        status=String(),
-        snapshot=String(),
-    ),
-    'snapshot': Struct(
-        id=String(),
-    ),
-    'snapshot_branch': Struct(
-        snapshot_id=String(),
-        name=Binary(),
-        target=String(),
-        target_type=String(),
-    ),
-    'release': Struct(
-        id=String(),
-        name=Binary(),
-        message=Binary(),
-        target=String(),
-        target_type=String(),
-        author=Binary(),
-        date=Timestamp(),
-        date_offset=SmallInt(),
-    ),
-    'revision': Struct(
-        id=String(),
-        message=Binary(),
-        author=Binary(),
-        date=Timestamp(),
-        date_offset=SmallInt(),
-        committer=Binary(),
-        committer_date=Timestamp(),
-        committer_offset=SmallInt(),
-        directory=String(),
-    ),
-    'directory': Struct(
-        id=String(),
-    ),
-    'directory_entry': Struct(
-        directory_id=String(),
-        name=Binary(),
-        type=String(),
-        target=String(),
-        perms=Int(),
-    ),
-    'content': Struct(
-        sha1=String(),
-        sha1_git=String(),
-        sha256=String(),
-        blake2s256=String(),
-        length=BigInt(),
-        status=String(),
-    ),
-    'skipped_content': Struct(
-        sha1=String(),
-        sha1_git=String(),
-        sha256=String(),
-        blake2s256=String(),
-        length=BigInt(),
-        status=String(),
-        reason=String(),
-    ),
+    table_name: Struct(
+        **{
+            column_name: ORC_TYPE_MAP[column_type]()
+            for column_name, column_type in columns
+        }
+    )
+    for table_name, columns in TABLES.items()
 }
-# fmt: on
 
 
 def hash_to_hex_or_none(hash):
diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py
new file mode 100644
--- /dev/null
+++ b/swh/dataset/relational.py
@@ -0,0 +1,80 @@
+# Relational description of the exported tables: each table name maps to its
+# ordered list of (column name, column type name) pairs. The ORC exporter
+# resolves the type names through ORC_TYPE_MAP in swh/dataset/exporters/orc.py.
+# fmt: off
+TABLES = {
+    "origin": [
+        ("url", "string"),
+    ],
+    "origin_visit": [
+        ("origin", "string"),
+        ("visit", "bigint"),
+        ("date", "timestamp"),
+        ("type", "string"),
+    ],
+    "origin_visit_status": [
+        ("origin", "string"),
+        ("visit", "bigint"),
+        ("date", "timestamp"),
+        ("status", "string"),
+        ("snapshot", "string"),
+    ],
+    "snapshot": [
+        ("id", "string"),
+    ],
+    "snapshot_branch": [
+        ("snapshot_id", "string"),
+        ("name", "binary"),
+        ("target", "string"),
+        ("target_type", "string"),
+    ],
+    "release": [
+        ("id", "string"),
+        ("name", "binary"),
+        ("message", "binary"),
+        ("target", "string"),
+        ("target_type", "string"),
+        ("author", "binary"),
+        ("date", "timestamp"),
+        ("date_offset", "smallint"),
+    ],
+    "revision": [
+        ("id", "string"),
+        ("message", "binary"),
+        ("author", "binary"),
+        ("date", "timestamp"),
+        ("date_offset", "smallint"),
+        ("committer", "binary"),
+        ("committer_date", "timestamp"),
+        ("committer_offset", "smallint"),
+        ("directory", "string"),
+    ],
+    "directory": [
+        ("id", "string"),
+    ],
+    "directory_entry": [
+        ("directory_id", "string"),
+        ("name", "binary"),
+        ("type", "string"),
+        ("target", "string"),
+        ("perms", "int"),
+    ],
+    "content": [
+        ("sha1", "string"),
+        ("sha1_git", "string"),
+        ("sha256", "string"),
+        ("blake2s256", "string"),
+        ("length", "bigint"),
+        ("status", "string"),
+    ],
+    "skipped_content": [
+        ("sha1", "string"),
+        ("sha1_git", "string"),
+        ("sha256", "string"),
+        ("blake2s256", "string"),
+        ("length", "bigint"),
+        ("status", "string"),
+        ("reason", "string"),
+    ],
+}
+# fmt: on