diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ # Add here internal Software Heritage dependencies, one per line. -swh.core[http] >= 0.3 +swh.core[http] >= 2 swh.journal -swh.model >= 1.0.0 +swh.model >= 4.3 swh.storage diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py --- a/swh/dataset/exporters/orc.py +++ b/swh/dataset/exporters/orc.py @@ -3,8 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime +from datetime import datetime +import math +from typing import Optional, Tuple, Type, cast import uuid +import zoneinfo from pyorc import ( BigInt, @@ -15,13 +18,16 @@ String, Struct, Timestamp, + TypeKind, Writer, ) +from pyorc.converters import ORCConverter from swh.dataset.exporter import ExporterDispatch from swh.dataset.relational import TABLES from swh.dataset.utils import remove_pull_requests from swh.model.hashutil import hash_to_hex +from swh.model.model import TimestampWithTimezone ORC_TYPE_MAP = { "string": String, @@ -47,21 +53,51 @@ return hash_to_hex(hash) if hash is not None else None -def swh_date_to_datetime(obj): +def swh_date_to_tuple(obj): if obj is None or obj["timestamp"] is None: - return None - return datetime.datetime( - 1970, 1, 1, tzinfo=datetime.timezone.utc - ) + datetime.timedelta( - seconds=obj["timestamp"]["seconds"], - microseconds=obj["timestamp"]["microseconds"], + return (None, None, None) + offset_bytes = obj.get("offset_bytes") + if offset_bytes is None: + offset = obj.get("offset", 0) + negative = offset < 0 or obj.get("negative_utc", False) + (hours, minutes) = divmod(abs(offset), 60) + offset_bytes = f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode() + else: + offset = TimestampWithTimezone._parse_offset_bytes(offset_bytes) + return ( + (obj["timestamp"]["seconds"], obj["timestamp"]["microseconds"]), + offset, + offset_bytes, ) 
-def swh_date_to_offset(obj): +def datetime_to_tuple(obj: Optional[datetime]) -> Optional[Tuple[int, int]]: if obj is None: return None - return obj["offset"] + return (math.floor(obj.timestamp()), obj.microsecond) + + +class SWHTimestampConverter: + """This is an ORCConverter compatible class to convert timestamps from/to ORC files + + timestamps in python are given as a couple (seconds, microseconds) and are + serialized as a couple (seconds, nanoseconds) in the ORC file. + + """ + + @staticmethod + def from_orc( + seconds: int, nanoseconds: int, timezone: zoneinfo.ZoneInfo, + ) -> Tuple[int, int]: + return (seconds, nanoseconds // 1000) + + @staticmethod + def to_orc( + obj: Optional[Tuple[int, int]], timezone: zoneinfo.ZoneInfo + ) -> Optional[Tuple[int, int]]: + if obj is None: + return None + return (obj[0], obj[1] * 1000 if obj[1] is not None else None) class ORCExporter(ExporterDispatch): @@ -87,6 +123,11 @@ export_obj, EXPORT_SCHEMA[table_name], compression=CompressionKind.ZSTD, + converters={ + TypeKind.TIMESTAMP: cast( + Type[ORCConverter], SWHTimestampConverter + ) + }, ) ) return self.writers[table_name] @@ -98,7 +139,12 @@ def process_origin_visit(self, visit): origin_visit_writer = self.get_writer_for("origin_visit") origin_visit_writer.write( - (visit["origin"], visit["visit"], visit["date"], visit["type"],) + ( + visit["origin"], + visit["visit"], + datetime_to_tuple(visit["date"]), + visit["type"], + ) ) def process_origin_visit_status(self, visit_status): @@ -107,7 +153,7 @@ ( visit_status["origin"], visit_status["visit"], - visit_status["date"], + datetime_to_tuple(visit_status["date"]), visit_status["status"], hash_to_hex_or_none(visit_status["snapshot"]), ) @@ -142,8 +188,7 @@ hash_to_hex_or_none(release["target"]), release["target_type"], (release.get("author") or {}).get("fullname"), - swh_date_to_datetime(release["date"]), - swh_date_to_offset(release["date"]), + *swh_date_to_tuple(release["date"]), ) ) @@ -154,11 +199,9 @@ 
hash_to_hex_or_none(revision["id"]), revision["message"], revision["author"]["fullname"], - swh_date_to_datetime(revision["date"]), - swh_date_to_offset(revision["date"]), + *swh_date_to_tuple(revision["date"]), revision["committer"]["fullname"], - swh_date_to_datetime(revision["committer_date"]), - swh_date_to_offset(revision["committer_date"]), + *swh_date_to_tuple(revision["committer_date"]), hash_to_hex_or_none(revision["directory"]), ) ) diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py --- a/swh/dataset/relational.py +++ b/swh/dataset/relational.py @@ -37,18 +37,21 @@ ("target", "string"), ("target_type", "string"), ("author", "binary"), - ("date", "timestamp"), - ("date_offset", "smallint"), + ("date_timestamp", "timestamp"), + ("date_offset", "bigint"), + ("date_raw_offset_bytes", "binary"), ], "revision": [ ("id", "string"), ("message", "binary"), ("author", "binary"), - ("date", "timestamp"), - ("date_offset", "smallint"), + ("date_timestamp", "timestamp"), + ("date_offset", "bigint"), + ("date_raw_offset_bytes", "binary"), ("committer", "binary"), - ("committer_date", "timestamp"), - ("committer_offset", "smallint"), + ("committer_date_timestamp", "timestamp"), + ("committer_date_offset", "bigint"), + ("committer_date_raw_offset_bytes", "binary"), ("directory", "string"), ], "revision_history": [ diff --git a/swh/dataset/test/test_orc.py b/swh/dataset/test/test_orc.py --- a/swh/dataset/test/test_orc.py +++ b/swh/dataset/test/test_orc.py @@ -7,9 +7,10 @@ from swh.dataset.exporters.orc import ( ORCExporter, + SWHTimestampConverter, + datetime_to_tuple, hash_to_hex_or_none, - swh_date_to_datetime, - swh_date_to_offset, + swh_date_to_tuple, ) from swh.model.tests.swh_model_data import TEST_OBJECTS @@ -29,7 +30,10 @@ for obj_type_dir in tmppath.iterdir(): for orc_file in obj_type_dir.iterdir(): with orc_file.open("rb") as orc_obj: - res[obj_type_dir.name] |= set(pyorc.Reader(orc_obj)) + res[obj_type_dir.name] |= set(pyorc.Reader( + orc_obj, + 
converters={pyorc.TypeKind.TIMESTAMP: SWHTimestampConverter} + )) return res return wrapped @@ -46,7 +50,9 @@ obj_type = "origin_visit" output = exporter({obj_type: TEST_OBJECTS[obj_type]}) for obj in TEST_OBJECTS[obj_type]: - assert (obj.origin, obj.visit, obj.date, obj.type) in output[obj_type] + assert (obj.origin, obj.visit, datetime_to_tuple(obj.date), obj.type) in output[ + obj_type + ] def test_export_origin_visit_status(exporter): @@ -56,7 +62,7 @@ assert ( obj.origin, obj.visit, - obj.date, + datetime_to_tuple(obj.date), obj.status, hash_to_hex_or_none(obj.snapshot), ) in output[obj_type] @@ -82,15 +88,14 @@ obj_type = "release" output = exporter({obj_type: TEST_OBJECTS[obj_type]}) for obj in TEST_OBJECTS[obj_type]: assert ( hash_to_hex_or_none(obj.id), obj.name, obj.message, hash_to_hex_or_none(obj.target), obj.target_type.value, obj.author.fullname if obj.author else None, - swh_date_to_datetime(obj.date.to_dict()) if obj.date else None, - swh_date_to_offset(obj.date.to_dict()) if obj.date else None, + *swh_date_to_tuple(obj.date.to_dict() if obj.date is not None else None), ) in output[obj_type] @@ -102,11 +107,11 @@ hash_to_hex_or_none(obj.id), obj.message, obj.author.fullname, - swh_date_to_datetime(obj.date.to_dict()), - swh_date_to_offset(obj.date.to_dict()), + *swh_date_to_tuple(obj.date.to_dict() if obj.date else None), obj.committer.fullname, - swh_date_to_datetime(obj.committer_date.to_dict()), - swh_date_to_offset(obj.committer_date.to_dict()), + *swh_date_to_tuple( + obj.committer_date.to_dict() if obj.committer_date else None + ), hash_to_hex_or_none(obj.directory), ) in output["revision"] for i, parent in enumerate(obj.parents): @@ -159,3 +164,31 @@ obj.status, obj.reason, ) in output[obj_type] + + +def test_date_to_tuple(): + ts = {"seconds": 123456, "microseconds": 1515} + assert swh_date_to_tuple({"timestamp": ts, "offset_bytes": b"+0100"}) == ( + (123456, 1515), + 60, + b"+0100", + ) + + assert swh_date_to_tuple( + { +
"timestamp": ts, + "offset": 120, + "negative_utc": False, + "offset_bytes": b"+0100", + } + ) == ((123456, 1515), 60, b"+0100") + + assert swh_date_to_tuple( + {"timestamp": ts, "offset": 120, "negative_utc": False,} + ) == ((123456, 1515), 120, b"+0200") + + assert swh_date_to_tuple({"timestamp": ts, "offset": 0, "negative_utc": True,}) == ( + (123456, 1515), + 0, + b"-0000", + )