diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 # Add here internal Software Heritage dependencies, one per line.
-swh.core[http] >= 0.3
+swh.core[http] >= 2
 swh.journal
-swh.model >= 1.0.0
+swh.model >= 5
 swh.storage
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -3,7 +3,6 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import datetime
 import uuid
 
 from pyorc import (
@@ -22,6 +21,7 @@
 from swh.dataset.relational import TABLES
 from swh.dataset.utils import remove_pull_requests
 from swh.model.hashutil import hash_to_hex
+import swh.model.model as swhmodel
 
 ORC_TYPE_MAP = {
     "string": String,
@@ -47,23 +47,17 @@
     return hash_to_hex(hash) if hash is not None else None
 
 
-def swh_date_to_datetime(obj):
+def swh_date_to_tuple(obj):
     if obj is None or obj["timestamp"] is None:
-        return None
-    return datetime.datetime(
-        1970, 1, 1, tzinfo=datetime.timezone.utc
-    ) + datetime.timedelta(
-        seconds=obj["timestamp"]["seconds"],
-        microseconds=obj["timestamp"]["microseconds"],
+        return (None, None, None)
+    ts = swhmodel.TimestampWithTimezone.from_dict(obj)
+    return (
+        ts.timestamp.seconds,
+        ts.timestamp.microseconds,
+        ts.offset_bytes,
     )
 
 
-def swh_date_to_offset(obj):
-    if obj is None:
-        return None
-    return obj["offset"]
-
-
 class ORCExporter(ExporterDispatch):
     """
     Implementation of an exporter which writes the entire graph dataset as
@@ -142,8 +136,7 @@
                 hash_to_hex_or_none(release["target"]),
                 release["target_type"],
                 (release.get("author") or {}).get("fullname"),
-                swh_date_to_datetime(release["date"]),
-                swh_date_to_offset(release["date"]),
+                *swh_date_to_tuple(release["date"]),
             )
         )
 
@@ -154,11 +147,9 @@
                 hash_to_hex_or_none(revision["id"]),
                 revision["message"],
                 revision["author"]["fullname"],
-                swh_date_to_datetime(revision["date"]),
-                swh_date_to_offset(revision["date"]),
+                *swh_date_to_tuple(revision["date"]),
                 revision["committer"]["fullname"],
-                swh_date_to_datetime(revision["committer_date"]),
-                swh_date_to_offset(revision["committer_date"]),
+                *swh_date_to_tuple(revision["committer_date"]),
                 hash_to_hex_or_none(revision["directory"]),
             )
         )
diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py
--- a/swh/dataset/relational.py
+++ b/swh/dataset/relational.py
@@ -37,18 +37,21 @@
         ("target", "string"),
         ("target_type", "string"),
         ("author", "binary"),
-        ("date", "timestamp"),
-        ("date_offset", "smallint"),
+        ("date_seconds", "bigint"),
+        ("date_microseconds", "int"),
+        ("date_offset_bytes", "binary"),
     ],
     "revision": [
         ("id", "string"),
         ("message", "binary"),
         ("author", "binary"),
-        ("date", "timestamp"),
-        ("date_offset", "smallint"),
+        ("date_seconds", "bigint"),
+        ("date_microseconds", "int"),
+        ("date_offset_bytes", "binary"),
         ("committer", "binary"),
-        ("committer_date", "timestamp"),
-        ("committer_offset", "smallint"),
+        ("committer_date_seconds", "bigint"),
+        ("committer_date_microseconds", "int"),
+        ("committer_date_offset_bytes", "binary"),
         ("directory", "string"),
     ],
     "revision_history": [
diff --git a/swh/dataset/test/test_orc.py b/swh/dataset/test/test_orc.py
--- a/swh/dataset/test/test_orc.py
+++ b/swh/dataset/test/test_orc.py
@@ -8,8 +8,7 @@
 from swh.dataset.exporters.orc import (
     ORCExporter,
     hash_to_hex_or_none,
-    swh_date_to_datetime,
-    swh_date_to_offset,
+    swh_date_to_tuple,
 )
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 
@@ -89,8 +88,7 @@
             hash_to_hex_or_none(obj.target),
             obj.target_type.value,
             obj.author.fullname if obj.author else None,
-            swh_date_to_datetime(obj.date.to_dict()) if obj.date else None,
-            swh_date_to_offset(obj.date.to_dict()) if obj.date else None,
+            *swh_date_to_tuple(obj.date.to_dict() if obj.date is not None else None),
         ) in output[obj_type]
 
 
@@ -102,11 +100,11 @@
             hash_to_hex_or_none(obj.id),
             obj.message,
             obj.author.fullname,
-            swh_date_to_datetime(obj.date.to_dict()),
-            swh_date_to_offset(obj.date.to_dict()),
+            *swh_date_to_tuple(obj.date.to_dict() if obj.date else None),
             obj.committer.fullname,
-            swh_date_to_datetime(obj.committer_date.to_dict()),
-            swh_date_to_offset(obj.committer_date.to_dict()),
+            *swh_date_to_tuple(
+                obj.committer_date.to_dict()
+                if obj.committer_date else None),
             hash_to_hex_or_none(obj.directory),
         ) in output["revision"]
         for i, parent in enumerate(obj.parents):