Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9123580
D7379.id26772.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D7379.id26772.diff
View Options
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[http] >= 0.3
+swh.core[http] >= 2
swh.journal
-swh.model >= 1.0.0
+swh.model >= 4.3
swh.storage
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -3,8 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import datetime
+from datetime import datetime
+import math
+from typing import Optional, Tuple, Type, cast
import uuid
+import zoneinfo
from pyorc import (
BigInt,
@@ -15,13 +18,16 @@
String,
Struct,
Timestamp,
+ TypeKind,
Writer,
)
+from pyorc.converters import ORCConverter
from swh.dataset.exporter import ExporterDispatch
from swh.dataset.relational import TABLES
from swh.dataset.utils import remove_pull_requests
from swh.model.hashutil import hash_to_hex
+from swh.model.model import TimestampWithTimezone
ORC_TYPE_MAP = {
"string": String,
@@ -47,21 +53,51 @@
return hash_to_hex(hash) if hash is not None else None
-def swh_date_to_datetime(obj):
+def swh_date_to_tuple(obj):
if obj is None or obj["timestamp"] is None:
- return None
- return datetime.datetime(
- 1970, 1, 1, tzinfo=datetime.timezone.utc
- ) + datetime.timedelta(
- seconds=obj["timestamp"]["seconds"],
- microseconds=obj["timestamp"]["microseconds"],
+ return (None, None, None)
+ offset_bytes = obj.get("offset_bytes")
+ if offset_bytes is None:
+ offset = obj.get("offset", 0)
+ negative = offset < 0 or obj.get("negative_utc", False)
+ (hours, minutes) = divmod(abs(offset), 60)
+ offset_bytes = f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode()
+ else:
+ offset = TimestampWithTimezone._parse_offset_bytes(offset_bytes)
+ return (
+ (obj["timestamp"]["seconds"], obj["timestamp"]["microseconds"]),
+ offset,
+ offset_bytes,
)
-def swh_date_to_offset(obj):
+def datetime_to_tuple(obj: Optional[datetime]) -> Optional[Tuple[int, int]]:
if obj is None:
return None
- return obj["offset"]
+ return (math.floor(obj.timestamp()), obj.microsecond)
+
+
+class SWHTimestampConverter:
+ """This is an ORCConverter compatible class to convert timestamps from/to ORC files
+
+ Timestamps in Python are given as a pair (seconds, microseconds) and are
+ serialized as a pair (seconds, nanoseconds) in the ORC file.
+
+ """
+
+ @staticmethod
+ def from_orc(
+ seconds: int, nanoseconds: int, timezone: zoneinfo.ZoneInfo,
+ ) -> Tuple[int, int]:
+ return (seconds, nanoseconds // 1000)
+
+ @staticmethod
+ def to_orc(
+ obj: Optional[Tuple[int, int]], timezone: zoneinfo.ZoneInfo
+ ) -> Optional[Tuple[int, int]]:
+ if obj is None:
+ return None
+ return (obj[0], obj[1] * 1000 if obj[1] is not None else None)
class ORCExporter(ExporterDispatch):
@@ -87,6 +123,11 @@
export_obj,
EXPORT_SCHEMA[table_name],
compression=CompressionKind.ZSTD,
+ converters={
+ TypeKind.TIMESTAMP: cast(
+ Type[ORCConverter], SWHTimestampConverter
+ )
+ },
)
)
return self.writers[table_name]
@@ -98,7 +139,12 @@
def process_origin_visit(self, visit):
origin_visit_writer = self.get_writer_for("origin_visit")
origin_visit_writer.write(
- (visit["origin"], visit["visit"], visit["date"], visit["type"],)
+ (
+ visit["origin"],
+ visit["visit"],
+ datetime_to_tuple(visit["date"]),
+ visit["type"],
+ )
)
def process_origin_visit_status(self, visit_status):
@@ -107,7 +153,7 @@
(
visit_status["origin"],
visit_status["visit"],
- visit_status["date"],
+ datetime_to_tuple(visit_status["date"]),
visit_status["status"],
hash_to_hex_or_none(visit_status["snapshot"]),
)
@@ -142,8 +188,7 @@
hash_to_hex_or_none(release["target"]),
release["target_type"],
(release.get("author") or {}).get("fullname"),
- swh_date_to_datetime(release["date"]),
- swh_date_to_offset(release["date"]),
+ *swh_date_to_tuple(release["date"]),
)
)
@@ -154,11 +199,9 @@
hash_to_hex_or_none(revision["id"]),
revision["message"],
revision["author"]["fullname"],
- swh_date_to_datetime(revision["date"]),
- swh_date_to_offset(revision["date"]),
+ *swh_date_to_tuple(revision["date"]),
revision["committer"]["fullname"],
- swh_date_to_datetime(revision["committer_date"]),
- swh_date_to_offset(revision["committer_date"]),
+ *swh_date_to_tuple(revision["committer_date"]),
hash_to_hex_or_none(revision["directory"]),
)
)
diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py
--- a/swh/dataset/relational.py
+++ b/swh/dataset/relational.py
@@ -37,18 +37,21 @@
("target", "string"),
("target_type", "string"),
("author", "binary"),
- ("date", "timestamp"),
- ("date_offset", "smallint"),
+ ("date_timestamp", "timestamp"),
+ ("date_offset", "bigint"),
+ ("date_raw_offset_bytes", "binary"),
],
"revision": [
("id", "string"),
("message", "binary"),
("author", "binary"),
- ("date", "timestamp"),
- ("date_offset", "smallint"),
+ ("date_timestamp", "timestamp"),
+ ("date_offset", "bigint"),
+ ("date_raw_offset_bytes", "binary"),
("committer", "binary"),
- ("committer_date", "timestamp"),
- ("committer_offset", "smallint"),
+ ("committer_date_timestamp", "timestamp"),
+ ("committer_date_offset", "bigint"),
+ ("committer_date_raw_offset_bytes", "binary"),
("directory", "string"),
],
"revision_history": [
diff --git a/swh/dataset/test/test_orc.py b/swh/dataset/test/test_orc.py
--- a/swh/dataset/test/test_orc.py
+++ b/swh/dataset/test/test_orc.py
@@ -7,9 +7,10 @@
from swh.dataset.exporters.orc import (
ORCExporter,
+ SWHTimestampConverter,
+ datetime_to_tuple,
hash_to_hex_or_none,
- swh_date_to_datetime,
- swh_date_to_offset,
+ swh_date_to_tuple,
)
from swh.model.tests.swh_model_data import TEST_OBJECTS
@@ -29,7 +30,10 @@
for obj_type_dir in tmppath.iterdir():
for orc_file in obj_type_dir.iterdir():
with orc_file.open("rb") as orc_obj:
- res[obj_type_dir.name] |= set(pyorc.Reader(orc_obj))
+ res[obj_type_dir.name] |= set(pyorc.Reader(
+ orc_obj,
+ converters={pyorc.TypeKind.TIMESTAMP: SWHTimestampConverter}
+ ))
return res
return wrapped
@@ -46,7 +50,9 @@
obj_type = "origin_visit"
output = exporter({obj_type: TEST_OBJECTS[obj_type]})
for obj in TEST_OBJECTS[obj_type]:
- assert (obj.origin, obj.visit, obj.date, obj.type) in output[obj_type]
+ assert (obj.origin, obj.visit, datetime_to_tuple(obj.date), obj.type) in output[
+ obj_type
+ ]
def test_export_origin_visit_status(exporter):
@@ -56,7 +62,7 @@
assert (
obj.origin,
obj.visit,
- obj.date,
+ datetime_to_tuple(obj.date),
obj.status,
hash_to_hex_or_none(obj.snapshot),
) in output[obj_type]
@@ -82,15 +88,14 @@
obj_type = "release"
output = exporter({obj_type: TEST_OBJECTS[obj_type]})
for obj in TEST_OBJECTS[obj_type]:
- assert (
+ assert (
hash_to_hex_or_none(obj.id),
obj.name,
obj.message,
hash_to_hex_or_none(obj.target),
obj.target_type.value,
obj.author.fullname if obj.author else None,
- swh_date_to_datetime(obj.date.to_dict()) if obj.date else None,
- swh_date_to_offset(obj.date.to_dict()) if obj.date else None,
+ *swh_date_to_tuple(obj.date.to_dict() if obj.date is not None else None),
) in output[obj_type]
@@ -102,11 +107,11 @@
hash_to_hex_or_none(obj.id),
obj.message,
obj.author.fullname,
- swh_date_to_datetime(obj.date.to_dict()),
- swh_date_to_offset(obj.date.to_dict()),
+ *swh_date_to_tuple(obj.date.to_dict() if obj.date else None),
obj.committer.fullname,
- swh_date_to_datetime(obj.committer_date.to_dict()),
- swh_date_to_offset(obj.committer_date.to_dict()),
+ *swh_date_to_tuple(
+ obj.committer_date.to_dict() if obj.committer_date else None
+ ),
hash_to_hex_or_none(obj.directory),
) in output["revision"]
for i, parent in enumerate(obj.parents):
@@ -159,3 +164,31 @@
obj.status,
obj.reason,
) in output[obj_type]
+
+
+def test_date_to_tuple():
+ ts = {"seconds": 123456, "microseconds": 1515}
+ assert swh_date_to_tuple({"timestamp": ts, "offset_bytes": b"+0100"}) == (
+ (123456, 1515),
+ 60,
+ b"+0100",
+ )
+
+ assert swh_date_to_tuple(
+ {
+ "timestamp": ts,
+ "offset": 120,
+ "negative_utc": False,
+ "offset_bytes": b"+0100",
+ }
+ ) == ((123456, 1515), 60, b"+0100")
+
+ assert swh_date_to_tuple(
+ {"timestamp": ts, "offset": 120, "negative_utc": False,}
+ ) == ((123456, 1515), 120, b"+0200")
+
+ assert swh_date_to_tuple({"timestamp": ts, "offset": 0, "negative_utc": True,}) == (
+ (123456, 1515),
+ 0,
+ b"-0000",
+ )
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jun 20, 5:43 PM (2 w, 4 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220607
Attached To
D7379: Encode TimestampWithTimezone as (sec, usec, offset) in ORC file
Event Timeline
Log In to Comment