Page MenuHomeSoftware Heritage

D7379.id26772.diff
No OneTemporary

D7379.id26772.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[http] >= 0.3
+swh.core[http] >= 2
swh.journal
-swh.model >= 1.0.0
+swh.model >= 4.3
swh.storage
diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -3,8 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import datetime
+from datetime import datetime
+import math
+from typing import Optional, Tuple, Type, cast
import uuid
+import zoneinfo
from pyorc import (
BigInt,
@@ -15,13 +18,16 @@
String,
Struct,
Timestamp,
+ TypeKind,
Writer,
)
+from pyorc.converters import ORCConverter
from swh.dataset.exporter import ExporterDispatch
from swh.dataset.relational import TABLES
from swh.dataset.utils import remove_pull_requests
from swh.model.hashutil import hash_to_hex
+from swh.model.model import TimestampWithTimezone
ORC_TYPE_MAP = {
"string": String,
@@ -47,21 +53,51 @@
return hash_to_hex(hash) if hash is not None else None
-def swh_date_to_datetime(obj):
+def swh_date_to_tuple(obj):
if obj is None or obj["timestamp"] is None:
- return None
- return datetime.datetime(
- 1970, 1, 1, tzinfo=datetime.timezone.utc
- ) + datetime.timedelta(
- seconds=obj["timestamp"]["seconds"],
- microseconds=obj["timestamp"]["microseconds"],
+ return (None, None, None)
+ offset_bytes = obj.get("offset_bytes")
+ if offset_bytes is None:
+ offset = obj.get("offset", 0)
+ negative = offset < 0 or obj.get("negative_utc", False)
+ (hours, minutes) = divmod(abs(offset), 60)
+ offset_bytes = f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode()
+ else:
+ offset = TimestampWithTimezone._parse_offset_bytes(offset_bytes)
+ return (
+ (obj["timestamp"]["seconds"], obj["timestamp"]["microseconds"]),
+ offset,
+ offset_bytes,
)
-def swh_date_to_offset(obj):
+def datetime_to_tuple(obj: Optional[datetime]) -> Optional[Tuple[int, int]]:
if obj is None:
return None
- return obj["offset"]
+ return (math.floor(obj.timestamp()), obj.microsecond)
+
+
+class SWHTimestampConverter:
+ """This is an ORCConverter compatible class to convert timestamps from/to ORC files
+
+ Timestamps in Python are given as a pair (seconds, microseconds) and are
+ serialized as a pair (seconds, nanoseconds) in the ORC file.
+
+ """
+
+ @staticmethod
+ def from_orc(
+ seconds: int, nanoseconds: int, timezone: zoneinfo.ZoneInfo,
+ ) -> Tuple[int, int]:
+ return (seconds, nanoseconds // 1000)
+
+ @staticmethod
+ def to_orc(
+ obj: Optional[Tuple[int, int]], timezone: zoneinfo.ZoneInfo
+ ) -> Optional[Tuple[int, int]]:
+ if obj is None:
+ return None
+ return (obj[0], obj[1] * 1000 if obj[1] is not None else None)
class ORCExporter(ExporterDispatch):
@@ -87,6 +123,11 @@
export_obj,
EXPORT_SCHEMA[table_name],
compression=CompressionKind.ZSTD,
+ converters={
+ TypeKind.TIMESTAMP: cast(
+ Type[ORCConverter], SWHTimestampConverter
+ )
+ },
)
)
return self.writers[table_name]
@@ -98,7 +139,12 @@
def process_origin_visit(self, visit):
origin_visit_writer = self.get_writer_for("origin_visit")
origin_visit_writer.write(
- (visit["origin"], visit["visit"], visit["date"], visit["type"],)
+ (
+ visit["origin"],
+ visit["visit"],
+ datetime_to_tuple(visit["date"]),
+ visit["type"],
+ )
)
def process_origin_visit_status(self, visit_status):
@@ -107,7 +153,7 @@
(
visit_status["origin"],
visit_status["visit"],
- visit_status["date"],
+ datetime_to_tuple(visit_status["date"]),
visit_status["status"],
hash_to_hex_or_none(visit_status["snapshot"]),
)
@@ -142,8 +188,7 @@
hash_to_hex_or_none(release["target"]),
release["target_type"],
(release.get("author") or {}).get("fullname"),
- swh_date_to_datetime(release["date"]),
- swh_date_to_offset(release["date"]),
+ *swh_date_to_tuple(release["date"]),
)
)
@@ -154,11 +199,9 @@
hash_to_hex_or_none(revision["id"]),
revision["message"],
revision["author"]["fullname"],
- swh_date_to_datetime(revision["date"]),
- swh_date_to_offset(revision["date"]),
+ *swh_date_to_tuple(revision["date"]),
revision["committer"]["fullname"],
- swh_date_to_datetime(revision["committer_date"]),
- swh_date_to_offset(revision["committer_date"]),
+ *swh_date_to_tuple(revision["committer_date"]),
hash_to_hex_or_none(revision["directory"]),
)
)
diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py
--- a/swh/dataset/relational.py
+++ b/swh/dataset/relational.py
@@ -37,18 +37,21 @@
("target", "string"),
("target_type", "string"),
("author", "binary"),
- ("date", "timestamp"),
- ("date_offset", "smallint"),
+ ("date_timestamp", "timestamp"),
+ ("date_offset", "bigint"),
+ ("date_raw_offset_bytes", "binary"),
],
"revision": [
("id", "string"),
("message", "binary"),
("author", "binary"),
- ("date", "timestamp"),
- ("date_offset", "smallint"),
+ ("date_timestamp", "timestamp"),
+ ("date_offset", "bigint"),
+ ("date_raw_offset_bytes", "binary"),
("committer", "binary"),
- ("committer_date", "timestamp"),
- ("committer_offset", "smallint"),
+ ("committer_date_timestamp", "timestamp"),
+ ("committer_date_offset", "bigint"),
+ ("committer_date_raw_offset_bytes", "binary"),
("directory", "string"),
],
"revision_history": [
diff --git a/swh/dataset/test/test_orc.py b/swh/dataset/test/test_orc.py
--- a/swh/dataset/test/test_orc.py
+++ b/swh/dataset/test/test_orc.py
@@ -7,9 +7,10 @@
from swh.dataset.exporters.orc import (
ORCExporter,
+ SWHTimestampConverter,
+ datetime_to_tuple,
hash_to_hex_or_none,
- swh_date_to_datetime,
- swh_date_to_offset,
+ swh_date_to_tuple,
)
from swh.model.tests.swh_model_data import TEST_OBJECTS
@@ -29,7 +30,10 @@
for obj_type_dir in tmppath.iterdir():
for orc_file in obj_type_dir.iterdir():
with orc_file.open("rb") as orc_obj:
- res[obj_type_dir.name] |= set(pyorc.Reader(orc_obj))
+ res[obj_type_dir.name] |= set(pyorc.Reader(
+ orc_obj,
+ converters={pyorc.TypeKind.TIMESTAMP: SWHTimestampConverter}
+ ))
return res
return wrapped
@@ -46,7 +50,9 @@
obj_type = "origin_visit"
output = exporter({obj_type: TEST_OBJECTS[obj_type]})
for obj in TEST_OBJECTS[obj_type]:
- assert (obj.origin, obj.visit, obj.date, obj.type) in output[obj_type]
+ assert (obj.origin, obj.visit, datetime_to_tuple(obj.date), obj.type) in output[
+ obj_type
+ ]
def test_export_origin_visit_status(exporter):
@@ -56,7 +62,7 @@
assert (
obj.origin,
obj.visit,
- obj.date,
+ datetime_to_tuple(obj.date),
obj.status,
hash_to_hex_or_none(obj.snapshot),
) in output[obj_type]
@@ -82,15 +88,14 @@
obj_type = "release"
output = exporter({obj_type: TEST_OBJECTS[obj_type]})
for obj in TEST_OBJECTS[obj_type]:
- assert (
+ assert (
hash_to_hex_or_none(obj.id),
obj.name,
obj.message,
hash_to_hex_or_none(obj.target),
obj.target_type.value,
obj.author.fullname if obj.author else None,
- swh_date_to_datetime(obj.date.to_dict()) if obj.date else None,
- swh_date_to_offset(obj.date.to_dict()) if obj.date else None,
+ *swh_date_to_tuple(obj.date.to_dict() if obj.date is not None else None),
) in output[obj_type]
@@ -102,11 +107,11 @@
hash_to_hex_or_none(obj.id),
obj.message,
obj.author.fullname,
- swh_date_to_datetime(obj.date.to_dict()),
- swh_date_to_offset(obj.date.to_dict()),
+ *swh_date_to_tuple(obj.date.to_dict() if obj.date else None),
obj.committer.fullname,
- swh_date_to_datetime(obj.committer_date.to_dict()),
- swh_date_to_offset(obj.committer_date.to_dict()),
+ *swh_date_to_tuple(
+ obj.committer_date.to_dict() if obj.committer_date else None
+ ),
hash_to_hex_or_none(obj.directory),
) in output["revision"]
for i, parent in enumerate(obj.parents):
@@ -159,3 +164,31 @@
obj.status,
obj.reason,
) in output[obj_type]
+
+
+def test_date_to_tuple():
+ ts = {"seconds": 123456, "microseconds": 1515}
+ assert swh_date_to_tuple({"timestamp": ts, "offset_bytes": b"+0100"}) == (
+ (123456, 1515),
+ 60,
+ b"+0100",
+ )
+
+ assert swh_date_to_tuple(
+ {
+ "timestamp": ts,
+ "offset": 120,
+ "negative_utc": False,
+ "offset_bytes": b"+0100",
+ }
+ ) == ((123456, 1515), 60, b"+0100")
+
+ assert swh_date_to_tuple(
+ {"timestamp": ts, "offset": 120, "negative_utc": False,}
+ ) == ((123456, 1515), 120, b"+0200")
+
+ assert swh_date_to_tuple({"timestamp": ts, "offset": 0, "negative_utc": True,}) == (
+ (123456, 1515),
+ 0,
+ b"-0000",
+ )

File Metadata

Mime Type
text/plain
Expires
Fri, Jun 20, 5:43 PM (2 w, 4 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220607

Event Timeline