diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -759,9 +759,9 @@ $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for origins and 'emd' for raw extrinsic metadata) - $Timestamp is a decimal representation of the integer number of seconds since - the UNIX epoch (1970-01-01 00:00:00 UTC), with no leading '0' - (unless the timestamp value is zero) and no timezone. + $Timestamp is a decimal representation of the rounded-down integer number of + seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), + with no leading '0' (unless the timestamp value is zero) and no timezone. It may be negative by prefixing it with a '-', which must not be followed by a '0'. @@ -772,7 +772,19 @@ str: the intrinsic identifier for `metadata` """ - timestamp = metadata["discovery_date"].timestamp() + # equivalent to using math.floor(dt.timestamp()) to round down; + # int(dt.timestamp()) is not used because it rounds toward zero, + # which would map both seconds around the epoch to timestamp 0. + # + # This should never be an issue in practice as Software Heritage didn't + # start collecting metadata before 2015. 
+ timestamp = ( + metadata["discovery_date"] + .astimezone(datetime.timezone.utc) + .replace(microsecond=0) + .timestamp() + ) + assert timestamp.is_integer() headers = [ (b"target", str(metadata["target"]).encode()), diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -920,6 +920,26 @@ "5c13f20ba336e44549baf3d7b9305b027ec9f43d", ) + def test_noninteger_timezone(self): + """Checks the discovery_date is translated to UTC before truncating + microseconds""" + tz = datetime.timezone(datetime.timedelta(microseconds=-42)) + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 11, 27, 50, 1_000_000 - 42, tzinfo=tz, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + ) + def test_negative_timestamp(self): metadata = { **self.minimal, @@ -948,6 +968,62 @@ "895d0821a2991dd376ddc303424aceb7c68280f9", ) + def test_epoch(self): + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc, + ), + } + + manifest = ( + b"raw_extrinsic_metadata 201\0" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date 0\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "27a53df54ace35ebd910493cdc70b334d6b7cb88", + ) + + def test_negative_epoch(self): + metadata = { + **self.minimal, + "discovery_date": 
datetime.datetime( + 1969, 12, 31, 23, 59, 59, 1, tzinfo=datetime.timezone.utc, + ), + } + + manifest = ( + b"raw_extrinsic_metadata 202\0" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date -1\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "be7154a8fd49d87f81547ea634d1e2152907d089", + ) + origin_example = { "url": "https://github.com/torvalds/linux",