diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -688,7 +688,7 @@ ``` target_type: $ValueOfMetadataTargetType target: $UrlOrSwhid - discovery_date: $ISO8601 + discovery_date: $Timestamp authority: $StrWithoutSpaces $IRI fetcher: $Str $Version format: $StrWithoutSpaces @@ -712,6 +712,12 @@ $Swhid are core SWHIDs, as defined in :ref:`persistent-identifiers`. + $Timestamp is a decimal representation of the integer number of seconds since + the UNIX epoch (1970-01-01 00:00:00 UTC), with no leading '0' + (unless the timestamp value is zero) and no timezone. + It may be negative by prefixing it with a '-', which must not be followed + by a '0'. + Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, ie. by adding a space after them. @@ -719,10 +725,12 @@ str: the intrinsic identifier for `metadata` """ + timestamp = metadata["discovery_date"].timestamp() + headers = [ (b"target_type", metadata["type"].encode("ascii")), (b"target", str(metadata["target"]).encode()), - (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")), + (b"discovery_date", str(int(timestamp)).encode("ascii")), ( b"authority", f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -803,10 +803,10 @@ def test_minimal(self): manifest = ( - b"raw_extrinsic_metadata 245\0" + b"raw_extrinsic_metadata 230\0" b"target_type content\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -820,15 +820,15 @@ ) self.assertEqual( 
identifiers.raw_extrinsic_metadata_identifier(self.minimal), - "da734f1531f830b7282ee01c5e0c0dfe7ecc99e9", + "e35827936a4ae7c351a92eda0eeb36da07da315f", ) def test_maximal(self): manifest = ( - b"raw_extrinsic_metadata 568\0" + b"raw_extrinsic_metadata 553\0" b"target_type content\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -849,7 +849,7 @@ ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.maximal), - "0d9e3bb9a72850e32bfb575f612cfad1a7e6b66a", + "7523fa6cef72ced2935242a45def7c5a36d0f609", ) def test_nonascii_path(self): @@ -858,10 +858,10 @@ "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", } manifest = ( - b"raw_extrinsic_metadata 266\0" + b"raw_extrinsic_metadata 251\0" b"target_type content\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -877,7 +877,75 @@ ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - "63f4cb28396e00926ab7ebfd96e96b60227fc11a", + "caacc32905ebf4659a633a2dc856ccd6412991dc", + ) + + def test_timezone_insensitive(self): + """Checks that the timezone of the datetime.datetime does not affect the + hashed manifest.""" + utc_plus_one = datetime.timezone(datetime.timedelta(hours=1)) + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 12, 27, 51, tzinfo=utc_plus_one, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + 
"e35827936a4ae7c351a92eda0eeb36da07da315f", + ) + + def test_microsecond_insensitive(self): + """Checks that the microseconds of the datetime.datetime do not affect the + hashed manifest.""" + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 11, 27, 51, 123456, tzinfo=datetime.timezone.utc, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "e35827936a4ae7c351a92eda0eeb36da07da315f", + ) + + def test_negative_timestamp(self): + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 1960, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc, + ), + } + + manifest = ( + b"raw_extrinsic_metadata 230\0" + b"target_type content\n" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date -313504329\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "02e58aa3e7d476f6fc174669f9f4b88d56f534fa", ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -827,7 +827,7 @@ assert m.to_dict() == { "type": "origin", "target": _origin_url, - "id": b"\x9b[\x94\xf7\xa7$Dwbc\xf4\xdf\xaf(7G0t-\xd8", + "id": b"\xdc\xf4\x89\xaf\xbf\xd9[1\x08\xcb\xf3?\xea\n\x1d7_os\x08", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m @@ -840,7 +840,7 @@ assert m.to_dict() == { "type": "content", "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - "id": 
b"\x00\xfd\xe4\x88\xfa\xb5\xac\x7f\x16'\x96\xa8\x10\x9a\xafI\xe9>w\xa7", + "id": b"B8~?\xa1h{\x88D\xbd\xc2C\x10\x89\x0c\x95R7\xb6\x03", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m