diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -759,6 +759,12 @@ $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for origins and 'emd' for raw extrinsic metadata) + $Timestamp is a decimal representation of the rounded-down integer number of + seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), with no leading '0' + (unless the timestamp value is zero) and no timezone. + It may be negative by prefixing it with a '-', which must not be followed + by a '0'. + Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, ie. by adding a space after them. @@ -766,9 +772,14 @@ str: the intrinsic identifier for `metadata` """ + # Drop the microseconds on the datetime itself instead of relying on int() + # alone: int() rounds toward zero, so a pre-epoch date with microseconds + # would otherwise get a different timestamp than the same date without them. + timestamp = metadata["discovery_date"].replace(microsecond=0).timestamp() + headers = [ (b"target", str(metadata["target"]).encode()), - (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")), + (b"discovery_date", str(int(timestamp)).encode("ascii")), ( b"authority", f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -807,9 +807,9 @@ def test_minimal(self): manifest = ( - b"raw_extrinsic_metadata 225\0" + b"raw_extrinsic_metadata 210\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -823,14 +823,14 @@ ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), - "df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2", + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", ) def test_maximal(self): manifest = ( - b"raw_extrinsic_metadata 548\0" + b"raw_extrinsic_metadata 533\0" b"target 
swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -851,7 +851,7 @@ ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.maximal), - "55563d91a3f9cb41aa36c60c2b518433bf318ae4", + "f96966e1093d15236a31fde07e47d5b1c9428049", ) def test_nonascii_path(self): @@ -860,9 +860,9 @@ "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", } manifest = ( - b"raw_extrinsic_metadata 246\0" + b"raw_extrinsic_metadata 231\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -878,7 +878,89 @@ ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - "d8e5856601cdae96dfdfb5147235895949c9322d", + "7cc83fd1912176510c083f5df43f01b09af4b333", + ) + + def test_timezone_insensitive(self): + """Checks the timezone of the datetime.datetime does not affect the + hashed manifest.""" + utc_plus_one = datetime.timezone(datetime.timedelta(hours=1)) + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 12, 27, 51, tzinfo=utc_plus_one, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + ) + + def test_microsecond_insensitive(self): + """Checks the microseconds of the datetime.datetime do not affect the + hashed manifest.""" + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 11, 27, 51, 123456, tzinfo=datetime.timezone.utc, + ), + } + + 
self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + ) + + def test_negative_timestamp(self): + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 1960, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc, + ), + } + + manifest = ( + b"raw_extrinsic_metadata 210\0" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date -313504329\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "895d0821a2991dd376ddc303424aceb7c68280f9", + ) + + def test_negative_timestamp_microsecond_insensitive(self): + """Checks that microseconds do not affect the hashed manifest of a date + before the UNIX epoch (ie. that the timestamp is rounded down).""" + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 1960, 1, 25, 11, 27, 51, 123456, tzinfo=datetime.timezone.utc, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "895d0821a2991dd376ddc303424aceb7c68280f9", ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -855,7 +855,7 @@ m = RawExtrinsicMetadata(target=_origin_swhid, **_common_metadata_fields,) assert m.to_dict() == { "target": str(_origin_swhid), - "id": b"\xeck\x9cQ\xf1\x1f\xeb\xde\x85{\x7f\xf0\x83\x9c\x8a\xd5\xfb\x8e2\xef", + "id": b"@j\xc9\x01\xbc\x1e#p*\xf3q9\xa7u\x97\x00\x14\x02xa", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m @@ -863,7 +863,7 @@ m = RawExtrinsicMetadata(target=_content_swhid, **_common_metadata_fields,) assert m.to_dict() == { "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - "id": b"\x8e:_;eb\xe6\xf1Y\xd9\xa5aG[\rt\x89\xa1\x0b\xe4", + "id": b"\xbc\xa3U\xddf\x19U\xc5\xd2\xd7\xdfK\xd7c\x1f\xa8\xfeh\x992", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m @@ -882,7 +882,7 @@ ) assert m.to_dict() == { "target": 
"swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - "id": b"\x07\xf1aS\xbe\xda\xd2\xf2\xd7\xaf:\xc7\xb7\x91C\x87W\x85R\x19", + "id": b"\x14l\xb0\x1f\xb9\xc0{)\xc7\x0f\xbd\xc0*,YZ\xf5C\xab\xfc", **common_fields, "origin": "https://example.org/", "snapshot": f"swh:1:snp:{hash_hex}",