diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -289,7 +289,14 @@ ValueError if the git_type is unexpected. """ - git_object_types = {"blob", "tree", "commit", "tag", "snapshot"} + git_object_types = { + "blob", + "tree", + "commit", + "tag", + "snapshot", + "raw_extrinsic_metadata", + } if git_type not in git_object_types: raise ValueError( diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -678,6 +678,82 @@ return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() +def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: + """Return the intrinsic identifier for a RawExtrinsicMetadata object. + + This identifier is a salted sha1 (using the git hashing algorithm + with the ``raw_extrinsic_metadata`` object type) of a manifest following the format: + + ``` + target_type $ValueOfMetadataTargetType + target $UrlOrSwhid + discovery_date $ISO8601 + authority $StrWithoutSpaces $IRI + fetcher $Str $Version + format $StrWithoutSpaces + origin $IRI <- optional + visit $IntInDecimal <- optional + snapshot $Swhid <- optional + release $Swhid <- optional + revision $Swhid <- optional + path $Bytes <- optional + directory $Swhid <- optional + + $MetadataBytes + ``` + + $IRI must be RFC 3987 IRIs (so they may contain newlines, that are escaped as + described below) + + $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces. + + $Str is a UTF-8 string. + + Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields, + i.e. by adding a space after them. 
+ + Returns: + str: the intrinsic identifier for `metadata` + + """ + headers = [ + (b"target_type", metadata["target_type"].encode("ascii")), + (b"target", str(metadata["target"]).encode()), + (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")), + ( + b"authority", + f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), + ), + ( + b"fetcher", + f"{metadata['fetcher']['name']} {metadata['fetcher']['version']}".encode(), + ), + (b"format", metadata["format"].encode()), + ] + + for key in ( + "origin", + "visit", + "snapshot", + "release", + "revision", + "path", + "directory", + ): + if metadata.get(key) is not None: + value: bytes + if key == "path": + value = metadata[key] + else: + value = str(metadata[key]).encode() + + headers.append((key.encode("ascii"), value)) + + return identifier_to_str( + hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"]) + ) + + _object_type_map = { ORIGIN: {"short_name": "ori", "key_id": "id"}, SNAPSHOT: {"short_name": "snp", "key_id": "id"}, diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -5,6 +5,7 @@ import binascii import datetime +import hashlib from typing import Dict import unittest @@ -21,6 +22,7 @@ SNAPSHOT, SWHID, normalize_timestamp, + parse_swhid, ) @@ -764,6 +766,121 @@ ) +class RawExtrinsicMetadataIdentifier(unittest.TestCase): + def setUp(self): + super().setUp() + + self.authority = { + "type": "forge", + "url": "https://forge.softwareheritage.org/", + } + self.fetcher = { + "name": "swh-phabricator-metadata-fetcher", + "version": "0.0.1", + } + + self.minimal = { + "target_type": "content", + "target": parse_swhid("swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d"), + "discovery_date": datetime.datetime( + 2021, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc + ), + "authority": self.authority, + "fetcher": self.fetcher, + 
"format": "json", + "metadata": b'{"foo": "bar"}', + } + self.maximal = { + **self.minimal, + "origin": "https://forge.softwareheritage.org/source/swh-model/", + "visit": 42, + "snapshot": parse_swhid("swh:1:snp:" + "00" * 20), + "release": parse_swhid("swh:1:rel:" + "01" * 20), + "revision": parse_swhid("swh:1:rev:" + "02" * 20), + "path": b"/abc/def", + "directory": parse_swhid("swh:1:dir:" + "03" * 20), + } + + def test_minimal(self): + manifest = ( + b"raw_extrinsic_metadata 245\0" + b"target_type content\n" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + "da734f1531f830b7282ee01c5e0c0dfe7ecc99e9", + ) + + def test_maximal(self): + manifest = ( + b"raw_extrinsic_metadata 568\0" + b"target_type content\n" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"origin https://forge.softwareheritage.org/source/swh-model/\n" + b"visit 42\n" + b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n" + b"release swh:1:rel:0101010101010101010101010101010101010101\n" + b"revision swh:1:rev:0202020202020202020202020202020202020202\n" + b"path /abc/def\n" + b"directory swh:1:dir:0303030303030303030303030303030303030303\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.maximal), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + 
identifiers.raw_extrinsic_metadata_identifier(self.maximal), + "0d9e3bb9a72850e32bfb575f612cfad1a7e6b66a", + ) + + def test_nonascii_path(self): + metadata = { + **self.minimal, + "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", + } + manifest = ( + b"raw_extrinsic_metadata 266\0" + b"target_type content\n" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"path /ab\n" + b" c/d\xf0\x9f\xa4\xb7e\x00f\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "63f4cb28396e00926ab7ebfd96e96b60227fc11a", + ) + + class OriginIdentifier(unittest.TestCase): def setUp(self): self.origin = {