diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -289,7 +289,14 @@
         ValueError if the git_type is unexpected.
     """
 
-    git_object_types = {"blob", "tree", "commit", "tag", "snapshot"}
+    git_object_types = {
+        "blob",
+        "tree",
+        "commit",
+        "tag",
+        "snapshot",
+        "raw_extrinsic_metadata",
+    }
 
     if git_type not in git_object_types:
         raise ValueError(
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -724,6 +724,85 @@
     return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest()
 
 
+def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
+    """Return the intrinsic identifier for a RawExtrinsicMetadata object.
+
+    A raw_extrinsic_metadata identifier is a salted sha1 (using the git
+    hashing algorithm with the ``raw_extrinsic_metadata`` object type) of
+    a manifest following the format:
+
+    ```
+    target $ExtendedSwhid
+    discovery_date $ISO8601
+    authority $StrWithoutSpaces $IRI
+    fetcher $Str $Version
+    format $StrWithoutSpaces
+    origin $IRI                         <- optional
+    visit $IntInDecimal                 <- optional
+    snapshot $CoreSwhid                 <- optional
+    release $CoreSwhid                  <- optional
+    revision $CoreSwhid                 <- optional
+    path $Bytes                         <- optional
+    directory $CoreSwhid                <- optional
+
+    $MetadataBytes
+    ```
+
+    $IRI values must be RFC 3987 IRIs (so they may contain newlines, which are
+    escaped as described below).
+
+    $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces.
+
+    $Str is a UTF-8 string.
+
+    $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`.
+    $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
+    origins and 'emd' for raw extrinsic metadata).
+
+    Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
+    i.e. by adding a space after them.
+
+    Returns:
+      str: the intrinsic identifier for `metadata`
+
+    """
+    headers = [
+        (b"target", str(metadata["target"]).encode()),
+        (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")),
+        (
+            b"authority",
+            f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(),
+        ),
+        (
+            b"fetcher",
+            f"{metadata['fetcher']['name']} {metadata['fetcher']['version']}".encode(),
+        ),
+        (b"format", metadata["format"].encode()),
+    ]
+
+    for key in (
+        "origin",
+        "visit",
+        "snapshot",
+        "release",
+        "revision",
+        "path",
+        "directory",
+    ):
+        if metadata.get(key) is not None:
+            value: bytes
+            if key == "path":
+                value = metadata[key]
+            else:
+                value = str(metadata[key]).encode()
+
+            headers.append((key.encode("ascii"), value))
+
+    return identifier_to_str(
+        hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"])
+    )
+
+
 # type of the "object_type" attribute of the SWHID class; either
 # ObjectType or ExtendedObjectType
 _TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType)
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -5,6 +5,7 @@
 
 import binascii
 import datetime
+import hashlib
 import itertools
 from typing import Dict
 import unittest
@@ -767,6 +768,120 @@
         )
 
 
+class RawExtrinsicMetadataIdentifier(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+        self.authority = {
+            "type": "forge",
+            "url": "https://forge.softwareheritage.org/",
+        }
+        self.fetcher = {
+            "name": "swh-phabricator-metadata-fetcher",
+            "version": "0.0.1",
+        }
+
+        self.minimal = {
+            "type": "content",
+            "target": ExtendedSWHID.from_string(
+                "swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d"
+            ),
+            "discovery_date": datetime.datetime(
+                2021, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc
+            ),
+            "authority": self.authority,
+            "fetcher": self.fetcher,
+            "format": "json",
+            "metadata": b'{"foo": "bar"}',
+        }
+        self.maximal = {
+            **self.minimal,
+            "origin": "https://forge.softwareheritage.org/source/swh-model/",
+            "visit": 42,
+            "snapshot": CoreSWHID.from_string("swh:1:snp:" + "00" * 20),
+            "release": CoreSWHID.from_string("swh:1:rel:" + "01" * 20),
+            "revision": CoreSWHID.from_string("swh:1:rev:" + "02" * 20),
+            "path": b"/abc/def",
+            "directory": CoreSWHID.from_string("swh:1:dir:" + "03" * 20),
+        }
+
+    def test_minimal(self):
+        manifest = (
+            b"raw_extrinsic_metadata 225\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge https://forge.softwareheritage.org/\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.minimal),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.minimal),
+            "df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2",
+        )
+
+    def test_maximal(self):
+        manifest = (
+            b"raw_extrinsic_metadata 548\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge https://forge.softwareheritage.org/\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"origin https://forge.softwareheritage.org/source/swh-model/\n"
+            b"visit 42\n"
+            b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n"
+            b"release swh:1:rel:0101010101010101010101010101010101010101\n"
+            b"revision swh:1:rev:0202020202020202020202020202020202020202\n"
+            b"path /abc/def\n"
+            b"directory swh:1:dir:0303030303030303030303030303030303030303\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.maximal),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.maximal),
+            "55563d91a3f9cb41aa36c60c2b518433bf318ae4",
+        )
+
+    def test_nonascii_path(self):
+        metadata = {
+            **self.minimal,
+            "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
+        }
+        manifest = (
+            b"raw_extrinsic_metadata 246\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge https://forge.softwareheritage.org/\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"path /ab\n"
+            b" c/d\xf0\x9f\xa4\xb7e\x00f\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(metadata),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(metadata),
+            "d8e5856601cdae96dfdfb5147235895949c9322d",
+        )
+
+
 origin_example = {
     "url": "https://github.com/torvalds/linux",
 }