diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -198,7 +198,7 @@
         ValueError if the git_type is unexpected.
     """
 
-    git_object_types = {'blob', 'tree', 'commit', 'tag'}
+    git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'}
 
     if git_type not in git_object_types:
         raise ValueError('Unexpected git object type %s, expected one of %s' %
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -499,3 +499,102 @@
     components.extend([b'\n', release['message']])
 
     return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
+
+
+def snapshot_identifier(snapshot, *, ignore_unresolved=False):
+    """Return the intrinsic identifier for a snapshot.
+
+    Snapshots are a set of named branches, which are pointers to objects at any
+    level of the Software Heritage DAG.
+
+    As well as pointing to other objects in the Software Heritage DAG, branches
+    can also be *symbolic*, in which case their target is the name of another
+    branch in the same snapshot, or *dangling*, in which case the target is
+    unknown (and represented by the ``None`` value).
+
+    A snapshot identifier is a salted sha1 (using the git hashing algorithm
+    with the ``snapshot`` object type) of a manifest following the algorithm:
+
+    1. Branches are sorted using the name as key, in bytes order.
+
+    2. For each branch, the following bytes are output:
+
+      - the type of the branch target:
+
+        - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot``
+          for the corresponding entries in the DAG;
+        - ``symbolic`` for branches referencing another branch;
+        - ``dangling`` for dangling branches
+
+      - an ascii space (``\\x20``)
+      - the branch name (as raw bytes)
+      - a null byte (``\\x00``)
+      - the length of the target identifier, as an ascii-encoded decimal number
+        (``20`` for current intrinsic identifiers, ``0`` for dangling
+        branches, the length of the target branch name for symbolic branches)
+      - a colon (``:``)
+      - the identifier of the target object pointed at by the branch,
+        stored in the 'target' member:
+
+        - for contents: their *sha1_git*
+        - for directories, revisions, releases or snapshots: their intrinsic
+          identifier
+        - for symbolic branches, the name of the target branch (as raw bytes)
+        - for dangling branches, the empty string
+
+      Note that, akin to directory manifests, there is no separator between
+      entries. Because of symbolic branches, identifiers are of arbitrary
+      length but are length-encoded to avoid ambiguity.
+
+    Args:
+      snapshot (dict): the snapshot of which to compute the identifier. A
+        single entry is needed, ``'branches'``, which is itself a :class:`dict`
+        mapping each branch to its target
+      ignore_unresolved (bool): if `True`, ignore unresolved symbolic branches.
+
+    Returns:
+      str: the intrinsic identifier for `snapshot`
+
+    Raises:
+      ValueError: if a symbolic branch targets itself or a name that is not a
+        branch of `snapshot`, unless `ignore_unresolved` is set.
+
+    """
+
+    branches = snapshot['branches']
+    unresolved = []
+    lines = []
+
+    # Iterate over the names only, so that ordering is decided by the branch
+    # name alone (step 1 of the manifest algorithm above).
+    for name in sorted(branches):
+        target = branches[name]
+        if not target:
+            target_type = b'dangling'
+            target_id = b''
+        elif target['target_type'] == 'symbolic':
+            target_type = b'symbolic'
+            target_id = target['target']
+            # Only one level is checked: the target must be another existing
+            # branch name; what that branch points to is not followed.
+            if target_id not in branches or target_id == name:
+                unresolved.append((name, target_id))
+        else:
+            target_type = target['target_type'].encode()
+            target_id = identifier_to_bytes(target['target'])
+
+        # The explicit length prefix keeps variable-length symbolic targets
+        # unambiguous, since entries are concatenated without a separator.
+        lines.extend([
+            target_type, b'\x20', name, b'\x00',
+            ('%d:' % len(target_id)).encode(), target_id,
+        ])
+
+    if unresolved and not ignore_unresolved:
+        # %r rather than %s: branch names are bytes, and %s-formatting bytes
+        # triggers BytesWarning under ``python -b`` (an error with ``-bb``).
+        raise ValueError('Symbolic references unresolved: %s' %
+                         ', '.join('%r -> %r' % (source, dest)
+                                   for source, dest in unresolved))
+
+    return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -679,3 +679,92 @@
             identifiers.release_identifier(self.release_newline_in_author),
             identifiers.identifier_to_str(self.release_newline_in_author['id'])
         )
+
+
+class SnapshotIdentifier(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+        self.empty = {
+            'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+            'branches': {},
+        }
+
+        self.dangling_branch = {
+            'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353',
+            'branches': {
+                b'HEAD': None,
+            },
+        }
+
+        self.unresolved = {
+            'id': 'ca56baf90b4fb52c0041764fdd98b77d89ef580d',
+            'branches': {
+                b'foo': {
+                    'target': b'bar',
+                    'target_type': 'symbolic',
+                },
+            },
+        }
+
+        self.all_types = {
+            'id': '175bab5e9f62248249e599af232d77f68e277965',
+            'branches': {
+                b'directory': {
+                    'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8',
+                    'target_type': 'directory',
+                },
+                b'content': {
+                    'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1',
+                    'target_type': 'content',
+                },
+                b'symbolic': {
+                    'target': b'revision',
+                    'target_type': 'symbolic',
+                },
+                b'revision': {
+                    'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6',
+                    'target_type': 'revision',
+                },
+                b'release': {
+                    'target': '7045404f3d1c54e6473c71bbb716529fbad4be24',
+                    'target_type': 'release',
+                },
+                b'snapshot': {
+                    'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+                    'target_type': 'snapshot',
+                },
+                b'dangling': None,
+            }
+        }
+
+    def test_empty_snapshot(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.empty),
+            identifiers.identifier_to_str(self.empty['id']),
+        )
+
+    def test_dangling_branch(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.dangling_branch),
+            identifiers.identifier_to_str(self.dangling_branch['id']),
+        )
+
+    def test_unresolved(self):
+        with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"):
+            identifiers.snapshot_identifier(self.unresolved)
+
+    def test_unresolved_force(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(
+                self.unresolved,
+                ignore_unresolved=True,
+            ),
+            identifiers.identifier_to_str(self.unresolved['id']),
+        )
+
+    def test_all_types(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.all_types),
+            identifiers.identifier_to_str(self.all_types['id']),
+        )