Page MenuHomeSoftware Heritage

D277.id933.diff
No OneTemporary

D277.id933.diff

diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -198,7 +198,7 @@
ValueError if the git_type is unexpected.
"""
- git_object_types = {'blob', 'tree', 'commit', 'tag'}
+ git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -499,3 +499,88 @@
components.extend([b'\n', release['message']])
return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
+
+
+def snapshot_identifier(snapshot, *, ignore_unresolved=False):
+ """Return the intrinsic identifier for a snapshot.
+
+ A snapshot is a set of named branches, which are pointers to objects at any
+ level of the Software Heritage DAG.
+
+ As well as pointing to other objects in the Software Heritage DAG, branches
+ can also be *symbolic*, in which case their target is the name of another
+ branch in the same snapshot, or *dangling*, in which case the target is
+ unknown (and represented by the ``None`` value).
+
+ A snapshot identifier is a salted sha1 (using the git hashing algorithm
+ with the ``snapshot`` object type) of a manifest built as follows:
+
+ 1. Branches are sorted using the name as key, in bytes order.
+
+ 2. For each branch, the following bytes are output:
+
+ - the type of the branch target:
+
+ - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot``
+ for the corresponding entries in the DAG;
+ - ``symbolic`` for branches referencing another branch;
+ - ``dangling`` for dangling branches
+
+ - an ascii space (``\\x20``)
+ - the branch name (as raw bytes)
+ - a null byte (``\\x00``)
+ - the length of the target identifier, as an ascii-encoded decimal number
+ (``20`` for current intrinsic identifiers, ``0`` for dangling
+ branches, the length of the target branch name for symbolic branches)
+ - a colon (``:``)
+ - the identifier of the target object pointed at by the branch,
+ stored in the 'target' member:
+
+ - for contents: their *sha1_git*
+ - for directories, revisions, releases or snapshots: their intrinsic
+ identifier
+ - for symbolic branches, the name of the target branch (as raw bytes)
+ - for dangling branches, the empty string
+
+ Note that, akin to directory manifests, there is no separator between
+ entries. Because of symbolic branches, identifiers are of arbitrary
+ length but are length-encoded to avoid ambiguity.
+
+ Args:
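+ snapshot (dict): the snapshot of which to compute the identifier. A
+ single entry is needed, ``'branches'``, which is itself a :class:`dict`
+ mapping each branch name (as bytes) to its target: either ``None`` for a
+ dangling branch, or a :class:`dict` with ``'target_type'`` and ``'target'`` entries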
+ ignore_unresolved (bool): if `True`, ignore unresolved symbolic branches instead of raising :exc:`ValueError`.
+
+ Returns:
+ str: the intrinsic identifier for `snapshot`
+
+ """
+
+ unresolved = []
+ lines = []
+
+ for name, target in sorted(snapshot['branches'].items()):
+ if not target:
+ target_type = b'dangling'
+ target_id = b''
+ elif target['target_type'] == 'symbolic':
+ target_type = b'symbolic'
+ target_id = target['target']
+ if target_id not in snapshot['branches'] or target_id == name:
+ unresolved.append((name, target_id))
+ else:
+ target_type = target['target_type'].encode()
+ target_id = identifier_to_bytes(target['target'])
+
+ lines.extend([
+ target_type, b'\x20', name, b'\x00',
+ ('%d:' % len(target_id)).encode(), target_id,
+ ])
+
+ if unresolved and not ignore_unresolved:
+ raise ValueError('Symbolic references unresolved: %s' %
+ ', '.join('%s -> %s' % (name, target)
+ for name, target in unresolved))
+
+ return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -679,3 +679,92 @@
identifiers.release_identifier(self.release_newline_in_author),
identifiers.identifier_to_str(self.release_newline_in_author['id'])
)
+
+
+class SnapshotIdentifier(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+
+ self.empty = {
+ 'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+ 'branches': {},
+ }
+
+ self.dangling_branch = {
+ 'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353',
+ 'branches': {
+ b'HEAD': None,
+ },
+ }
+
+ self.unresolved = {
+ 'id': 'ca56baf90b4fb52c0041764fdd98b77d89ef580d',
+ 'branches': {
+ b'foo': {
+ 'target': b'bar',
+ 'target_type': 'symbolic',
+ },
+ },
+ }
+
+ self.all_types = {
+ 'id': '175bab5e9f62248249e599af232d77f68e277965',
+ 'branches': {
+ b'directory': {
+ 'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8',
+ 'target_type': 'directory',
+ },
+ b'content': {
+ 'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1',
+ 'target_type': 'content',
+ },
+ b'symbolic': {
+ 'target': b'revision',
+ 'target_type': 'symbolic',
+ },
+ b'revision': {
+ 'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6',
+ 'target_type': 'revision',
+ },
+ b'release': {
+ 'target': '7045404f3d1c54e6473c71bbb716529fbad4be24',
+ 'target_type': 'release',
+ },
+ b'snapshot': {
+ 'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+ 'target_type': 'snapshot',
+ },
+ b'dangling': None,
+ }
+ }
+
+ def test_empty_snapshot(self):
+ self.assertEqual(
+ identifiers.snapshot_identifier(self.empty),
+ identifiers.identifier_to_str(self.empty['id']),
+ )
+
+ def test_dangling_branch(self):
+ self.assertEqual(
+ identifiers.snapshot_identifier(self.dangling_branch),
+ identifiers.identifier_to_str(self.dangling_branch['id']),
+ )
+
+ def test_unresolved(self):
+ with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"):
+ identifiers.snapshot_identifier(self.unresolved)
+
+ def test_unresolved_force(self):
+ self.assertEqual(
+ identifiers.snapshot_identifier(
+ self.unresolved,
+ ignore_unresolved=True,
+ ),
+ identifiers.identifier_to_str(self.unresolved['id']),
+ )
+
+ def test_all_types(self):
+ self.assertEqual(
+ identifiers.snapshot_identifier(self.all_types),
+ identifiers.identifier_to_str(self.all_types['id']),
+ )

File Metadata

Mime Type
text/plain
Expires
Nov 4 2024, 6:35 PM (19 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219179

Event Timeline