diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
new file mode 100644
index 0000000..33e3afc
--- /dev/null
+++ b/swh/model/identifiers.py
@@ -0,0 +1,152 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from functools import lru_cache
+
+from . import hashutil
+
+
+@lru_cache()
+def identifier_to_bytes(identifier):
+    """Convert a text identifier to bytes.
+
+    Results are memoized with functools.lru_cache, so the identifier has to
+    be hashable (str and bytes both are).
+
+    Args:
+        identifier: an identifier, either a 40-char hexadecimal string or a
+            bytes object of length 20
+
+    Returns:
+        The length 20 bytestring corresponding to the given identifier
+
+    Raises:
+        ValueError if the identifier is of an unexpected type or length, or
+            if a str identifier is not made of 40 hexadecimal digits.
+    """
+
+    if isinstance(identifier, bytes):
+        if len(identifier) != 20:
+            raise ValueError(
+                'Wrong length for bytes identifier %s, expected 20' %
+                len(identifier))
+        return identifier
+
+    if isinstance(identifier, str):
+        if len(identifier) != 40:
+            raise ValueError(
+                'Wrong length for str identifier %s, expected 40' %
+                len(identifier))
+        id_bytes = bytes.fromhex(identifier)
+        # bytes.fromhex skips whitespace, so 40 characters can decode to
+        # fewer than 20 bytes; reject those rather than return a short id
+        if len(id_bytes) != 20:
+            raise ValueError(
+                'Invalid str identifier %r, expected 40 hexadecimal digits' %
+                identifier)
+        return id_bytes
+
+    raise ValueError('Wrong type for identifier %s, expected bytes or str' %
+                     identifier.__class__.__name__)
+
+
+def content_identifier(content):
+    """Return the intrinsic identifier for a content.
+
+    A content's identifier is the sha1 checksum of its data.
+
+    Args:
+        content: a content conforming to the Software Heritage schema; only
+            its 'data' member is read
+
+    Returns:
+        The 'sha1' value of the hashes that hashutil.hash_data computes for
+        the content's data.
+
+    Raises:
+        KeyError if the content doesn't have a data member.
+    """
+
+    hashes = hashutil.hash_data(content['data'], {'sha1'})
+
+    return hashes['sha1']
+
+
+def _sort_key(entry):
+    """Return the sorting key for a tree entry.
+
+    Directory entries are keyed on their name with b'/' appended, all other
+    entries on their bare name.
+    """
+    if entry['type'] == 'dir':
+        return entry['name'] + b'/'
+    else:
+        return entry['name']
+
+
+@lru_cache()
+def _perms_to_bytes(perms):
+    """Convert the perms value to its octal representation, as ascii bytes"""
+    # oct() prefixes the digits with '0o', which is not part of the output
+    oc = oct(perms)[2:]
+    return oc.encode('ascii')
+
+
+def directory_identifier(directory):
+    r"""Return the intrinsic identifier for a directory.
+
+    A directory's identifier is the tree sha1 à la git of a directory listing,
+    using the following algorithm, which is equivalent to the git algorithm for
+    trees:
+
+    1. Entries of the directory are sorted using the name (or the name with '/'
+       appended for directory entries) as key, in bytes order.
+
+    2. For each entry of the directory, the following bytes are output:
+      - the octal representation of the permissions for the entry
+        (stored in the 'perms' member), which is a representation of the
+        entry type:
+          b'100644' (int 33188) for files
+          b'100755' (int 33261) for executable files
+          b'120000' (int 40960) for symbolic links
+          b'40000' (int 16384) for directories
+          b'160000' (int 57344) for references to revisions
+      - an ascii space (b'\x20')
+      - the entry's name (as raw bytes), stored in the 'name' member
+      - a null byte (b'\x00')
+      - the 20 byte long identifier of the object pointed at by the entry,
+        stored in the 'target' member:
+          for files or executable files: their blob sha1_git
+          for symbolic links: the blob sha1_git of a file containing the
+            link destination
+          for directories: their intrinsic identifier
+          for revisions: their intrinsic identifier
+
+      (Note that there is no separator between entries)
+
+    Args:
+        directory: a mapping whose 'entries' member lists the entries, each
+            with 'type', 'name', 'perms' and 'target' members
+
+    Returns:
+        The tree identifier computed by hashutil.hash_git_data over the
+        concatenated entry bytes.
+
+    Raises:
+        ValueError if the target of an entry is not a valid identifier.
+    """
+
+    components = []
+
+    for entry in sorted(directory['entries'], key=_sort_key):
+        components.extend([
+            _perms_to_bytes(entry['perms']),
+            b'\x20',
+            entry['name'],
+            b'\x00',
+            identifier_to_bytes(entry['target']),
+        ])
+
+    return hashutil.hash_git_data(b''.join(components), 'tree')
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
new file mode 100644
index 0000000..839a806
--- /dev/null
+++ b/swh/model/tests/test_identifiers.py
@@ -0,0 +1,150 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+import unittest
+
+from nose.tools import istest
+
+from swh.model import hashutil, identifiers
+
+
+class Identifiers(unittest.TestCase):
+    """Check the identifiers module against fixtures with known ids."""
+
+    def setUp(self):
+        self.content = {
+            'status': 'visible',
+            'length': 5,
+            'data': b'1984\n',
+            'ctime': datetime.datetime(2015, 11, 22, 16, 33, 56,
+                                       tzinfo=datetime.timezone.utc),
+        }
+
+        # Merge what hashutil.hash_data computes for the data into the
+        # content; its 'sha1' member is the expected content identifier
+        self.content.update(
+            hashutil.hash_data(self.content['data']))
+
+        # The entries are listed out of order, so that the sorting done by
+        # directory_identifier is exercised as well
+        self.directory = {
+            'id': 'c2e41aae41ac17bd4a650770d6ee77f62e52235b',
+            'entries': [
+                {
+                    'type': 'file',
+                    'perms': 33188,
+                    'name': b'README',
+                    'target': '37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21'
+                },
+                {
+                    'type': 'file',
+                    'perms': 33188,
+                    'name': b'Rakefile',
+                    'target': '3bb0e8592a41ae3185ee32266c860714980dbed7'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'app',
+                    'target': '61e6e867f5d7ba3b40540869bc050b0c4fed9e95'
+                },
+                {
+                    'type': 'file',
+                    'perms': 33188,
+                    'name': b'1.megabyte',
+                    'target': '7c2b2fbdd57d6765cdc9d84c2d7d333f11be7fb3'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'config',
+                    'target': '591dfe784a2e9ccc63aaba1cb68a765734310d98'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'public',
+                    'target': '9588bf4522c2b4648bfd1c61d175d1f88c1ad4a5'
+                },
+                {
+                    'type': 'file',
+                    'perms': 33188,
+                    'name': b'development.sqlite3',
+                    'target': 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'doc',
+                    'target': '154705c6aa1c8ead8c99c7915373e3c44012057f'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'db',
+                    'target': '85f157bdc39356b7bc7de9d0099b4ced8b3b382c'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'log',
+                    'target': '5e3d3941c51cce73352dff89c805a304ba96fffe'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'script',
+                    'target': '1b278423caf176da3f3533592012502aa10f566c'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'test',
+                    'target': '035f0437c080bfd8711670b3e8677e686c69c763'
+                },
+                {
+                    'type': 'dir',
+                    'perms': 16384,
+                    'name': b'vendor',
+                    'target': '7c0dc9ad978c1af3f9a4ce061e50f5918bd27138'
+                },
+                {
+                    'type': 'rev',
+                    'perms': 57344,
+                    'name': b'will_paginate',
+                    'target': '3d531e169db92a16a9a8974f0ae6edf52e52659e'
+                }
+            ],
+        }
+
+        self.empty_directory = {
+            'id': '4b825dc642cb6eb9a060e54bf8d69288fbee4904',
+            'entries': [],
+        }
+
+    @istest
+    def content_identifier(self):
+        self.assertEqual(identifiers.content_identifier(self.content),
+                         self.content['sha1'])
+
+    @istest
+    def dir_identifier(self):
+        self.assertEqual(
+            identifiers.directory_identifier(self.directory),
+            self.directory['id'])
+
+    @istest
+    def dir_identifier_empty_directory(self):
+        self.assertEqual(
+            identifiers.directory_identifier(self.empty_directory),
+            self.empty_directory['id'])
+
+    @istest
+    def identifier_to_bytes_whitespace(self):
+        # 38 hexadecimal digits padded with 2 spaces: 40 characters, but
+        # only 19 bytes worth of data
+        with self.assertRaises(ValueError):
+            identifiers.identifier_to_bytes('ab' * 19 + '  ')