diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 33e3afc..70b8285 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -1,123 +1,281 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import binascii
+import datetime
 from functools import lru_cache
 
 from . import hashutil
 
 
 @lru_cache()
 def identifier_to_bytes(identifier):
     """Convert a text identifier to bytes.
 
     Args:
         identifier: an identifier, either a 40-char hexadecimal string or a
             bytes object of length 20
     Returns:
         The length 20 bytestring corresponding to the given identifier
 
     Raises:
         ValueError if the identifier is of an unexpected type or length.
     """
 
     if isinstance(identifier, bytes):
         if len(identifier) != 20:
             raise ValueError(
                 'Wrong length for bytes identifier %s, expected 20' %
                 len(identifier))
         return identifier
 
     if isinstance(identifier, str):
         if len(identifier) != 40:
             raise ValueError(
                 'Wrong length for str identifier %s, expected 40' %
                 len(identifier))
         return bytes.fromhex(identifier)
 
     raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
                      identifier.__class__.__name__)
 
 
+@lru_cache()
+def identifier_to_str(identifier):
+    """Convert an identifier to an hexadecimal string.
+
+    Args:
+        identifier: an identifier, either a 40-char hexadecimal string or a
+            bytes object of length 20
+    Returns:
+        The length 40 string corresponding to the given identifier, hex encoded
+
+    Raises:
+        ValueError if the identifier is of an unexpected type or length.
+    """
+
+    if isinstance(identifier, str):
+        if len(identifier) != 40:
+            raise ValueError(
+                'Wrong length for str identifier %s, expected 40' %
+                len(identifier))
+        return identifier
+
+    if isinstance(identifier, bytes):
+        if len(identifier) != 20:
+            raise ValueError(
+                'Wrong length for bytes identifier %s, expected 20' %
+                len(identifier))
+        return binascii.hexlify(identifier).decode()
+
+    raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
+                     identifier.__class__.__name__)
+
+
 def content_identifier(content):
     """Return the intrinsic identifier for a content.
 
     A content's identifier is the sha1 checksum of its data.
 
     Args:
         content: a content conforming to the Software Heritage schema
 
     Returns:
         The intrinsic identifier of the content as a text string.
 
     Raises:
         KeyError if the content doesn't have a data member.
 
     """
 
     hashes = hashutil.hash_data(content['data'], {'sha1'})
     return hashes['sha1']
 
 
 def _sort_key(entry):
     """The sorting key for tree entries"""
     if entry['type'] == 'dir':
         return entry['name'] + b'/'
     else:
         return entry['name']
 
 
 @lru_cache()
 def _perms_to_bytes(perms):
     """Convert the perms value to its bytes representation"""
     oc = oct(perms)[2:]
     return oc.encode('ascii')
 
 
 def directory_identifier(directory):
     """Return the intrinsic identifier for a directory.
 
     A directory's identifier is the tree sha1 à la git of a directory listing,
     using the following algorithm, which is equivalent to the git algorithm for
     trees:
 
     1. Entries of the directory are sorted using the name (or the name with '/'
        appended for directory entries) as key, in bytes order.
 
     2. For each entry of the directory, the following bytes are output:
 
       - the octal representation of the permissions for the entry (stored in
         the 'perms' member), which is a representation of the entry type:
             b'100644' (int 33188)for files
             b'100755' (int 33261)for executable files
             b'120000' (int 40960)for symbolic links
             b'40000' (int 16384) for directories
             b'160000' (int 57344) for references to revisions
       - an ascii space (b'\x20')
       - the entry's name (as raw bytes), stored in the 'name' member
       - a null byte (b'\x00')
       - the 20 byte long identifier of the object pointed at by the entry,
         stored in the 'target' member:
           for files or executable files: their blob sha1_git
           for symbolic links: the blob sha1_git of a file containing the link
               destination
           for directories: their intrinsic identifier
           for revisions: their intrinsic identifier
 
       (Note that there is no separator between entries)
 
     """
 
     components = []
 
     for entry in sorted(directory['entries'], key=_sort_key):
         components.extend([
             _perms_to_bytes(entry['perms']),
             b'\x20',
             entry['name'],
             b'\x00',
             identifier_to_bytes(entry['target']),
         ])
 
     return hashutil.hash_git_data(b''.join(components), 'tree')
+
+
+def format_date(date):
+    """Convert a date object into an UTC timestamp encoded as ascii bytes.
+
+    Git stores timestamps as an integer number of seconds since the UNIX epoch.
+
+    However, Software Heritage stores timestamps as an integer number of
+    microseconds (postgres type "timestamp with time zone").
+
+    Therefore, we print timestamps with no microseconds as integers, and
+    timestamps with microseconds as floating point values.
+
+    Args:
+        date: either a datetime.datetime object, or a number of seconds since
+            the UNIX epoch (int or float)
+
+    Returns:
+        The timestamp, formatted as a decimal number and encoded as bytes.
+
+    """
+    if isinstance(date, datetime.datetime):
+        if date.microsecond == 0:
+            date = int(date.timestamp())
+        else:
+            date = date.timestamp()
+        return str(date).encode()
+    else:
+        if date == int(date):
+            date = int(date)
+        return str(date).encode()
+
+
+@lru_cache()
+def format_offset(offset):
+    """Convert an integer number of minutes into an offset representation.
+
+    The offset representation is [+-]hhmm where:
+        hh is the number of hours;
+        mm is the number of minutes.
+
+    A null offset is represented as +0000.
+    """
+    if offset >= 0:
+        sign = '+'
+    else:
+        sign = '-'
+
+    hours = abs(offset) // 60
+    minutes = abs(offset) % 60
+
+    t = '%s%02d%02d' % (sign, hours, minutes)
+    return t.encode()
+
+
+def format_author(author):
+    """Format the specification of an author or committer.
+
+    The output is equivalent to the author/committer payload of a git commit
+    header: ``name <email> timestamp offset``.
+
+    Args:
+        author: a dict with the following members:
+            name (bytes): the name of the author
+            email (bytes): the email address of the author
+            date: the date of the action, as accepted by format_date
+            date_offset (int): the UTC offset of the date, in minutes
+
+    Returns:
+        The author line payload, as bytes.
+
+    Raises:
+        KeyError if one of the members above is missing.
+
+    """
+    components = [
+        author['name'], b' <', author['email'], b'> ',
+        format_date(author['date']), b' ',
+        format_offset(author['date_offset']),
+    ]
+
+    return b''.join(components)
+
+
+def revision_identifier(revision):
+    """Return the intrinsic identifier for a revision.
+
+    A revision's identifier is the git-style commit sha1 of a manifest made of
+    the following lines:
+
+        tree <hex identifier of the 'directory' member>
+        parent <hex identifier of the parent>   (one line per non-null parent)
+        author <'author' member, formatted by format_author>
+        committer <'committer' member, formatted by format_author>
+        <empty line>
+        <'message' member, as raw bytes>
+
+    Args:
+        revision: a dict with the 'directory', 'parents', 'author',
+            'committer' and 'message' members
+
+    Returns:
+        The intrinsic identifier of the revision, as returned by
+        hashutil.hash_git_data.
+
+    """
+    components = [
+        b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
+    ]
+    for parent in revision['parents']:
+        # Null parents (e.g. [None] on synthetic revisions) are skipped
+        if parent:
+            components.extend([
+                b'parent ', identifier_to_str(parent).encode(), b'\n',
+            ])
+
+    components.extend([
+        b'author ', format_author(revision['author']), b'\n',
+        b'committer ', format_author(revision['committer']), b'\n',
+        b'\n',
+        revision['message'],
+    ])
+
+    return hashutil.hash_git_data(b''.join(components), 'commit')
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 2dbd8ca..0ace24f 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -1,140 +1,213 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 import unittest
 
 from nose.tools import istest
 
 from swh.model import hashutil, identifiers
 
 
 class ContentIdentifier(unittest.TestCase):
     def setUp(self):
         self.content = {
             'status': 'visible',
             'length': 5,
             'data': b'1984\n',
             'ctime': datetime.datetime(2015, 11, 22, 16, 33, 56,
                                        tzinfo=datetime.timezone.utc),
         }
 
         self.content.update(
             hashutil.hash_data(self.content['data']))
 
     @istest
     def content_identifier(self):
         self.assertEqual(identifiers.content_identifier(self.content),
                          self.content['sha1'])
 
 
 class DirectoryIdentifier(unittest.TestCase):
     def setUp(self):
         self.directory = {
             'id': 'c2e41aae41ac17bd4a650770d6ee77f62e52235b',
             'entries': [
                 {
                     'type': 'file',
                     'perms': 33188,
                     'name': b'README',
                     'target': '37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21'
                 },
                 {
                     'type': 'file',
                     'perms': 33188,
                     'name': b'Rakefile',
                     'target': '3bb0e8592a41ae3185ee32266c860714980dbed7'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'app',
                     'target': '61e6e867f5d7ba3b40540869bc050b0c4fed9e95'
                 },
                 {
                     'type': 'file',
                     'perms': 33188,
                     'name': b'1.megabyte',
                     'target': '7c2b2fbdd57d6765cdc9d84c2d7d333f11be7fb3'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'config',
                     'target': '591dfe784a2e9ccc63aaba1cb68a765734310d98'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'public',
                     'target': '9588bf4522c2b4648bfd1c61d175d1f88c1ad4a5'
                 },
                 {
                     'type': 'file',
                     'perms': 33188,
                     'name': b'development.sqlite3',
                     'target': 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'doc',
                     'target': '154705c6aa1c8ead8c99c7915373e3c44012057f'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'db',
                     'target': '85f157bdc39356b7bc7de9d0099b4ced8b3b382c'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'log',
                     'target': '5e3d3941c51cce73352dff89c805a304ba96fffe'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'script',
                     'target': '1b278423caf176da3f3533592012502aa10f566c'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'test',
                     'target': '035f0437c080bfd8711670b3e8677e686c69c763'
                 },
                 {
                     'type': 'dir',
                     'perms': 16384,
                     'name': b'vendor',
                     'target': '7c0dc9ad978c1af3f9a4ce061e50f5918bd27138'
                 },
                 {
                     'type': 'rev',
                     'perms': 57344,
                     'name': b'will_paginate',
                     'target': '3d531e169db92a16a9a8974f0ae6edf52e52659e'
                 }
             ],
         }
 
         self.empty_directory = {
             'id': '4b825dc642cb6eb9a060e54bf8d69288fbee4904',
             'entries': [],
         }
 
     @istest
     def dir_identifier(self):
         self.assertEqual(
             identifiers.directory_identifier(self.directory),
             self.directory['id'])
 
     @istest
     def dir_identifier_empty_directory(self):
         self.assertEqual(
             identifiers.directory_identifier(self.empty_directory),
             self.empty_directory['id'])
+
+
+class RevisionIdentifier(unittest.TestCase):
+    def setUp(self):
+        self.revision = {
+            'id': 'bc0195aad0daa2ad5b0d76cce22b167bc3435590',
+            'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
+            'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
+            'author': {
+                'name': b'Linus Torvalds',
+                'email': b'torvalds@linux-foundation.org',
+                'date': datetime.datetime(2015, 7, 12, 22, 10, 30,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': -420,
+
+            },
+            'committer': {
+                'name': b'Linus Torvalds',
+                'email': b'torvalds@linux-foundation.org',
+                'date': datetime.datetime(2015, 7, 12, 22, 10, 30,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': -420,
+
+            },
+            'message': b'Linux 4.2-rc2\n',
+        }
+
+        self.synthetic_revision = {
+            'id': b'\xb2\xa7\xe1&\x04\x92\xe3D\xfa\xb3\xcb\xf9\x1b\xc1<\x91'
+                  b'\xe0T&\xfd',
+            'author': {
+                'name': b'Software Heritage',
+                'email': b'robot@softwareheritage.org',
+                'date': datetime.datetime(2015, 7, 16, 11, 51, 35,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': 0,
+            },
+            'type': 'tar',
+            'committer': {
+                'name': b'Software Heritage',
+                'date': datetime.datetime(2015, 7, 16, 11, 51, 35,
+                                          tzinfo=datetime.timezone.utc),
+                'email': b'robot@softwareheritage.org',
+                'date_offset': 0,
+            },
+            'synthetic': True,
+            'parents': [None],
+            'message': b'synthetic revision message\n',
+            'directory': b'\xd1\x1f\x00\xa6\xa0\xfe\xa6\x05SA\xd2U\x84\xb5\xa9'
+                         b'e\x16\xc0\xd2\xb8',
+            'metadata': {'original_artifact': [
+                {'archive_type': 'tar',
+                 'name': 'gcc-5.2.0.tar.bz2',
+                 'sha1_git': '39d281aff934d44b439730057e55b055e206a586',
+                 'sha1': 'fe3f5390949d47054b613edc36c557eb1d51c18e',
+                 'sha256': '5f835b04b5f7dd4f4d2dc96190ec1621b8d89f'
+                           '2dc6f638f9f8bc1b1014ba8cad'}]},
+
+        }
+
+    @istest
+    def revision_identifier(self):
+        self.assertEqual(
+            identifiers.revision_identifier(self.revision),
+            identifiers.identifier_to_str(self.revision['id']),
+        )
+
+    @istest
+    def revision_identifier_synthetic(self):
+        self.assertEqual(
+            identifiers.revision_identifier(self.synthetic_revision),
+            identifiers.identifier_to_str(self.synthetic_revision['id']),
+        )