diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index 2d5ff12..b2558a3 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -1,187 +1,190 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import functools
 import hashlib
 from io import BytesIO
 import os
 
 # supported hashing algorithms
 ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
 HASH_BLOCK_SIZE = 32768
 
 
 def _new_git_hash(base_algo, git_type, length):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm, and feed it with the header for a git object of the
     given type and length.
 
     The header for hashing a git object consists of:
      - The type of the object (encoded in ASCII)
      - One ASCII space (\x20)
      - The length of the object (decimal encoded in ASCII)
      - One NUL byte
 
     Args:
         base_algo: a hashlib-supported algorithm
         git_type: the type of the git object (supposedly one of 'blob',
                   'commit', 'tag', 'tree')
         length: the length of the git object you're encoding
 
     Returns:
         a hashutil.hash object
     """
 
     h = hashlib.new(base_algo)
     git_header = '%s %d\0' % (git_type, length)
     h.update(git_header.encode('ascii'))
 
     return h
 
 
 def _new_hash(algo, length=None):
     """Initialize a digest object (as returned by python's hashlib) for the
     requested algorithm. See the constant ALGORITHMS for the list of supported
     algorithms. If a git-specific hashing algorithm is requested (e.g.,
     "sha1_git"), the hashing object will be pre-fed with the needed header; for
     this to work, length must be given.
 
     Args:
         algo: a hashing algorithm (one of ALGORITHMS)
         length: the length of the hashed payload (needed for git-specific
                 algorithms)
 
     Returns:
         a hashutil.hash object
 
     Raises:
         ValueError if algo is unknown, or length is missing for a git-specific
         hash.
     """
     if algo not in ALGORITHMS:
         raise ValueError('Unexpected hashing algorithm %s, '
                          'expected one of %s' %
                          (algo, ', '.join(sorted(ALGORITHMS))))
 
     h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('Missing length for git hashing algorithm')
         base_algo = algo[:-4]
         h = _new_git_hash(base_algo, 'blob', length)
     else:
         h = hashlib.new(algo)
 
     return h
 
 
 def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
         fobj: a file-like object
         length: the length of the contents of the file-like object (for the
                 git-specific algorithms)
         algorithms: the hashing algorithms used
 
     Returns: a dict mapping each algorithm to a bytes digest.
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
     """
     hashes = {algo: _new_hash(algo, length) for algo in algorithms}
 
     while True:
         chunk = fobj.read(HASH_BLOCK_SIZE)
         if not chunk:
             break
         for hash in hashes.values():
             hash.update(chunk)
         if chunk_cb:
             chunk_cb(chunk)
 
     return {algo: hash.digest() for algo, hash in hashes.items()}
 
 
 def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
     """Hash the contents of the file at the given path with the given
     algorithms.
 
     Args:
         path: the path of the file to hash
         algorithms: the hashing algorithms used
         chunk_cb: a callback
 
     Returns: a dict mapping each algorithm to a bytes digest.
+        The dict also holds a 'length' key: the file size in bytes (int).
 
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
         OSError on file access error
     """
     length = os.path.getsize(path)
     with open(path, 'rb') as fobj:
-        return hash_file(fobj, length, algorithms, chunk_cb)
+        hash = hash_file(fobj, length, algorithms, chunk_cb)
+    hash['length'] = length
+    return hash
 
 
 def hash_data(data, algorithms=ALGORITHMS):
     """Hash the given binary blob with the given algorithms.
 
     Args:
         data: a bytes object
         algorithms: the hashing algorithms used
 
     Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         TypeError if data does not support the buffer interface.
         ValueError if algorithms contains an unknown hash algorithm.
     """
     fobj = BytesIO(data)
     return hash_file(fobj, len(data), algorithms)
 
 
 def hash_git_data(data, git_type, base_algo='sha1'):
     """Hash the given data as a git object of type git_type.
 
     Args:
         data: a bytes object
         git_type: the git object type
         base_algo: the base hashing algorithm used (default: sha1)
 
     Returns: a dict mapping each algorithm to a bytes digest
 
     Raises:
         ValueError if the git_type is unexpected.
     """
     git_object_types = {'blob', 'tree', 'commit', 'tag'}
 
     if git_type not in git_object_types:
         raise ValueError('Unexpected git object type %s, expected one of %s' %
                          (git_type, ', '.join(sorted(git_object_types))))
 
     h = _new_git_hash(base_algo, git_type, len(data))
     h.update(data)
 
     return h.digest()
 
 
 @functools.lru_cache()
 def hash_to_hex(hash):
     """Converts a hash (in hex or bytes form) to its hexadecimal ascii form"""
     if isinstance(hash, str):
         return hash
     return binascii.hexlify(hash).decode('ascii')
 
 
 @functools.lru_cache()
 def hash_to_bytes(hash):
     """Converts a hash (in hex or bytes form) to its raw bytes form"""
     if isinstance(hash, bytes):
         return hash
     return bytes.fromhex(hash)
diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py
index a630835..76ece28 100644
--- a/swh/model/tests/test_git.py
+++ b/swh/model/tests/test_git.py
@@ -1,498 +1,499 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import shutil
 import subprocess
 import tempfile
 import unittest
 
 from nose.tools import istest
 
 from swh.model import git
 
 
 class GitHashlib(unittest.TestCase):
     def setUp(self):
         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])
 
         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
 author Antoine R. Dumont (@ardumont) 1444054085 +0200
 committer Antoine R. Dumont (@ardumont) 1444054085 +0200
 
 initial
 """.encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
 type commit
 tag 0.0.1
 tagger Antoine R. Dumont (@ardumont) 1444225145 +0200
 
 blah
 """.encode('utf-8')  # NOQA
 
         self.checksums = {
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }
 
     @istest
     def compute_directory_git_sha1(self):
         # given
         dirpath = 'some-dir-path'
         hashes = {
             dirpath: [{'perms': git.GitPerm.TREE,
                        'type': git.GitType.TREE,
                        'name': b'barfoo',
                        'sha1_git': bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087')},
                       {'perms': git.GitPerm.BLOB,
                        'type': git.GitType.BLOB,
                        'name': b'hello',
                        'sha1_git': bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')},
                       {'perms': git.GitPerm.BLOB,
                        'type': git.GitType.BLOB,
                        'name': b'blah',
                        'sha1_git': bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20')}]
         }
 
         # when
         checksum = git.compute_directory_git_sha1(dirpath, hashes)
 
         # then
         self.assertEqual(checksum, self.checksums['tree_sha1_git'])
 
     @istest
     def compute_revision_sha1_git(self):
         # given
         tree_hash = bytes.fromhex('1c61f7259dcb770f46b194d941df4f08ff0a3970')
         revision = {
             'author': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'date': {
                 'timestamp': 1444054085,
                 'offset': 120,
             },
             'committer': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'committer_date': {
                 'timestamp': 1444054085,
                 'offset': 120,
             },
             'message': b'initial\n',
             'type': 'tar',
             'directory': tree_hash,
             'parents': [],
         }
 
         # when
         checksum = git.compute_revision_sha1_git(revision)
 
         # then
         self.assertEqual(checksum, self.checksums['commit_sha1_git'])
 
     @istest
     def compute_release_sha1_git(self):
         # given
         revision_hash = bytes.fromhex('24d012aaec0bc5a4d2f62c56399053'
                                       'd6cc72a241')
         release = {
             'name': b'0.0.1',
             'author': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'date': {
                 'timestamp': 1444225145,
                 'offset': 120,
             },
             'message': b'blah\n',
             'target_type': 'revision',
             'target': revision_hash,
         }
 
         # when
         checksum = git.compute_release_sha1_git(release)
 
         # then
         self.assertEqual(checksum, self.checksums['tag_sha1_git'])
 
 
 class GitHashWalkArborescenceTree(unittest.TestCase):
     """Root class to ease walk and git hash testing without side-effecty
     problems.
 
     """
     def setUp(self):
         self.tmp_root_path = tempfile.mkdtemp().encode('utf-8')
 
         start_path = os.path.dirname(__file__).encode('utf-8')
         sample_folder_archive = os.path.join(start_path,
                                              b'../../../..',
                                              b'swh-storage-testdata',
                                              b'dir-folders',
                                              b'sample-folder.tgz')
 
         self.root_path = os.path.join(self.tmp_root_path, b'sample-folder')
 
         # uncompress the sample folder
         subprocess.check_output(
             ['tar', 'xvf', sample_folder_archive, '-C', self.tmp_root_path])
 
     def tearDown(self):
         if os.path.exists(self.tmp_root_path):
             shutil.rmtree(self.tmp_root_path)
 
 
 class GitHashFromScratch(GitHashWalkArborescenceTree):
     """Test the main `walk_and_compute_sha1_from_directory` algorithm that
     scans and compute the disk for checksums.
 
     """
     @istest
     def walk_and_compute_sha1_from_directory(self):
         # make a temporary arborescence tree to hash without ignoring anything
         # same as previous behavior
         walk0 = git.walk_and_compute_sha1_from_directory(self.tmp_root_path)
 
         keys0 = list(walk0.keys())
         path_excluded = os.path.join(self.tmp_root_path,
                                      b'sample-folder',
                                      b'foo')
         self.assertTrue(path_excluded in keys0)  # it is not excluded here
 
         # make the same temporary arborescence tree to hash with ignoring one
         # folder foo
         walk1 = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path,
             dir_ok_fn=lambda dirpath: b'sample-folder/foo' not in dirpath)
         keys1 = list(walk1.keys())
         self.assertTrue(path_excluded not in keys1)
 
         # remove the keys that can't be the same (due to hash definition)
         # Those are the top level folders
         keys_diff = [self.tmp_root_path,
                      os.path.join(self.tmp_root_path, b'sample-folder'),
                      git.ROOT_TREE_KEY]
         for k in keys_diff:
             self.assertNotEquals(walk0[k], walk1[k])
 
         # The remaining keys (bottom path) should have exactly the same hashes
         # as before
         keys = set(keys1) - set(keys_diff)
         actual_walk1 = {}
         for k in keys:
             self.assertEquals(walk0[k], walk1[k])
             actual_walk1[k] = walk1[k]
 
         expected_checksums = {
             os.path.join(self.tmp_root_path, b'sample-folder/empty-folder'): [],  # noqa
             os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'): [{  # noqa
                 'type': git.GitType.BLOB,  # noqa
+                'length': 72,
                 'sha256': b'=\xb5\xae\x16\x80U\xbc\xd9:M\x08(]\xc9\x9f\xfe\xe2\x883\x03\xb2?\xac^\xab\x85\x02s\xa8\xeaUF',  # noqa
                 'name': b'another-quote.org',  # noqa
                 'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo/another-quote.org'),  # noqa
                 'perms': git.GitPerm.BLOB,  # noqa
                 'sha1': b'\x90\xa6\x13\x8b\xa5\x99\x15&\x1e\x17\x99H8j\xa1\xcc*\xa9"\n',  # noqa
                 'sha1_git': b'\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68'}],  # noqa
             os.path.join(self.tmp_root_path, b'sample-folder/bar'): [{  # noqa
                 'type': git.GitType.TREE,  # noqa
                 'perms': git.GitPerm.TREE,  # noqa
                 'name': b'barfoo',  # noqa
                 'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'),  # noqa
                 'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87'}]}  # noqa
 
         self.assertEquals(actual_walk1, expected_checksums)
 
     @istest
     def walk_and_compute_sha1_from_directory_without_root_tree(self):
         # compute the full checksums
         expected_hashes = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # except for the key on that round
         actual_hashes = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path,
             with_root_tree=False)
 
         # then, removing the root tree hash from the first round
         del expected_hashes[git.ROOT_TREE_KEY]
 
         # should give us the same checksums as the second round
         self.assertEquals(actual_hashes, expected_hashes)
 
 
 class GitHashUpdate(GitHashWalkArborescenceTree):
     """Test `walk and git hash only on modified fs` functions.
 
     """
     @istest
     def update_checksums_from_add_new_file(self):
         # make a temporary arborescence tree to hash without ignoring anything
         # update the disk in some way (add a new file)
         # update the actual git checksums from the deeper tree modified
 
         # when
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # update the existing file
         changed_path = os.path.join(self.tmp_root_path,
                                     b'sample-folder/bar/barfoo/new')
         with open(changed_path, 'wb') as f:
             f.write(b'new line')
 
         # walk1 (this will be our expectation)
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'A'}],
             objects)
 
         self.assertEquals(expected_dict, actual_dict)
 
     @istest
     def update_checksums_from_modify_existing_file(self):
         # make a temporary arborescence tree to hash without ignoring anything
         # update the disk in some way ()
         # update the actual git checksums where only the modification is needed
 
         # when
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # update existing file
         changed_path = os.path.join(
             self.tmp_root_path,
             b'sample-folder/bar/barfoo/another-quote.org')
         with open(changed_path, 'wb+') as f:
             f.write(b'I have a dream')
 
         # walk1 (this will be our expectation)
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'M'}],
             objects)
 
         self.assertEquals(expected_dict, actual_dict)
 
     @istest
     def update_checksums_no_change(self):
         # when
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # nothing changes on disk
 
         # then
         actual_dict = git.update_checksums_from([], expected_dict)
 
         self.assertEquals(actual_dict, expected_dict)
 
     @istest
     def update_checksums_delete_existing_file(self):
         # make a temporary arborescence tree to hash without ignoring anything
         # update the disk in some way (delete a file)
         # update the actual git checksums from the deeper tree modified
 
         # when
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # Remove folder
         changed_path = os.path.join(self.tmp_root_path,
                                     b'sample-folder/bar/barfoo')
         shutil.rmtree(changed_path)
 
         # Actually walking the fs will be the resulting expectation
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'D'}],
             objects)
 
         self.assertEquals(actual_dict, expected_dict)
 
     @istest
     def update_checksums_from_multiple_fs_modifications(self):
         # make a temporary arborescence tree to hash without ignoring anything
         # update the disk in some way (modify a file, add a new, delete one)
         # update the actual git checksums from the deeper tree modified
 
         # when
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # Actions on disk (imagine a checkout of some form)
 
         # 1. Create a new file
         changed_path = os.path.join(self.tmp_root_path,
                                     b'sample-folder/bar/barfoo/new')
         with open(changed_path, 'wb') as f:
             f.write(b'new line')
 
         # 2. update the existing file
         changed_path1 = os.path.join(
             self.tmp_root_path,
             b'sample-folder/bar/barfoo/another-quote.org')
         with open(changed_path1, 'wb') as f:
             f.write(b'new line')
 
         # 3. Remove some folder
         changed_path2 = os.path.join(self.tmp_root_path,
                                      b'sample-folder/foo')
         shutil.rmtree(changed_path2)
 
         # Actually walking the fs will be the resulting expectation
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'A'},
              {'path': changed_path1, 'action': 'M'},
              {'path': changed_path2, 'action': 'D'}],
             objects)
 
         self.assertEquals(expected_dict, actual_dict)
 
     @istest
     def update_checksums_from_common_ancestor(self):
         # when
         # Add some new arborescence below a folder destined to be removed
         # want to check that old keys does not remain
         future_folder_to_remove = os.path.join(self.tmp_root_path,
                                                b'sample-folder/bar/barfoo')
 
         # add .../barfoo/hello/world under (.../barfoo which will be destroyed)
         new_folder = os.path.join(future_folder_to_remove, b'hello')
         os.makedirs(new_folder, exist_ok=True)
         with open(os.path.join(future_folder_to_remove, b'world'), 'wb') as f:
             f.write(b"i'm sad 'cause i'm destined to be removed...")
 
         # now we scan the disk
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         assert objects[future_folder_to_remove]
 
         # Actions on disk (to simulate a checkout of some sort)
 
         # 1. Create a new file
         changed_path = os.path.join(self.tmp_root_path,
                                     b'sample-folder/bar/barfoo/new')
         with open(changed_path, 'wb') as f:
             f.write(b'new line')
 
         # 2. update the existing file
         changed_path1 = os.path.join(
             self.tmp_root_path,
             b'sample-folder/bar/barfoo/another-quote.org')
         with open(changed_path1, 'wb') as f:
             f.write(b'new line')
 
         # 3. Remove folder
         shutil.rmtree(future_folder_to_remove)
 
         # Actually walking the fs will be the resulting expectation
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'A'},
              {'path': changed_path1, 'action': 'M'},
              {'path': future_folder_to_remove, 'action': 'D'}],
             objects)
 
         self.assertEquals(expected_dict, actual_dict)
 
     @istest
     def update_checksums_detects_recomputation_from_all_is_needed(self):
         # when
         objects = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # Actions on disk (imagine a checkout of some form)
 
         # 1. Create a new file
         changed_path = os.path.join(self.tmp_root_path,
                                     b'new-file-at-root')
         with open(changed_path, 'wb') as f:
             f.write(b'new line')
 
         # 2. update the existing file
         changed_path1 = os.path.join(
             self.tmp_root_path,
             b'sample-folder/bar/barfoo/another-quote.org')
         with open(changed_path1, 'wb') as f:
             f.write(b'new line')
 
         # 3. Remove some folder
         changed_path2 = os.path.join(self.tmp_root_path,
                                      b'sample-folder/foo')
         # 3. Remove some folder
         changed_path2 = os.path.join(self.tmp_root_path,
                                      b'sample-folder/bar/barfoo')
         shutil.rmtree(changed_path2)
 
         # Actually walking the fs will be the resulting expectation
         expected_dict = git.walk_and_compute_sha1_from_directory(
             self.tmp_root_path)
 
         # then
         actual_dict = git.update_checksums_from(
             [{'path': changed_path, 'action': 'A'},
              {'path': changed_path1, 'action': 'M'},
              {'path': changed_path2, 'action': 'D'}],
             objects)
 
         self.assertEquals(expected_dict, actual_dict)
 
     @istest
     def commonpath(self):
         paths = ['r/0/h',
                  'r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c',
                  'r/2/e', 'r/2/f', 'r/2/g']
         self.assertEquals(git.commonpath(paths), 'r')
 
         paths = ['r/1/d', 'r/1/i/a', 'r/1/i/b', 'r/1/i/c']
         self.assertEquals(git.commonpath(paths), 'r/1')
 
         paths = ['/a/r/2/g', '/a/r/1/i/c', '/a/r/0/h']
         self.assertEquals(git.commonpath(paths), '/a/r')
 
         paths = [b'/a/r/2/g', b'/b/r/1/i/c', b'/c/r/0/h']
         self.assertEquals(git.commonpath(paths), b'/')
 
         paths = ['a/z', 'a/z', 'a/z']
         self.assertEquals(git.commonpath(paths), 'a/z')
 
         paths = ['0']
         self.assertEquals(git.commonpath(paths), '0')
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index 79cdc9e..f795e87 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -1,111 +1,112 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
 import tempfile
 import unittest
 
 from nose.tools import istest
 
 from swh.model import hashutil
 
 
 class Hashutil(unittest.TestCase):
     def setUp(self):
         self.data = b'1984\n'
         self.hex_checksums = {
             'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
             'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
             'sha256': '26602113b4b9afd9d55466b08580d3c2'
             '4a9b50ee5b5866c0d91fab0e65907311',
         }
 
         self.checksums = {
             type: bytes.fromhex(cksum)
             for type, cksum in self.hex_checksums.items()
         }
 
         self.git_hex_checksums = {
             'blob': self.hex_checksums['sha1_git'],
             'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
             'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
             'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
         }
 
         self.git_checksums = {
             type: bytes.fromhex(cksum)
             for type, cksum in self.git_hex_checksums.items()
         }
 
     @istest
     def hash_data(self):
         checksums = hashutil.hash_data(self.data)
         self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_data_unknown_hash(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_data(self.data, ['unknown-hash'])
 
         self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
         self.assertIn('unknown-hash', cm.exception.args[0])
 
     @istest
     def hash_git_data(self):
         checksums = {
             git_type: hashutil.hash_git_data(self.data, git_type)
             for git_type in self.git_checksums
         }
 
         self.assertEqual(checksums, self.git_checksums)
 
     @istest
     def hash_git_data_unknown_git_type(self):
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_git_data(self.data, 'unknown-git-type')
 
         self.assertIn('Unexpected git object type', cm.exception.args[0])
         self.assertIn('unknown-git-type', cm.exception.args[0])
 
     @istest
     def hash_file(self):
         fobj = io.BytesIO(self.data)
 
         checksums = hashutil.hash_file(fobj, length=len(self.data))
         self.assertEqual(checksums, self.checksums)
 
     @istest
     def hash_file_missing_length(self):
         fobj = io.BytesIO(self.data)
 
         with self.assertRaises(ValueError) as cm:
             hashutil.hash_file(fobj, algorithms=['sha1_git'])
 
         self.assertIn('Missing length', cm.exception.args[0])
 
     @istest
     def hash_path(self):
         with tempfile.NamedTemporaryFile(delete=False) as f:
             f.write(self.data)
             f.close()
             hashes = hashutil.hash_path(f.name)
 
+        self.checksums['length'] = len(self.data)
         self.assertEquals(self.checksums, hashes)
 
     @istest
     def hash_to_hex(self):
         for type in self.checksums:
             hex = self.hex_checksums[type]
             hash = self.checksums[type]
             self.assertEquals(hashutil.hash_to_hex(hex), hex)
             self.assertEquals(hashutil.hash_to_hex(hash), hex)
 
     @istest
     def hash_to_bytes(self):
         for type in self.checksums:
             hex = self.hex_checksums[type]
             hash = self.checksums[type]
             self.assertEquals(hashutil.hash_to_bytes(hex), hash)
             self.assertEquals(hashutil.hash_to_bytes(hash), hash)