Add an optional directory filter (dir_ok_fn) to
walk_and_compute_sha1_from_directory and cover it with a test that hashes a
sample tree with and without one excluded folder.

Changes made to the "+" side while restoring this patch:
  - git.py: docstring grammar fix; filtfn written as a list comprehension.
  - test_git.py: deprecated assertEquals/assertNotEquals aliases replaced,
    membership checks use assertIn/assertNotIn, and the temporary directory
    created in setUpClass is removed in tearDownClass.

NOTE(review): this patch was rebuilt from a copy whose whitespace had been
collapsed. Blank context lines, indentation and docstring wrapping are
inferred, and text inside angle brackets appears to have been stripped from
the copy (e.g. in the commit/tag fixtures); such text is left as found.
Re-verify the context, the hunk counts and the "index" blob ids against the
base files before applying.

diff --git a/swh/model/git.py b/swh/model/git.py
index 155ba98..79852f8 100644
--- a/swh/model/git.py
+++ b/swh/model/git.py
@@ -1,249 +1,265 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 import os
 
 from enum import Enum
 
 from swh.model import hashutil, identifiers
 
 
 ROOT_TREE_KEY = b''
 
 
 class GitType(Enum):
     BLOB = b'blob'
     TREE = b'tree'
     EXEC = b'exec'
     LINK = b'link'
     COMM = b'commit'
     RELE = b'release'
     REFS = b'ref'
 
 
 class GitPerm(Enum):
     BLOB = b'100644'
     TREE = b'40000'
     EXEC = b'100755'
     LINK = b'120000'
 
 
 def compute_directory_git_sha1(dirpath, hashes):
     """Compute a directory git sha1 for a dirpath.
 
     Args:
         dirpath: the directory's absolute path
         hashes: list of tree entries with keys:
             - sha1_git: the tree entry's sha1
             - name: file or subdir's name
             - perms: the tree entry's sha1 permissions
 
         Returns:
             the binary sha1 of the dictionary's identifier
 
         Assumes:
             Every path exists in hashes.
 
     """
     directory = {
         'entries': [
             {
                 'name': entry['name'],
                 'perms': int(entry['perms'].value, 8),
                 'target': entry['sha1_git'],
                 'type': 'dir' if entry['perms'] == GitPerm.TREE else 'file',
             }
             for entry in hashes[dirpath]
         ]
     }
     return hashutil.hash_to_bytes(identifiers.directory_identifier(directory))
 
 
 def compute_revision_sha1_git(revision):
     """Compute a revision sha1 git from its dict representation.
 
     Args:
         revision: Additional dictionary information needed to compute a
         synthetic
         revision. Following keys are expected:
             - author
             - date
             - committer
             - committer_date
             - message
             - type
             - directory: binary form of the tree hash
 
     Returns:
         revision sha1 in bytes
         # FIXME: beware, bytes output from storage api
 
     """
     return hashutil.hash_to_bytes(identifiers.revision_identifier(revision))
 
 
 def compute_release_sha1_git(release):
     """Compute a release sha1 git from its dict representation.
 
     Args:
         release: Additional dictionary information needed to compute a
         synthetic release. Following keys are expected:
             - name
             - message
             - date
             - author
             - revision: binary form of the sha1_git revision targeted by this
 
     Returns:
         release sha1 in bytes
 
     """
     return hashutil.hash_to_bytes(identifiers.release_identifier(release))
 
 
 def compute_link_metadata(linkpath):
     """Given a linkpath, compute the git metadata.
 
     Args:
         linkpath: absolute pathname of the link
 
     Returns:
         Dictionary of values:
             - name: basename of the link
             - perms: git permission for link
             - type: git type for link
 
     """
     data = os.readlink(linkpath)
     link_metadata = hashutil.hash_data(data)
     link_metadata.update({
         'data': data,
         'length': len(data),
         'name': os.path.basename(linkpath),
         'perms': GitPerm.LINK,
         'type': GitType.BLOB,
         'path': linkpath
     })
 
     return link_metadata
 
 
 def compute_blob_metadata(filepath):
     """Given a filepath, compute the git metadata.
 
     Args:
         filepath: absolute pathname of the file.
 
     Returns:
         Dictionary of values:
             - name: basename of the file
             - perms: git permission for file
             - type: git type for file
 
     """
     blob_metadata = hashutil.hash_path(filepath)
     perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB
 
     blob_metadata.update({
         'name': os.path.basename(filepath),
         'perms': perms,
         'type': GitType.BLOB,
         'path': filepath
     })
 
     return blob_metadata
 
 
 def compute_tree_metadata(dirname, ls_hashes):
     """Given a dirname, compute the git metadata.
 
     Args:
         dirname: absolute pathname of the directory.
 
     Returns:
         Dictionary of values:
             - name: basename of the directory
             - perms: git permission for directory
             - type: git type for directory
 
     """
     return {
         'sha1_git': compute_directory_git_sha1(dirname, ls_hashes),
         'name': os.path.basename(dirname),
         'perms': GitPerm.TREE,
         'type': GitType.TREE,
         'path': dirname
     }
 
 
-def walk_and_compute_sha1_from_directory(rootdir):
+def walk_and_compute_sha1_from_directory(rootdir,
+                                         dir_ok_fn=lambda dirpath: True):
     """Compute git sha1 from directory rootdir.
 
+    Args:
+        - rootdir: Root directory from which to begin the git hash computation
+
+        - dir_ok_fn: Filter function to filter directory according to rules
+        defined in the function. By default, all folders are ok.
+        Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath
+
     Returns:
         Dictionary of entries with keys and as values a list of
         directory entries.
         Those are list of dictionary with keys:
             - 'perms'
             - 'type'
             - 'name'
             - 'sha1_git'
             - and specifically content: 'sha1', 'sha256', ...
 
     Note:
         One special key is ROOT_TREE_KEY to indicate the upper root of the
         directory (this is the revision's directory).
 
     Raises:
         Nothing
         If something is raised, this is a programmatic error.
 
     """
     ls_hashes = {}
     all_links = set()
 
-    for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False):
+    def filtfn(dirpath, dirnames):
+        return [dirname for dirname in dirnames
+                if dir_ok_fn(os.path.join(dirpath, dirname))]
+
+    gen_dir = ((dp, filtfn(dp, dns), fns) for (dp, dns, fns)
+               in os.walk(rootdir, topdown=False)
+               if dir_ok_fn(dp))
+
+    for dirpath, dirnames, filenames in gen_dir:
         hashes = []
 
-        links = [os.path.join(dirpath, file)
+        links = (os.path.join(dirpath, file)
                  for file in (filenames+dirnames)
-                 if os.path.islink(os.path.join(dirpath, file))]
+                 if os.path.islink(os.path.join(dirpath, file)))
 
         for linkpath in links:
             all_links.add(linkpath)
             m_hashes = compute_link_metadata(linkpath)
             hashes.append(m_hashes)
 
-        only_files = [os.path.join(dirpath, file)
+        only_files = (os.path.join(dirpath, file)
                       for file in filenames
-                      if os.path.join(dirpath, file) not in all_links]
+                      if os.path.join(dirpath, file) not in all_links)
         for filepath in only_files:
             m_hashes = compute_blob_metadata(filepath)
             hashes.append(m_hashes)
 
         ls_hashes[dirpath] = hashes
 
         dir_hashes = []
-        subdirs = [os.path.join(dirpath, dir)
+        subdirs = (os.path.join(dirpath, dir)
                    for dir in dirnames
                    if os.path.join(dirpath, dir)
-                   not in all_links]
+                   not in all_links)
         for fulldirname in subdirs:
             tree_hash = compute_tree_metadata(fulldirname, ls_hashes)
             dir_hashes.append(tree_hash)
 
         ls_hashes[dirpath].extend(dir_hashes)
 
     # compute the current directory hashes
     root_hash = {
         'sha1_git': compute_directory_git_sha1(rootdir, ls_hashes),
         'path': rootdir,
         'name': os.path.basename(rootdir),
         'perms': GitPerm.TREE,
         'type': GitType.TREE
     }
     ls_hashes[ROOT_TREE_KEY] = [root_hash]
 
     return ls_hashes
diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py
index ed3617a..4351019 100644
--- a/swh/model/tests/test_git.py
+++ b/swh/model/tests/test_git.py
@@ -1,133 +1,219 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import os
+import shutil
+import tempfile
 import unittest
+import subprocess
 
 from nose.tools import istest
 
 from swh.model import git
 
 
 class GitHashlib(unittest.TestCase):
     def setUp(self):
         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])
 
         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
 author Antoine R. Dumont (@ardumont) 1444054085 +0200
 committer Antoine R. Dumont (@ardumont) 1444054085 +0200
 
 initial
 """.encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
 type commit
 tag 0.0.1
 tagger Antoine R. Dumont (@ardumont) 1444225145 +0200
 
 blah
 """.encode('utf-8')  # NOQA
 
         self.checksums = {
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }
 
     @istest
     def compute_directory_git_sha1(self):
         # given
         dirpath = 'some-dir-path'
         hashes = {
             dirpath: [{'perms': git.GitPerm.TREE,
                        'type': git.GitType.TREE,
                        'name': b'barfoo',
                        'sha1_git': bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087')},
                       {'perms': git.GitPerm.BLOB,
                        'type': git.GitType.BLOB,
                        'name': b'hello',
                        'sha1_git': bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')},
                       {'perms': git.GitPerm.BLOB,
                        'type': git.GitType.BLOB,
                        'name': b'blah',
                        'sha1_git': bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20')}]
         }
 
         # when
         checksum = git.compute_directory_git_sha1(dirpath, hashes)
 
         # then
         self.assertEqual(checksum, self.checksums['tree_sha1_git'])
 
     @istest
     def compute_revision_sha1_git(self):
         # given
         tree_hash = bytes.fromhex('1c61f7259dcb770f46b194d941df4f08ff0a3970')
         revision = {
             'author': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'date': {
                 'timestamp': 1444054085,
                 'offset': 120,
             },
             'committer': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'committer_date': {
                 'timestamp': 1444054085,
                 'offset': 120,
             },
             'message': b'initial\n',
             'type': 'tar',
             'directory': tree_hash,
             'parents': [],
         }
 
         # when
         checksum = git.compute_revision_sha1_git(revision)
 
         # then
         self.assertEqual(checksum, self.checksums['commit_sha1_git'])
 
     @istest
     def compute_release_sha1_git(self):
         # given
         revision_hash = bytes.fromhex('24d012aaec0bc5a4d2f62c56399053'
                                       'd6cc72a241')
         release = {
             'name': b'0.0.1',
             'author': {
                 'name': b'Antoine R. Dumont (@ardumont)',
                 'email': b'antoine.romain.dumont@gmail.com',
             },
             'date': {
                 'timestamp': 1444225145,
                 'offset': 120,
             },
             'message': b'blah\n',
             'target_type': 'revision',
             'target': revision_hash,
         }
 
         # when
         checksum = git.compute_release_sha1_git(release)
 
         # then
         self.assertEqual(checksum, self.checksums['tag_sha1_git'])
+
+
+class GitHashArborescenceTree(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+        cls.tmp_root_path = tempfile.mkdtemp().encode('utf-8')
+
+        start_path = os.path.dirname(__file__).encode('utf-8')
+        sample_folder_archive = os.path.join(start_path,
+                                             b'../../../..',
+                                             b'swh-storage-testdata',
+                                             b'dir-folders',
+                                             b'sample-folder.tgz')
+
+        cls.root_path = os.path.join(cls.tmp_root_path, b'sample-folder')
+
+        # uncompress the sample folder
+        subprocess.check_output(
+            ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path])
+
+    @classmethod
+    def tearDownClass(cls):
+        # remove the extracted sample tree so test runs do not leak temp dirs
+        shutil.rmtree(cls.tmp_root_path, ignore_errors=True)
+        super().tearDownClass()
+
+    @istest
+    def walk_and_compute_sha1_from_directory(self):
+        # make a temporary arborescence tree to hash without ignoring anything
+        # same as previous behavior
+        walk0 = git.walk_and_compute_sha1_from_directory(self.tmp_root_path)
+
+        keys0 = list(walk0.keys())
+        path_excluded = os.path.join(self.tmp_root_path,
+                                     b'sample-folder',
+                                     b'foo')
+        self.assertIn(path_excluded, keys0)  # it is not excluded here
+
+        # make the same temporary arborescence tree to hash with ignoring one
+        # folder foo
+        walk1 = git.walk_and_compute_sha1_from_directory(
+            self.tmp_root_path,
+            dir_ok_fn=lambda dirpath: b'sample-folder/foo' not in dirpath)
+        keys1 = list(walk1.keys())
+        self.assertNotIn(path_excluded, keys1)
+
+        # remove the keys that can't be the same (due to hash definition)
+        # Those are the top level folders
+        keys_diff = [self.tmp_root_path,
+                     os.path.join(self.tmp_root_path, b'sample-folder'),
+                     git.ROOT_TREE_KEY]
+        for k in keys_diff:
+            self.assertNotEqual(walk0[k], walk1[k])
+
+        # The remaining keys (bottom path) should have exactly the same hashes
+        # as before
+        keys = set(keys1) - set(keys_diff)
+        actual_walk1 = {}
+        for k in keys:
+            self.assertEqual(walk0[k], walk1[k])
+            actual_walk1[k] = walk1[k]
+
+        expected_checksums = {
+            os.path.join(self.tmp_root_path, b'sample-folder/empty-folder'): [],  # noqa
+            os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'): [{  # noqa
+                'type': git.GitType.BLOB,  # noqa
+                'sha256': b'=\xb5\xae\x16\x80U\xbc\xd9:M\x08(]\xc9\x9f\xfe\xe2\x883\x03\xb2?\xac^\xab\x85\x02s\xa8\xeaUF',  # noqa
+                'name': b'another-quote.org',  # noqa
+                'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo/another-quote.org'),  # noqa
+                'perms': git.GitPerm.BLOB,  # noqa
+                'sha1': b'\x90\xa6\x13\x8b\xa5\x99\x15&\x1e\x17\x99H8j\xa1\xcc*\xa9"\n',  # noqa
+                'sha1_git': b'\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68'}],  # noqa
+            os.path.join(self.tmp_root_path, b'sample-folder/bar'): [{  # noqa
+                'type': git.GitType.TREE,  # noqa
+                'perms': git.GitPerm.TREE,  # noqa
+                'name': b'barfoo',  # noqa
+                'path': os.path.join(self.tmp_root_path, b'sample-folder/bar/barfoo'),  # noqa
+                'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87'}]}  # noqa
+
+        self.assertEqual(actual_walk1, expected_checksums)