diff --git a/scratch/walking.py b/scratch/walking.py index 852fc66..0e4acf9 100755 --- a/scratch/walking.py +++ b/scratch/walking.py @@ -1,230 +1,71 @@ #!/usr/bin/env python3 import os import shutil import tempfile -from swh.core import hashutil -from swh.loader.dir import git -from swh.loader.dir.git import GitPerm, GitType - - -def compute_content_hashes(dirpath, filename): - """Given a dirpath and a filename, compute the hashes for that particular - file. - - Args: - dirpath: the absolute path of the filename - filename: the file's name - - Returns: - The computed hashes for that dirpath/filename. - - Assumes: - The full computed path of the file exists. - - """ - fullname = os.path.join(dirpath, filename) - return hashutil.hashfile(fullname) - - -# Note: echo | git mktree --missing -def compute_directory_hash(dirpath, hashes): - """Compute a directory git sha1 for a dirpath. - - Args: - dirpath: the directory's absolute path - hashes: list of tree entries with keys: - - sha1_git: its sha1 - - name: file or subdir's name - - perms: the tree entry's sha1 permissions - - type: not used - - Returns: - sha1 git of the directory - - Assumes: - Every path exists. - - """ - def sort_by_entry_name(hashes): - return sorted(hashes, key=lambda entry: entry['name']) - - def row_entry_tree_format(hashes): - return map(lambda entry: - b''.join([entry['perms'].value, - b' ', - entry['name'], - b'\0', - entry['sha1_git']]), - hashes) - - rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) - return git.hashdata(b''.join(rows), 'tree') - - -def walk_and_compute_sha1_from_directory(rootdir): - """Compute git sha1 from directory rootdir. - - Empty directories are skipped. - - Returns: - Dictionary of entries with keys and as values a list of - directory entries. - Those are list of dictionary with keys: - - 'perms' - - 'type' - - 'name' - - 'sha1_git' - - and specifically content: 'sha1', 'sha256', ... (may be extended...) - - Note: - One special key is '' to indicate the upper root of the directory. - (This is typically the entry point of the revision). - - Raises: - Nothing - If something is raised, this is a programmatic error. - - """ - ls_hashes = {} - empty_dir = set() - - for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False): - hashes = [] - - if dirnames == [] and filenames == []: - empty_dir.add(dirpath) - continue - - # compute content hashes - for filename in filenames: - m_hashes = compute_content_hashes(dirpath, filename) - m_hashes.update({ - 'name': bytes(filename, 'utf-8'), - 'perms': GitPerm.file, # FIXME symlink, exec file, gitlink... - 'type': GitType.file, - }) - hashes.append(m_hashes) - - ls_hashes.update({ - dirpath: hashes - }) - - dir_hashes = [] - # compute directory hashes and skip empty ones - for dirname in [dir for dir in dirnames if os.path.join(dirpath, dir) not in empty_dir]: - fullname = os.path.join(dirpath, dirname) - tree_hash = compute_directory_hash(fullname, ls_hashes) - tree_hash.update({ - 'name': bytes(dirname, 'utf-8'), - 'perms': GitPerm.dir, - 'type': GitType.dir - }) - dir_hashes.append(tree_hash) - - - ls_hashes.update({ - dirpath: ls_hashes.get(dirpath, []) + dir_hashes - }) - - # compute the current directory hashes - root_hash = compute_directory_hash(rootdir, ls_hashes) - root_hash.update({ - 'name': bytes(rootdir, 'utf-8'), - 'perms': GitPerm.dir, - 'type': GitType.dir - }) - ls_hashes.update({ - '': [root_hash] - }) - - return ls_hashes +from swh.loader.dir.git import git, utils def write_file(file, content): """Write some content in a file. """ with open(file, 'w') as f: f.write(content) # prepare some arborescence with dirs and files to walk it # scratch_folder_root = tempfile.mktemp(prefix='swh.loader.dir', suffix='.tmp', dir='/tmp') scratch_folder_root = os.path.join(os.environ['HOME'], 'tmp') # scratch_folder_foo = os.path.join(scratch_folder_root, 'foo') # os.makedirs(scratch_folder_foo, exist_ok=True) # scratch_folder_bar = os.path.join(scratch_folder_root, 'bar/barfoo') # os.makedirs(scratch_folder_bar, exist_ok=True) # scratch_file = os.path.join(scratch_folder_foo, 'quotes.md') # write_file(scratch_file, # 'Shoot for the moon. Even if you miss, you\'ll land among the stars.') # scratch_file2 = os.path.join(scratch_folder_bar, 'another-quote.org') # write_file(scratch_file2, # 'A Victory without danger is a triumph without glory.\n-- Pierre Corneille') def git_ls_tree_rec(hashes): """Display the computed result for debug purposes. """ for entry in hashes.keys(): entry_properties = hashes[entry] print("entry name: %s" % entry) for file in entry_properties: - sha1 = hashutil.hash_to_hex(file['sha1_git']) + sha1 = utils.hash_to_hex(file['sha1_git']) print("%s %s %s\t%s" % (file['perms'].value.decode('utf-8'), file['type'].value.decode('utf-8'), sha1, file['name'].decode('utf-8'))) print() -def compute_revision_hash(hashes, info): - """Compute a revision's hash. - - Use the entry's sha1_git as tree representation. - - """ - tree_hash = hashutil.hash_to_hex(hashes[''][0]['sha1_git']) - - revision_content = ("""tree %s -author %s <%s> %s %s -committer %s <%s> %s %s - -%s -""" % (tree_hash, - info['revision_author_name'], - info['revision_author_email'], - info['revision_author_date'], - info['revision_author_offset'], - info['revision_committer_name'], - info['revision_committer_email'], - info['revision_committer_date'], - info['revision_committer_offset'], - info['revision_message'])).encode('utf-8') - return git.hashdata(revision_content, 'commit') - -hashes = walk_and_compute_sha1_from_directory(scratch_folder_root) +hashes = git.walk_and_compute_sha1_from_directory(scratch_folder_root) git_ls_tree_rec(hashes) ADDITIONAL_INFO = { 'revision_author_name': 'swh author', 'revision_author_email': 'swh@inria.fr', 'revision_author_date': '1444054085', 'revision_author_offset': '+0200', 'revision_committer_name': 'swh committer', 'revision_committer_email': 'swh@inria.fr', 'revision_committer_date': '1444054085', 'revision_committer_offset': '+0200', 'revision_type': 'dir', 'revision_message': 'synthetic revision message' } -print('revision directory: %s' % compute_revision_hash(hashes, ADDITIONAL_INFO)) +revision_hash = git.compute_revision_hash(hashes, ADDITIONAL_INFO) +print('revision directory: %s' % revision_hash) # clean up # shutil.rmtree(scratch_folder_root, ignore_errors = True) diff --git a/scratch/walking.py b/swh/loader/dir/git/git.py old mode 100755 new mode 100644 similarity index 62% copy from scratch/walking.py copy to swh/loader/dir/git/git.py index 852fc66..f2fcde8 --- a/scratch/walking.py +++ b/swh/loader/dir/git/git.py @@ -1,230 +1,185 @@ -#!/usr/bin/env python3 +# Copyright (C) 2015 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + import os -import shutil -import tempfile -from swh.core import hashutil -from swh.loader.dir import git -from swh.loader.dir.git import GitPerm, GitType +from enum import Enum + +from . import utils + + +class GitType(Enum): + file = b'blob' + dir = b'tree' + exec = b'exec' + link = b'link' + gitlink = b'gitlink' + + +class GitPerm(Enum): + file = b'100644' + dir = b'40000' + exec = b'100755' + link = b'120000' + gitlink = b'160000' def compute_content_hashes(dirpath, filename): """Given a dirpath and a filename, compute the hashes for that particular file. Args: dirpath: the absolute path of the filename filename: the file's name Returns: The computed hashes for that dirpath/filename. Assumes: The full computed path of the file exists. """ fullname = os.path.join(dirpath, filename) - return hashutil.hashfile(fullname) + return utils.hashfile(fullname) -# Note: echo | git mktree --missing def compute_directory_hash(dirpath, hashes): """Compute a directory git sha1 for a dirpath. Args: dirpath: the directory's absolute path hashes: list of tree entries with keys: - sha1_git: its sha1 - name: file or subdir's name - perms: the tree entry's sha1 permissions - type: not used Returns: sha1 git of the directory Assumes: Every path exists. """ def sort_by_entry_name(hashes): return sorted(hashes, key=lambda entry: entry['name']) def row_entry_tree_format(hashes): return map(lambda entry: b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) - return git.hashdata(b''.join(rows), 'tree') + return utils.hashdata(b''.join(rows), 'tree') def walk_and_compute_sha1_from_directory(rootdir): """Compute git sha1 from directory rootdir. Empty directories are skipped. Returns: Dictionary of entries with keys and as values a list of directory entries. Those are list of dictionary with keys: - 'perms' - 'type' - 'name' - 'sha1_git' - and specifically content: 'sha1', 'sha256', ... (may be extended...) Note: One special key is '' to indicate the upper root of the directory. (This is typically the entry point of the revision). Raises: Nothing If something is raised, this is a programmatic error. """ ls_hashes = {} empty_dir = set() for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False): hashes = [] if dirnames == [] and filenames == []: empty_dir.add(dirpath) continue # compute content hashes for filename in filenames: m_hashes = compute_content_hashes(dirpath, filename) m_hashes.update({ 'name': bytes(filename, 'utf-8'), 'perms': GitPerm.file, # FIXME symlink, exec file, gitlink... 'type': GitType.file, }) hashes.append(m_hashes) ls_hashes.update({ dirpath: hashes }) dir_hashes = [] # compute directory hashes and skip empty ones for dirname in [dir for dir in dirnames if os.path.join(dirpath, dir) not in empty_dir]: fullname = os.path.join(dirpath, dirname) tree_hash = compute_directory_hash(fullname, ls_hashes) tree_hash.update({ 'name': bytes(dirname, 'utf-8'), 'perms': GitPerm.dir, 'type': GitType.dir }) dir_hashes.append(tree_hash) ls_hashes.update({ dirpath: ls_hashes.get(dirpath, []) + dir_hashes }) # compute the current directory hashes root_hash = compute_directory_hash(rootdir, ls_hashes) root_hash.update({ 'name': bytes(rootdir, 'utf-8'), 'perms': GitPerm.dir, 'type': GitType.dir }) ls_hashes.update({ '': [root_hash] }) return ls_hashes -def write_file(file, content): - """Write some content in a file. - - """ - with open(file, 'w') as f: - f.write(content) - - -# prepare some arborescence with dirs and files to walk it -# scratch_folder_root = tempfile.mktemp(prefix='swh.loader.dir', suffix='.tmp', dir='/tmp') -scratch_folder_root = os.path.join(os.environ['HOME'], 'tmp') - -# scratch_folder_foo = os.path.join(scratch_folder_root, 'foo') -# os.makedirs(scratch_folder_foo, exist_ok=True) -# scratch_folder_bar = os.path.join(scratch_folder_root, 'bar/barfoo') -# os.makedirs(scratch_folder_bar, exist_ok=True) - -# scratch_file = os.path.join(scratch_folder_foo, 'quotes.md') -# write_file(scratch_file, -# 'Shoot for the moon. Even if you miss, you\'ll land among the stars.') - -# scratch_file2 = os.path.join(scratch_folder_bar, 'another-quote.org') -# write_file(scratch_file2, -# 'A Victory without danger is a triumph without glory.\n-- Pierre Corneille') - -def git_ls_tree_rec(hashes): - """Display the computed result for debug purposes. - - """ - for entry in hashes.keys(): - entry_properties = hashes[entry] - print("entry name: %s" % entry) - for file in entry_properties: - sha1 = hashutil.hash_to_hex(file['sha1_git']) - print("%s %s %s\t%s" % (file['perms'].value.decode('utf-8'), - file['type'].value.decode('utf-8'), - sha1, - file['name'].decode('utf-8'))) - print() - - def compute_revision_hash(hashes, info): """Compute a revision's hash. Use the entry's sha1_git as tree representation. """ - tree_hash = hashutil.hash_to_hex(hashes[''][0]['sha1_git']) + tree_hash = utils.hash_to_hex(hashes[''][0]['sha1_git']) revision_content = ("""tree %s author %s <%s> %s %s committer %s <%s> %s %s %s """ % (tree_hash, info['revision_author_name'], info['revision_author_email'], info['revision_author_date'], info['revision_author_offset'], info['revision_committer_name'], info['revision_committer_email'], info['revision_committer_date'], info['revision_committer_offset'], info['revision_message'])).encode('utf-8') - return git.hashdata(revision_content, 'commit') - -hashes = walk_and_compute_sha1_from_directory(scratch_folder_root) -git_ls_tree_rec(hashes) - -ADDITIONAL_INFO = { - 'revision_author_name': 'swh author', - 'revision_author_email': 'swh@inria.fr', - 'revision_author_date': '1444054085', - 'revision_author_offset': '+0200', - 'revision_committer_name': 'swh committer', - 'revision_committer_email': 'swh@inria.fr', - 'revision_committer_date': '1444054085', - 'revision_committer_offset': '+0200', - 'revision_type': 'dir', - 'revision_message': 'synthetic revision message' -} - -print('revision directory: %s' % compute_revision_hash(hashes, ADDITIONAL_INFO)) - -# clean up -# shutil.rmtree(scratch_folder_root, ignore_errors = True) + return utils.hashdata(revision_content, 'commit') diff --git a/swh/loader/dir/git.py b/swh/loader/dir/git/utils.py similarity index 88% rename from swh/loader/dir/git.py rename to swh/loader/dir/git/utils.py index 4af915e..3aaa2ea 100644 --- a/swh/loader/dir/git.py +++ b/swh/loader/dir/git/utils.py @@ -1,90 +1,78 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO -from enum import Enum from swh.core import hashutil -class GitType(Enum): - file = b'blob' - dir = b'tree' - exec = b'exec' - link = b'link' - gitlink = b'gitlink' - - -class GitPerm(Enum): - file = b'100644' - dir = b'40000' - exec = b'100755' - link = b'120000' - gitlink = b'160000' +hashfile = hashutil.hashfile +hash_to_hex = hashutil.hash_to_hex +hex_to_hash = hashutil.hex_to_hash def _new_hash(header_type, length): """Initialize a digest object (as returned by python's hashlib) for the git sha1 algorithm. This is in charge of pre-computing the needed header for git. Args: header_type: a git sha1 type ('blob', 'tree', 'commit', 'tag') length: Length of content to hash. Could be None if when hashing with sha1 and sha256 Returns: A digest object Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = hashlib.new('sha1') if header_type not in ('blob', 'commit', 'tree', 'tag'): raise ValueError( 'Only supported types are blob, commit, tree, tag') h.update(('%s %d\0' % (header_type, length)).encode('ascii')) return h def _hash_file_obj(f, header_type, length): """hash (git sha1) the content of a file-like object f with header_type and length. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = _new_hash(header_type, length) while True: chunk = f.read(hashutil.HASH_BLOCK_SIZE) if not chunk: break h.update(chunk) return {'sha1_git': h.digest()} def hashdata(data, header_type): """Hash data as git sha1 with header_type. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ buf = BytesIO(data) return _hash_file_obj(buf, header_type, len(data)) diff --git a/swh/loader/dir/tests/test_git.py b/swh/loader/dir/tests/test_git_utils.py similarity index 88% rename from swh/loader/dir/tests/test_git.py rename to swh/loader/dir/tests/test_git_utils.py index 0918e21..0606891 100644 --- a/swh/loader/dir/tests/test_git.py +++ b/swh/loader/dir/tests/test_git_utils.py @@ -1,70 +1,70 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest -from swh.loader.dir import git +from swh.loader.dir.git import utils class GitHashlib(unittest.TestCase): def setUp(self): self.blob_data = b'42\n' self.tree_data = b''.join([b'40000 barfoo\0', bytes.fromhex('c3020f6bf135a38c6df3afeb5fb38232c5e07087'), b'100644 blah\0', bytes.fromhex('63756ef0df5e4f10b6efa33cfe5c758749615f20'), b'100644 hello\0', bytes.fromhex('907b308167f0880fb2a5c0e1614bb0c7620f9dc3')]) self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 author Antoine R. Dumont (@ardumont) 1444054085 +0200 committer Antoine R. Dumont (@ardumont) 1444054085 +0200 initial """.encode('utf-8') self.checksums = { 'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1' 'e07157b6cd'), 'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db' '121dacdb1c'), 'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399' 'd629189653'), } @istest def unknown_header_type(self): with self.assertRaises(ValueError) as cm: - git.hashdata(b'any-data', 'some-unknown-type') + utils.hashdata(b'any-data', 'some-unknown-type') self.assertIn('Only supported types', cm.exception.args[0]) @istest def hashdata_content(self): # when - checksums = git.hashdata(self.blob_data, 'blob') + checksums = utils.hashdata(self.blob_data, 'blob') # then self.assertEqual(checksums['sha1_git'], self.checksums['blob_sha1_git']) @istest def hashdata_tree(self): # when - checksums = git.hashdata(self.tree_data, 'tree') + checksums = utils.hashdata(self.tree_data, 'tree') # then self.assertEqual(checksums['sha1_git'], self.checksums['tree_sha1_git']) @istest def hashdata_revision(self): # when - checksums = git.hashdata(self.commit_data, 'commit') + checksums = utils.hashdata(self.commit_data, 'commit') # then self.assertEqual(checksums['sha1_git'], self.checksums['commit_sha1_git'])