diff --git a/scratch/walking.py b/scratch/walking.py index 2a96cd2..4781391 100755 --- a/scratch/walking.py +++ b/scratch/walking.py @@ -1,175 +1,176 @@ #!/usr/bin/env python3 import os import shutil import tempfile from swh.core import hashutil from swh.loader.dir import git +from swh.loader.dir.git import GitPerm, GitType def compute_hashes(dirpath, filename): """Given a fullname, compute its hashes. Args: dirpath: the absolute path of the filename filename: the file's name Returns: The computed hash for that filename. Assumes: The full computed path exists. """ fullname = os.path.join(dirpath, filename) return hashutil.hashfile(fullname) # Note: echo | git mktree --missing def compute_directory_hash(dirpath, hashes): """Compute a directory git sha1 for a dirpath. Args: dirpath: the absolute path of the directory hashes: list of dictionaries with keys: - sha1_git: - path: directory parent of the filename - name: file's name Returns: sha1 git of the directory Assumes: Every path exists. """ def sort_by_entry_name(hashes): if not hashes: return hashes sorted_res = sorted(hashes, key=lambda entry: entry['name']) return sorted_res def row_entry_tree_format(hashes): return map(lambda entry: - b''.join([entry['perms'], + b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) tree_content = b''.join(rows) h = git.hashdata(tree_content, 'tree') return h # 100644: regular file # 100755: executable # 40000: tree # 120000: symlink # 160000: gitlink def walk_and_compute_sha1_from_directory(dir): """Compute git sha1 from directory dir. """ ls_hashes = {} for dirpath, dirnames, filenames in os.walk(dir, topdown=False): hashes = [] # FIXME: deal with empty directories which must be skipped! # -> pb with bottom up approach # compute content hashes for filename in filenames: m_hashes = compute_hashes(dirpath, filename) m_hashes.update({ 'name': bytes(filename, 'utf-8'), - 'perms': b'100644', # FIXME symlink, exec file, gitlink... - 'type': b'blob', + 'perms': GitPerm.file, # FIXME symlink, exec file, gitlink... + 'type': GitType.file, }) hashes.append(m_hashes) ls_hashes.update({ dirpath: hashes }) dir_hashes = [] # compute directory hashes for dirname in dirnames: fullname = os.path.join(dirpath, dirname) tree_hash = compute_directory_hash(fullname, ls_hashes) tree_hash.update({ 'name': bytes(dirname, 'utf-8'), - 'perms': b'40000', - 'type': b'tree' + 'perms': GitPerm.dir, + 'type': GitType.dir }) dir_hashes.append(tree_hash) ls_hashes.update({ dirpath: ls_hashes.get(dirpath, []) + dir_hashes }) # compute the current directory hashes root_hash = compute_directory_hash(dir, ls_hashes) root_hash.update({ 'name': b'root', - 'perms': b'40000', - 'type': b'tree' + 'perms': GitPerm.dir, + 'type': GitType.dir }) ls_hashes.update({ 'root': [root_hash] }) return ls_hashes def write_file(file, content): """Write some content in a file. """ with open(file, 'w') as f: f.write(content) # prepare some arborescence with dirs and files to walk it # scratch_folder_root = tempfile.mktemp(prefix='swh.loader.dir', suffix='.tmp', dir='/tmp') scratch_folder_root = '/home/tony/tmp' # scratch_folder_foo = os.path.join(scratch_folder_root, 'foo') # os.makedirs(scratch_folder_foo, exist_ok=True) # scratch_folder_bar = os.path.join(scratch_folder_root, 'bar/barfoo') # os.makedirs(scratch_folder_bar, exist_ok=True) # scratch_file = os.path.join(scratch_folder_foo, 'quotes.md') # write_file(scratch_file, # 'Shoot for the moon. Even if you miss, you\'ll land among the stars.') # scratch_file2 = os.path.join(scratch_folder_bar, 'another-quote.org') # write_file(scratch_file2, # 'A Victory without danger is a triumph without glory.\n-- Pierre Corneille') def git_ls_tree_rec(hashes): """Display the computed result for debug purposes. """ for entry in hashes.keys(): entry_properties = hashes[entry] print("entry name: %s" % entry) for file in entry_properties: sha1 = hashutil.hash_to_hex(file['sha1_git']) - print("%s %s %s\t%s" % (file['perms'].decode('utf-8'), - file['type'].decode('utf-8'), + print("%s %s %s\t%s" % (file['perms'].value.decode('utf-8'), + file['type'].value.decode('utf-8'), sha1, file['name'].decode('utf-8'))) print() hashes = walk_and_compute_sha1_from_directory(scratch_folder_root) git_ls_tree_rec(hashes) # clean up # shutil.rmtree(scratch_folder_root, ignore_errors = True) diff --git a/swh/loader/dir/git.py b/swh/loader/dir/git.py index 8421c4a..4af915e 100644 --- a/swh/loader/dir/git.py +++ b/swh/loader/dir/git.py @@ -1,72 +1,90 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO +from enum import Enum from swh.core import hashutil + +class GitType(Enum): + file = b'blob' + dir = b'tree' + exec = b'exec' + link = b'link' + gitlink = b'gitlink' + + +class GitPerm(Enum): + file = b'100644' + dir = b'40000' + exec = b'100755' + link = b'120000' + gitlink = b'160000' + + def _new_hash(header_type, length): """Initialize a digest object (as returned by python's hashlib) for the git sha1 algorithm. This is in charge of pre-computing the needed header for git. Args: header_type: a git sha1 type ('blob', 'tree', 'commit', 'tag') length: Length of content to hash. Could be None if when hashing with sha1 and sha256 Returns: A digest object Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = hashlib.new('sha1') if header_type not in ('blob', 'commit', 'tree', 'tag'): raise ValueError( 'Only supported types are blob, commit, tree, tag') h.update(('%s %d\0' % (header_type, length)).encode('ascii')) return h def _hash_file_obj(f, header_type, length): """hash (git sha1) the content of a file-like object f with header_type and length. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = _new_hash(header_type, length) while True: chunk = f.read(hashutil.HASH_BLOCK_SIZE) if not chunk: break h.update(chunk) return {'sha1_git': h.digest()} def hashdata(data, header_type): """Hash data as git sha1 with header_type. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ buf = BytesIO(data) return _hash_file_obj(buf, header_type, len(data))