# NOTE: this text was reconstructed from a whitespace-collapsed patch that
# touches two files.  Each section below holds the full content of one of
# them; the original patch headers are preserved as comments.

# ---------------------------------------------------------------------------
# diff --git a/scratch/walking.py b/scratch/walking.py
# index 30eb9eb..e8f36a2 100755
# --- a/scratch/walking.py
# +++ b/scratch/walking.py
# @@ -1,92 +1,91 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python3

import os
import shutil
import tempfile

from swh.loader.dir.git import git, utils


def write_file(root, file, content):
    """Write some content in a file.

    Args:
        root: directory in which the file is written.
        file: name of the file, relative to root.
        content: text written to the file (opened in text mode).

    """
    filename = os.path.join(root, file)
    with open(filename, 'w') as f:
        f.write(content)


def mkdir(root, name):
    """Create a directory path on disk.

    Args:
        root: parent directory.
        name: path, relative to root, of the directory to create
            (intermediate directories are created as needed).

    Returns:
        The full path of the created directory.

    """
    full_foldername = os.path.join(root, name)
    os.makedirs(full_foldername, exist_ok=True)
    return full_foldername


def git_ls_tree_rec(hashes, info):
    """Display the computed result for debug purposes.

    Args:
        hashes: mapping of directory path to its list of entries, as returned
            by git.walk_and_compute_sha1_from_directory.
        info: revision information passed to git.compute_revision_git_sha1.

    """
    for entry, entry_properties in hashes.items():
        print("entry name: %s" % entry)
        for props in entry_properties:
            sha1 = utils.hash_to_hex(props['sha1_git'])
            # Names are raw filesystem bytes; 'replace' keeps this debug
            # output from failing on names that are not valid UTF-8.
            print("%s %s %s\t%s" % (props['perms'].value.decode('utf-8'),
                                    props['type'].value.decode('utf-8'),
                                    sha1,
                                    props['name'].decode('utf-8', 'replace')))
        print()

    revision = git.compute_revision_git_sha1(hashes, info)
    print('revision %s -> directory %s' % (
        utils.hash_to_hex(revision['sha1_git']),
        utils.hash_to_hex(hashes[''][0]['sha1_git'])
    ))


ADDITIONAL_INFO = {
    'revision_author_name': 'swh author',
    'revision_author_email': 'swh@inria.fr',
    'revision_author_date': '1444054085',
    'revision_author_offset': '+0200',
    'revision_committer_name': 'swh committer',
    'revision_committer_email': 'swh@inria.fr',
    'revision_committer_date': '1444054085',
    'revision_committer_offset': '+0200',
    'revision_type': 'dir',
    'revision_message': 'synthetic revision message'
}

### setup - prepare some arborescence with dirs and files to walk it

# mkdtemp creates the directory itself; the deprecated mktemp only returned a
# name, leaving a window for another process to claim it.  The name is random
# on every run.
tempfilename = tempfile.mkdtemp(prefix='swh.loader.dir',
                                suffix='.tmp',
                                dir='/tmp')

try:
    scratch_folder_root = mkdir(tempfilename, 'tmp')

    mkdir(scratch_folder_root, 'empty-folder')
    scratch_folder_foo = mkdir(scratch_folder_root, 'foo')
    scratch_folder_bar = mkdir(scratch_folder_root, 'bar/barfoo')

    write_file(scratch_folder_foo,
               'quotes.md',
               'Shoot for the moon. Even if you miss, you\'ll land among '
               'the stars.')

    write_file(scratch_folder_bar,
               'another-quote.org',
               'A Victory without danger is a triumph without glory.\n'
               '-- Pierre Corneille')

    # when
    hashes = git.walk_and_compute_sha1_from_directory(scratch_folder_root)

    # then
    git_ls_tree_rec(hashes, ADDITIONAL_INFO)
finally:
    ### teardown - always remove the scratch tree, even when hashing failed
    shutil.rmtree(tempfilename, ignore_errors=True)


# ---------------------------------------------------------------------------
# diff --git a/swh/loader/dir/git/git.py b/swh/loader/dir/git/git.py
# index dcbfdef..f47ec87 100644
# --- a/swh/loader/dir/git/git.py
# +++ b/swh/loader/dir/git/git.py
# @@ -1,264 +1,263 @@
# ---------------------------------------------------------------------------
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os

from enum import Enum

from swh.loader.dir.git import utils


class GitType(Enum):
    """Type label attached to each computed entry ('type' key)."""
    file = b'blob'
    dir = b'tree'
    exec = b'exec'
    link = b'link'
    gitlink = b'gitlink'


class GitPerm(Enum):
    """Mode written in front of each tree entry ('perms' key)."""
    file = b'100644'
    dir = b'40000'
    exec = b'100755'
    link = b'120000'
    gitlink = b'160000'


def compute_symlink_git_sha1(linkpath):
    """Compute git sha1 for a link.

    Args:
        linkpath: the absolute path name to a symbolic link.

    Returns:
        dictionary with sha1_git as key and the actual binary sha1 as value.

    """
    dest_path = os.readlink(linkpath)
    # os.fsencode gives back the target's on-disk bytes, including targets
    # that are not valid UTF-8 (a strict .encode('utf-8') raises on those).
    return utils.hashdata(os.fsencode(dest_path), 'blob')


def _tree_entry_sort_key(entry):
    """Return the key reproducing git's ordering of tree entries.

    Git compares entry names byte-wise, with directory names compared as if
    they ended with '/'.

    """
    name = entry['name']
    if entry['perms'] == GitPerm.dir:
        return name + b'/'
    return name


def compute_directory_git_sha1(dirpath, hashes):
    """Compute a directory git sha1 for a dirpath.

    Args:
        dirpath: the directory's absolute path
        hashes: dictionary mapping a directory path to its list of tree
        entries, each with keys:
            - sha1_git: the tree entry's sha1
            - name: file or subdir's name
            - perms: the tree entry's sha1 permissions

    Returns:
        dictionary with sha1_git as key and the actual binary sha1 as value.

    Raises:
        KeyError if dirpath is not a key of hashes.

    Assumes:
        Every path exists in hashes.

    """
    ordered_entries = sorted(hashes[dirpath], key=_tree_entry_sort_key)

    # one row per entry: b'<perms> <name>\0<binary sha1>'
    rows = [b''.join([entry['perms'].value,
                      b' ',
                      entry['name'],
                      b'\0',
                      entry['sha1_git']])
            for entry in ordered_entries]

    return utils.hashdata(b''.join(rows), 'tree')


def compute_revision_git_sha1(hashes, info):
    """Compute a revision's hash.

    Use the entry's sha1_git as tree representation.

    Args:
        hashes: dictionary of entries; the root tree is read from
        hashes[''][0]['sha1_git'].
        info: Additional dictionary information needed to compute a synthetic
        revision. Following keys are expected:
            - revision_author_name
            - revision_author_email
            - revision_author_date
            - revision_author_offset
            - revision_committer_name
            - revision_committer_email
            - revision_committer_date
            - revision_committer_offset
            - revision_message

    Returns:
        The result of utils.hashdata on the revision content, hashed as a
        'commit' object.

    """
    tree_hash = utils.hash_to_hex(hashes[''][0]['sha1_git'])

    # NOTE(review): the line breaks of this template were lost in the
    # collapsed source; they are restored following git's commit object
    # layout (headers, blank line, message) - confirm against upstream.
    revision_template = ('tree %s\n'
                         'author %s <%s> %s %s\n'
                         'committer %s <%s> %s %s\n'
                         '\n'
                         '%s\n')

    revision_content = (revision_template
                        % (tree_hash,
                           info['revision_author_name'],
                           info['revision_author_email'],
                           info['revision_author_date'],
                           info['revision_author_offset'],
                           info['revision_committer_name'],
                           info['revision_committer_email'],
                           info['revision_committer_date'],
                           info['revision_committer_offset'],
                           info['revision_message'])).encode('utf-8')
    return utils.hashdata(revision_content, 'commit')


def compute_link_metadata(linkpath):
    """Given a linkpath, compute the git metadata.

    Args:
        linkpath: absolute pathname of the link

    Returns:
        Dictionary of values:
            - name: basename of the link
            - perms: git permission for link
            - type: git type for link

    """
    m_hashes = compute_symlink_git_sha1(linkpath)
    m_hashes.update({
        'name': os.fsencode(os.path.basename(linkpath)),
        'perms': GitPerm.link,
        # the link target is hashed as a blob, hence the file (blob) type
        'type': GitType.file,
    })
    return m_hashes


def compute_blob_metadata(filepath):
    """Given a filepath, compute the git metadata.

    Args:
        filepath: absolute pathname of the file.

    Returns:
        Dictionary of values:
            - name: basename of the file
            - perms: git permission for file
            - type: git type for file

    """
    m_hashes = utils.hashfile(filepath)
    is_executable = os.access(filepath, os.X_OK)
    m_hashes.update({
        'name': os.fsencode(os.path.basename(filepath)),
        'perms': GitPerm.exec if is_executable else GitPerm.file,
        'type': GitType.file,
    })
    return m_hashes


def compute_tree_metadata(dirname, ls_hashes):
    """Given a dirname, compute the git metadata.

    Args:
        dirname: absolute pathname of the directory.
        ls_hashes: dictionary mapping directory paths to their list of
        entries; must already contain dirname.

    Returns:
        Dictionary of values:
            - name: basename of the directory
            - perms: git permission for directory
            - type: git type for directory

    """
    tree_hash = compute_directory_git_sha1(dirname, ls_hashes)
    tree_hash.update({
        'name': os.fsencode(os.path.basename(dirname)),
        'perms': GitPerm.dir,
        'type': GitType.dir
    })
    return tree_hash


def walk_and_compute_sha1_from_directory(rootdir):
    """Compute git sha1 from directory rootdir.

    Empty directories are skipped, as are directories whose only content is
    (recursively) empty directories.  Symbolic links, whether they point to
    files or directories, are hashed as links and never followed.

    Args:
        rootdir: path of the directory to walk.

    Returns:
        Dictionary of entries with keys <path-name> and as values a list of
        directory entries.
        Those are list of dictionary with keys:
          - 'perms'
          - 'type'
          - 'name'
          - 'sha1_git'
          - and specifically content: 'sha1', 'sha256', ...
          (may be extended...)

    Note:
        One special key is '' to indicate the upper root of the directory.
        (This is typically the entry point of the revision).
        A root directory without any entry is hashed as an empty tree.

    Raises:
        Nothing
        If something is raised, this is a programmatic error.

    """
    ls_hashes = {}
    empty_dirs = set()
    link_dirs = set()

    # Bottom-up walk: every subdirectory is hashed before its parent.
    for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False):
        entries = []

        # Symlinks first: os.walk lists links to directories in dirnames
        # (without descending into them) and links to files in filenames.
        for name in filenames + dirnames:
            path = os.path.join(dirpath, name)
            if os.path.islink(path):
                link_dirs.add(path)
                entries.append(compute_link_metadata(path))

        # Regular files.
        for name in filenames:
            path = os.path.join(dirpath, name)
            if path not in link_dirs:
                entries.append(compute_blob_metadata(path))

        # Real, non-empty subdirectories (already present in ls_hashes).
        skipped_dirs = empty_dirs | link_dirs
        for name in dirnames:
            path = os.path.join(dirpath, name)
            if path not in skipped_dirs:
                entries.append(compute_tree_metadata(path, ls_hashes))

        if not entries:
            # Nothing to record: this also covers directories holding only
            # empty directories, so the parent skips them too.
            empty_dirs.add(dirpath)
            continue

        ls_hashes[dirpath] = entries

    # An empty root is skipped by the loop above; register it so that the
    # root hash below is the one of an empty tree instead of a KeyError.
    ls_hashes.setdefault(rootdir, [])

    # compute the current directory hashes
    root_hash = compute_directory_git_sha1(rootdir, ls_hashes)
    root_hash.update({
        'name': os.fsencode(rootdir),
        'perms': GitPerm.dir,
        'type': GitType.dir
    })
    ls_hashes[''] = [root_hash]

    return ls_hashes