#!/usr/bin/env python3

# NOT FOR PRODUCTION (does not use the stable swh storage api yet)

# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import argparse
import itertools
import os
import shutil
import sys
import tempfile

from swh.core import hashutil, config
from swh.loader.tar import tarball


def escape_hash(sha1):
    """Escape an hexa sha1 to a ready queryable sha1.

    Args:
        sha1: the hash, either as bytes (converted with
            hashutil.hash_to_hex first) or already as an hex string.

    Returns:
        The hex string prefixed with a literal backslash-x.

    """
    if isinstance(sha1, bytes):
        sha1 = hashutil.hash_to_hex(sha1)
    return '\\x%s' % sha1


def get_revision(revision_id):
    """Return the first revision the storage yields for revision_id.

    When the revision carries a 'metadata' key, the name and archive type
    of its first 'original_artifact' are copied onto the revision as
    'archive_name' and 'archive_type'.

    Relies on the module-level `storage` set up in the `__main__` section.

    Args:
        revision_id: identifier passed as-is to storage.revision_get.

    Returns:
        The (possibly augmented) revision dict, or None when the storage
        yields nothing.

    """
    for revision in storage.revision_get([revision_id]):
        if 'metadata' in revision:
            meta = revision['metadata']
            artifact = meta['original_artifact'][0]
            revision['archive_name'] = artifact['name']
            revision['archive_type'] = artifact['archive_type']
        return revision


def _directory_ls_dir(ls_dirs):
    """Yield the entries of type 'dir', with their name decoded to str."""
    for entry in (e for e in ls_dirs if e['type'] == 'dir'):
        entry['name'] = entry['name'].decode('utf-8')
        yield entry


def _directory_ls_contents(ls_contents):
    """Yield the non-'dir' entries with decoded name and fetched 'data'."""
    for entry in (e for e in ls_contents if e['type'] != 'dir'):
        entry['name'] = entry['name'].decode('utf-8')
        sha1 = entry['sha1']
        status = entry['status']
        # HACK: heavy, 1 post per content, need to batch
        contents_data = list(storage.content_get(
            [{'sha1': sha1, 'status': status}]))
        entry['data'] = contents_data[0]['data']
        yield entry


def directory_ls_with_content(directory_id, recursive=True):
    """List directories with their data when content targeted is a file.

    All directory entries are yielded first, then all the other entries
    (each one augmented with its 'data').

    Args:
        directory_id: identifier passed as-is to storage.directory_get.
        recursive: forwarded to storage.directory_get.

    Yields:
        The entry dicts, with 'name' decoded from utf-8.

    """
    all_contents = storage.directory_get(directory_id, recursive=recursive)
    # Two passes over the same listing: tee buffers the entries consumed by
    # the first pass so the second one can replay them.
    ls_dirs, ls_contents = itertools.tee(all_contents)

    yield from itertools.chain(_directory_ls_dir(ls_dirs),
                               _directory_ls_contents(ls_contents))


def build_archive_from_revision(revision_id, archive_type=None,
                                directory_dest='.'):
    """Rebuild on disk the tree a revision points to and archive it.

    The tree is materialized in a temporary directory, compressed with
    tarball.compress, and the temporary directory is removed afterwards,
    even when an error occurs.

    Args:
        revision_id: identifier of the revision to rebuild.
        archive_type: archive type to produce; defaults to the revision's
            own 'archive_type'.
        directory_dest: directory in which the archive is written.

    """
    def mkdir(path):
        os.makedirs(path, exist_ok=True)
        os.chmod(path, 0o755)

    revision = get_revision(revision_id)
    directory_id = revision['directory']
    tarpath = os.path.join(directory_dest, revision['archive_name'])
    archive_type = archive_type or revision['archive_type']

    print("Compressing archive as '%s' with type %s" % (tarpath, archive_type))

    # build fs structure
    tmpdir = tempfile.mkdtemp(suffix='create-tarball',
                              prefix='swh.loader.tar',
                              dir='/tmp')
    try:
        for entry in directory_ls_with_content(directory_id, recursive=True):
            name = entry['name']
            perms = entry['perms']
            path = os.path.join(tmpdir, name)
            # NOTE(review): perms are compared against the decimal values
            # 40000 / 100644 and every other mode (executables included)
            # is handled as a symlink -- confirm against how the storage
            # encodes perms.
            if perms == 40000:  # dir
                mkdir(path)
            else:
                dirpath = os.path.dirname(path)
                mkdir(dirpath)

                if perms == 100644:  # file
                    file_content = entry['data']
                    with open(path, 'wb') as f:
                        f.write(file_content)

                    os.chmod(path, 0o644)
                else:  # symlink
                    # The content data holds the link target; the link
                    # itself lives at `path`. os.symlink takes
                    # (target, link name). fsdecode keeps both arguments
                    # of the same type.
                    linkdest = os.fsdecode(entry['data'])
                    os.symlink(linkdest, path)

        # build archive from the tree
        tarball.compress(tarpath, tmpdir, archive_type)
    finally:
        # clean up tmp directory
        shutil.rmtree(tmpdir)


def parse_args():
    """Parse the configuration from the cli.

    Returns:
        The argparse namespace with config_file, type_archive, directory
        and revision attributes.

    """
    cli = argparse.ArgumentParser(
        description='Tarball creation from swh-storage.')
    cli.add_argument('--config-file', '-c', help='configuration file')
    cli.add_argument('--type-archive', '-t',
                     help='archive type (zip or tar)')
    cli.add_argument('--directory', '-d',
                     help='destination directory for the archive '
                          '(default: current directory)')
    cli.add_argument('--revision', '-r', help='revision checksum')

    args = cli.parse_args()

    return args


def check_args(args):
    """Check cli args and returns the error msg.

    Returns:
        List of error messages as string if some.

    """
    errorMsgs = []

    if not args.config_file:
        errorMsgs.append('\n- Configuration file option.')

    if not args.revision:
        errorMsgs.append('\n- Revision checksum')

    return errorMsgs


if __name__ == '__main__':
    args = parse_args()

    errorMsgs = check_args(args)
    if errorMsgs:
        print('Some mandatory options are missing: %s' % ''.join(errorMsgs))
        sys.exit(1)

    conf = config.read(args.config_file)
    type_archive = args.type_archive or None
    directory_dest = args.directory or '.'
    revision_hex = args.revision

    if conf['storage_class'] == 'remote_storage':
        from swh.storage.api.client import RemoteStorage as Storage
    else:
        from swh.storage import Storage

    # Module-level global read by get_revision and the directory listing
    # helpers.
    storage = Storage(conf['storage_args'])

    revision_id = hashutil.hex_to_hash(revision_hex)
    build_archive_from_revision(revision_id, type_archive, directory_dest)