#!/usr/bin/env python3

# NOT FOR PRODUCTION
# - use swh storage api
# - does not deal with missing contents yet so the tarball could be incomplete

# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import argparse
import collections
import itertools
import os
import shutil
import sys
import tempfile

from swh.core import hashutil, config
from swh.loader.tar import tarball, utils


def escape_hash(sha1):
    """Escape a sha1 into a '\\x'-prefixed hexadecimal string.

    Args:
        sha1: the checksum, either as bytes (converted to hexadecimal with
            hashutil.hash_to_hex) or as an already hexadecimal string.

    Returns:
        The hexadecimal sha1 prefixed with a literal backslash and 'x'.

    """
    if isinstance(sha1, bytes):
        sha1 = hashutil.hash_to_hex(sha1)
    return '\\x%s' % sha1


def get_revision(revision_id):
    """Return the first revision the storage yields for revision_id.

    When the revision carries metadata, the name and archive type of its
    first original artifact are copied to the 'archive_name' and
    'archive_type' keys of the returned revision.

    Relies on the module-level `storage` set up by the script entry point.

    Args:
        revision_id: identifier passed as is to storage.revision_get.

    Returns:
        The revision as returned by the storage (possibly enriched with the
        'archive_name' and 'archive_type' keys), or None when the storage
        yields nothing usable.

    """
    for revision in storage.revision_get([revision_id]):
        if not revision:
            return None
        # The 'metadata' key may be absent or empty: only enrich the revision
        # when there is something to read from.
        metadata = revision.get('metadata')
        if metadata:
            artifact = metadata['original_artifact'][0]
            revision['archive_name'] = artifact['name']
            revision['archive_type'] = artifact['archive_type']
        # Only one id was requested, so only the first result matters.
        return revision
    return None


CONTENTS_BLOCK_SIZE = 10000


def _directory_ls_contents(ls_contents):
    """Yield the visible content entries with their raw data attached.

    Entries are processed in blocks of CONTENTS_BLOCK_SIZE so the storage is
    queried once per block instead of once per entry.

    Args:
        ls_contents: iterable of non-directory entries (dicts with at least
            the 'status' and 'sha1' keys).

    Yields:
        Each entry whose status is 'visible', with a new 'data' key holding
        the data returned by storage.content_get.

    """
    # Split in iterable blocks of size CONTENTS_BLOCK_SIZE
    # filled with empty values if need for the last block
    blocks_contents = utils.grouper(ls_contents, CONTENTS_BLOCK_SIZE,
                                    fillvalue=None)
    for block_contents in blocks_contents:
        content_by_sha = collections.defaultdict(list)

        # iter over contents (beware the last block can contain empty ones)
        for content in block_contents:
            if not content or content['status'] != 'visible':
                continue
            content_by_sha[content['sha1']].append(content)

        if not content_by_sha:
            # Nothing visible in this block, no need to query the storage.
            continue

        # Request each sha1 only once: several entries can share the same
        # content and would otherwise be yielded once per duplicate request.
        for c in storage.content_get(list(content_by_sha)):
            for content in content_by_sha[c['sha1']]:
                content['data'] = c['data']
                yield content


def directory_ls_with_content(directory_id, recursive=True):
    """List directories with their data when content targeted is a file.

    Args:
        directory_id: identifier passed as is to storage.directory_get.
        recursive: passed as is to storage.directory_get.

    Yields:
        First every entry whose 'type' is 'dir', then every other entry
        enriched with its 'data' (see _directory_ls_contents).

    """
    # The listing is walked twice (directories first, then contents), hence
    # the tee.
    ls_dirs, ls_contents = itertools.tee(
        storage.directory_get(directory_id, recursive=recursive))

    yield from itertools.chain(
        (e for e in ls_dirs if e['type'] == 'dir'),
        _directory_ls_contents((e for e in ls_contents
                                if e['type'] != 'dir')))


def build_archive_from_revision(revision_id, archive_type=None,
                                directory_dest='.'):
    """Rebuild the archive a revision originates from.

    The directory the revision points to is materialized in a temporary
    directory, then compressed with tarball.compress.

    Args:
        revision_id: identifier of the revision to build the archive from.
        archive_type: type of archive to create; defaults to the archive type
            recorded in the revision's original artifact metadata.
        directory_dest: directory in which the archive is created.

    Raises:
        ValueError: if the revision is unknown to the storage or carries no
            original artifact metadata to name the archive after.

    """
    def mkdir(path):
        os.makedirs(path, exist_ok=True)
        os.chmod(path, 0o755)

    def build_tree():
        """Write the revision's tree to disk, yielding (path, name) pairs."""
        # build fs structure
        tmpdir = tempfile.mkdtemp(suffix='create-tarball',
                                  prefix='swh.loader.tar-',
                                  dir='/tmp')
        try:
            for entry in directory_ls_with_content(directory_id,
                                                   recursive=True):
                name = entry['name'].decode('utf-8')
                perms = entry['perms']
                path = os.path.join(tmpdir, name)

                # NOTE(review): perms is compared to the decimal integers
                # 40000 and 100644, not to the octal modes 0o40000 and
                # 0o100644 -- confirm this matches what the storage returns.
                # Any other value (e.g. an executable file) is handled as a
                # symlink.
                if perms == 40000:  # dir
                    mkdir(path)
                else:
                    mkdir(os.path.dirname(path))

                    if perms == 100644:  # file
                        with open(path, 'wb') as f:
                            f.write(entry['data'])
                        os.chmod(path, 0o644)
                    else:  # symlink
                        # os.symlink(src, dst) creates dst pointing to src:
                        # the link lives at `path`, its target is the data.
                        linkdest = entry['data']
                        os.symlink(linkdest, path)

                yield path, name
        finally:
            # clean up tmp directory, even when the consumer stops early or
            # fails while compressing
            shutil.rmtree(tmpdir)

    revision = get_revision(revision_id)
    if revision is None:
        raise ValueError('Revision %s not found in storage.'
                         % hashutil.hash_to_hex(revision_id))
    if 'archive_name' not in revision:
        raise ValueError('Revision %s has no original artifact metadata.'
                         % hashutil.hash_to_hex(revision_id))

    # Read by build_tree through its closure once the generator is consumed.
    directory_id = revision['directory']
    tarpath = os.path.join(directory_dest, revision['archive_name'])
    archive_type = archive_type or revision['archive_type']

    files = build_tree()
    # build archive from the tree
    tarball.compress(tarpath, archive_type, files)


def parse_args():
    """Parse the configuration from the cli.

    Returns:
        The argparse namespace with the config_file, type_archive, directory
        and revision attributes (None when the option is not provided).

    """
    cli = argparse.ArgumentParser(
        description='Tarball creation from swh-storage.')
    cli.add_argument('--config-file', '-c', help='configuration file')
    cli.add_argument('--type-archive', '-t',
                     help='archive type (zip or tar)')
    cli.add_argument('--directory', '-d',
                     help='destination directory of the archive '
                          '(defaults to the current directory)')
    cli.add_argument('--revision', '-r', help='revision checksum')

    args = cli.parse_args()

    return args


def check_args(args):
    """Check cli args and returns the error msg.

    Args:
        args: the namespace returned by parse_args.

    Returns:
        List of error messages as string if some.

    """
    errorMsgs = []
    if not args.config_file:
        errorMsgs.append('\n- Configuration file option.')

    if not args.revision:
        errorMsgs.append('\n- Revision checksum')

    return errorMsgs


if __name__ == '__main__':
    args = parse_args()
    errorMsgs = check_args(args)
    if errorMsgs:
        print('Some mandatory options are missing: %s' % ''.join(errorMsgs))
        sys.exit(1)

    conf = config.read(args.config_file)
    type_archive = args.type_archive or None
    directory_dest = args.directory or '.'
    revision_hex = args.revision

    if conf['storage_class'] == 'remote_storage':
        from swh.storage.api.client import RemoteStorage as Storage
    else:
        from swh.storage import Storage

    # Module-level on purpose: the functions above read `storage` as a global.
    storage = Storage(conf['storage_args'])

    revision_id = hashutil.hex_to_hash(revision_hex)
    build_archive_from_revision(revision_id, type_archive, directory_dest)