#!/usr/bin/env python3

# NOT FOR PRODUCTION
# - use swh storage api
# - does not deal with missing contents yet so the tarball could be incomplete

# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOTE: this text was recovered from a diff that deletes
# bin/swh-loader-tar-retrieve-tarball (file mode 100755).

import argparse
import collections
import itertools
import os
import shutil
import sys
import tempfile

from swh.core import hashutil, config
from swh.loader.tar import tarball, utils


def escape_hash(sha1):
    """Escape an hexa sha1 to a ready queryable sha1.

    Args:
        sha1: the checksum, either as bytes (converted to hexadecimal with
            ``hashutil.hash_to_hex``) or as an already hexadecimal value.

    Returns:
        The string made of a backslash, ``x`` and the hexadecimal checksum.

    """
    if isinstance(sha1, bytes):
        sha1 = hashutil.hash_to_hex(sha1)
    return '\\x%s' % sha1


def get_revision(revision_id):
    """Fetch a revision from storage and flatten its archive metadata.

    Args:
        revision_id: identifier of the revision; it is passed to
            ``storage.revision_get`` as a single-element list.

    Returns:
        The first revision yielded by storage, or None when storage yields
        nothing (or a falsy entry). When the revision carries metadata, the
        ``name`` and ``archive_type`` of its first ``original_artifact`` are
        copied to the top-level keys ``archive_name`` and ``archive_type``.

    """
    # `storage` is the module-level client created in the __main__ block.
    revision = next(iter(storage.revision_get([revision_id])), None)
    if not revision:
        return None

    meta = revision['metadata'] if 'metadata' in revision else None
    if meta:
        artifact = meta['original_artifact'][0]
        revision['archive_name'] = artifact['name']
        revision['archive_type'] = artifact['archive_type']

    return revision


CONTENTS_BLOCK_SIZE = 10000


def _directory_ls_contents(ls_contents):
    """Yield the visible content entries, each completed with its raw data.

    Entries are processed in blocks of CONTENTS_BLOCK_SIZE so that storage is
    queried once per block rather than once per entry.

    Args:
        ls_contents: iterable of non-directory entries (dicts with at least
            the ``status`` and ``sha1`` keys).

    Yields:
        The same entry dicts, with a ``data`` key added. Entries whose status
        is not ``visible`` are dropped.

    """
    # Split in iterable blocks of size CONTENTS_BLOCK_SIZE
    # filled with empty values if need for the last block
    blocks_contents = utils.grouper(ls_contents, CONTENTS_BLOCK_SIZE,
                                    fillvalue=None)

    for block_contents in blocks_contents:
        full_contents = []
        # Several entries may share one sha1 (same content, different paths).
        content_by_sha = collections.defaultdict(list)

        # iter over contents (beware the last block can contain empty ones)
        for content in block_contents:
            if not content or content['status'] != 'visible':
                continue

            full_contents.append(content['sha1'])
            content_by_sha[content['sha1']].append(content)

        if not full_contents:
            # Nothing visible in this block: no need to query storage.
            continue

        for c in storage.content_get(full_contents):
            for content in content_by_sha[c['sha1']]:
                content['data'] = c['data']
                yield content


def directory_ls_with_content(directory_id, recursive=True):
    """List directories with their data when content targeted is a file.

    Args:
        directory_id: identifier of the directory to list.
        recursive: forwarded to ``storage.directory_get``.

    Yields:
        First every entry whose ``type`` is ``dir``, then every visible
        non-directory entry completed with its ``data``.

    """
    # The listing is walked twice (directories, then contents), hence the tee:
    # it buffers the entries consumed by the first pass for the second one.
    ls_dirs, ls_contents = itertools.tee(
        storage.directory_get(directory_id, recursive=recursive))

    yield from itertools.chain(
        (e for e in ls_dirs if e['type'] == 'dir'),
        _directory_ls_contents((e for e in ls_contents if e['type'] != 'dir')))


# Entry permissions, as compared against entry['perms'].
# NOTE(review): the original script only compared against the decimal literals
# (40000, 100644). Git modes are octal (0o40000, 0o100644, 0o100755) and the
# representation returned by storage is not visible from this file, so both
# spellings are accepted -- confirm against storage and drop the unused one.
_DIR_PERMS = (40000, 0o40000)
_FILE_PERMS = (100644, 0o100644)
_EXEC_PERMS = (100755, 0o100755)


def _mkdir(path):
    """Create `path` (and its missing parents) and make it world-readable."""
    os.makedirs(path, exist_ok=True)
    os.chmod(path, 0o755)


def _build_tree(directory_id):
    """Materialize a directory from storage in a temporary folder.

    Args:
        directory_id: identifier of the root directory to materialize.

    Yields:
        (path, name) for every entry, where `path` is the location on disk
        and `name` the entry name decoded from utf-8.

    The temporary folder is removed when the generator is exhausted, closed
    or interrupted by an error.

    """
    tmpdir = tempfile.mkdtemp(suffix='create-tarball',
                              prefix='swh.loader.tar-',
                              dir='/tmp')
    try:
        for entry in directory_ls_with_content(directory_id, recursive=True):
            name = entry['name'].decode('utf-8')
            perms = entry['perms']
            path = os.path.join(tmpdir, name)

            if entry['type'] == 'dir' or perms in _DIR_PERMS:
                _mkdir(path)
            else:
                # Contents may be listed before their parent directory
                # has been created on disk.
                _mkdir(os.path.dirname(path))

                if perms in _FILE_PERMS or perms in _EXEC_PERMS:
                    with open(path, 'wb') as f:
                        f.write(entry['data'])

                    os.chmod(path, 0o755 if perms in _EXEC_PERMS else 0o644)
                else:  # symlink
                    # os.symlink(src, dst): the link is created at `path`
                    # and points to the target stored as the content data.
                    os.symlink(entry['data'], path)

            yield path, name
    finally:
        # clean up tmp directory
        shutil.rmtree(tmpdir)


def build_archive_from_revision(revision_id,
                                archive_type=None,
                                directory_dest='.'):
    """Rebuild the archive a revision was created from.

    Args:
        revision_id: identifier of the revision to export.
        archive_type: type of archive to build; defaults to the
            ``archive_type`` recorded with the revision.
        directory_dest: directory in which the archive is written.

    Raises:
        ValueError: if the revision is not found in storage or carries no
            archive name.

    """
    revision = get_revision(revision_id)
    if revision is None:
        raise ValueError('Revision %r not found in storage.' % (revision_id,))
    if 'archive_name' not in revision:
        raise ValueError(
            'Revision %r has no original artifact metadata.' % (revision_id,))

    directory_id = revision['directory']
    tarpath = os.path.join(directory_dest, revision['archive_name'])
    archive_type = archive_type or revision['archive_type']

    files = _build_tree(directory_id)
    try:
        # build archive from the tree
        tarball.compress(tarpath, archive_type, files)
    finally:
        # Closing the generator runs its cleanup even when the compression
        # stopped before consuming every entry.
        files.close()


def parse_args():
    """Parse the configuration from the cli.

    Returns:
        The argparse namespace with the attributes ``config_file``,
        ``type_archive``, ``directory`` and ``revision``.

    """
    cli = argparse.ArgumentParser(
        description='Tarball creation from swh-storage.')
    cli.add_argument('--config-file', '-c', help='configuration file')
    cli.add_argument('--type-archive', '-t',
                     help='archive type (zip or tar)')
    cli.add_argument('--directory', '-d',
                     help='destination directory of the archive '
                          '(default: current directory)')
    cli.add_argument('--revision', '-r',
                     help='revision checksum')

    return cli.parse_args()


def check_args(args):
    """Check cli args and returns the error msg.

    Args:
        args: namespace returned by `parse_args`.

    Returns:
        List of error messages as string if some.

    """
    errors = []
    if not args.config_file:
        errors.append('\n- Configuration file option.')

    if not args.revision:
        errors.append('\n- Revision checksum')

    return errors


if __name__ == '__main__':
    args = parse_args()

    error_msgs = check_args(args)
    if error_msgs:
        print('Some mandatory options are missing: %s' % ''.join(error_msgs))
        sys.exit(1)

    conf = config.read(args.config_file)
    type_archive = args.type_archive or None
    directory_dest = args.directory or '.'
    revision_hex = args.revision

    if conf['storage_class'] == 'remote_storage':
        from swh.storage.api.client import RemoteStorage as Storage
    else:
        from swh.storage import Storage

    # Module-level on purpose: the functions above read this global.
    storage = Storage(conf['storage_args'])

    revision_id = hashutil.hex_to_hash(revision_hex)
    build_archive_from_revision(revision_id, type_archive, directory_dest)