diff --git a/bin/swh-loader-tar-retrieve-tarball b/bin/swh-loader-tar-retrieve-tarball index 40a5d43..34de7a5 100755 --- a/bin/swh-loader-tar-retrieve-tarball +++ b/bin/swh-loader-tar-retrieve-tarball @@ -1,178 +1,186 @@ #!/usr/bin/env python3 # NOT FOR PRODUCTION (does not use the stable swh storage api yet) # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse +import collections import itertools import os import shutil import sys import tempfile from swh.core import hashutil, config -from swh.loader.tar import tarball +from swh.loader.tar import tarball, utils def escape_hash(sha1): """Escape an hexa sha1 to a ready queryable sha1.""" if isinstance(sha1, bytes): sha1 = hashutil.hash_to_hex(sha1) return '\\x%s' % sha1 def get_revision(revision_id): """Return the directory sha1 the revision with id revision_id points to. """ for revision in storage.revision_get([revision_id]): if 'metadata' in revision: meta = revision['metadata'] artifact = meta['original_artifact'][0] revision['archive_name'] = artifact['name'] revision['archive_type'] = artifact['archive_type'] return revision -def _directory_ls_dir(ls_dirs): - for entry in (e for e in ls_dirs if e['type'] == 'dir'): - entry['name'] = entry['name'].decode('utf-8') - yield entry +CONTENTS_BLOCK_SIZE = 10000 def _directory_ls_contents(ls_contents): - for entry in (e for e in ls_contents if e['type'] != 'dir'): - entry['name'] = entry['name'].decode('utf-8') - sha1 = entry['sha1'] - status = entry['status'] + # Split in iterable blocks of size CONTENTS_BLOCK_SIZE + # filled with empty values if need for the last block + blocks_contents = utils.grouper(ls_contents, CONTENTS_BLOCK_SIZE, + fillvalue=None) - # HACK: heavy, 1 post per content, need to batch - contents_data = list(storage.content_get( - [{'sha1': sha1, 'status': status}])) - entry['data'] = contents_data[0]['data'] + for block_contents in blocks_contents: + full_contents = [] + content_by_sha = collections.defaultdict(list) - yield entry + # iter over contents (beware the last block can contain empty ones) + for content in (c for c in block_contents if c): + full_contents.append({'sha1': content['sha1'], + 'status': content['status']}) + content_by_sha[content['sha1']].append(content) + + for c in storage.content_get(full_contents): + for content in content_by_sha[c['sha1']]: + content['data'] = c['data'] + yield content def directory_ls_with_content(directory_id, recursive=True): """List directories with their data when content targeted is a file. """ - all_contents = storage.directory_get(directory_id, recursive=recursive) - ls_dirs, ls_contents = itertools.tee(all_contents) + ls_dirs, ls_contents = itertools.tee( + storage.directory_get(directory_id, recursive=recursive)) - yield from itertools.chain(_directory_ls_dir(ls_dirs), - _directory_ls_contents(ls_contents)) + yield from itertools.chain( + (e for e in ls_dirs if e['type'] == 'dir'), + _directory_ls_contents((e for e in ls_contents if e['type'] != 'dir'))) def build_archive_from_revision(revision_id, archive_type=None, directory_dest='.'): def mkdir(path): os.makedirs(path, exist_ok=True) os.chmod(path, 0o755) - revision = get_revision(revision_id) - directory_id = revision['directory'] - tarpath = os.path.join(directory_dest, revision['archive_name']) - archive_type = archive_type or revision['archive_type'] + def build_tree(): + # build fs structure + tmpdir = tempfile.mkdtemp(suffix='create-tarball', + prefix='swh.loader.tar-', + dir='/tmp') - print("Compressing archive as '%s' with type %s" % (tarpath, archive_type)) + for entry in directory_ls_with_content(directory_id, recursive=True): + name = entry['name'].decode('utf-8') + perms = entry['perms'] - # build fs structure - tmpdir = tempfile.mkdtemp(suffix='create-tarball', - prefix='swh.loader.tar', - dir='/tmp') + path = os.path.join(tmpdir, name) + if perms == 40000: # dir + mkdir(path) + else: + dirpath = os.path.dirname(path) + mkdir(dirpath) - for entry in directory_ls_with_content(directory_id, recursive=True): - name = entry['name'] - perms = entry['perms'] + if perms == 100644: # file + file_content = entry['data'] + with open(path, 'wb') as f: + f.write(file_content) - path = os.path.join(tmpdir, name) + os.chmod(path, 0o644) + else: # symlink + linkdest = entry['data'] + os.symlink(path, linkdest) - if perms == 40000: # dir - mkdir(path) - else: - dirpath = os.path.dirname(path) - mkdir(dirpath) + yield path, name - if perms == 100644: # file - file_content = entry['data'] - with open(path, 'wb') as f: - f.write(file_content) + # clean up tmp directory + shutil.rmtree(tmpdir) - os.chmod(path, 0o644) - else: # symlink - linkdest = entry['data'] - os.symlink(path, linkdest) + revision = get_revision(revision_id) + directory_id = revision['directory'] + tarpath = os.path.join(directory_dest, revision['archive_name']) + archive_type = archive_type or revision['archive_type'] + files = build_tree() # build archive from the tree - tarball.compress(tarpath, tmpdir, archive_type) - - # clean up tmp directory - shutil.rmtree(tmpdir) + tarball.compress(tarpath, archive_type, files) def parse_args(): """Parse the configuration from the cli. """ cli = argparse.ArgumentParser( description='Tarball creation from swh-storage.') cli.add_argument('--config-file', '-c', help='configuration file') cli.add_argument('--type-archive', '-t', help='archive type (zip or tar)') cli.add_argument('--directory', '-d', help='configuration file path') cli.add_argument('--revision', '-r', help='revision checksum') args = cli.parse_args() return args def check_args(args): """Check cli args and returns the error msg. Returns: List of error messages as string if some. """ errorMsgs = [] if not args.config_file: errorMsgs.append('\n- Configuration file option.') if not args.revision: errorMsgs.append('\n- Revision checksum') return errorMsgs if __name__ == '__main__': args = parse_args() errorMsgs = check_args(args) if errorMsgs: print('Some mandatory options are missing: %s' % ''.join(errorMsgs)) sys.exit(1) conf = config.read(args.config_file) type_archive = args.type_archive or None directory_dest = args.directory or '.' revision_hex = args.revision if conf['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage storage = Storage(conf['storage_args']) revision_id = hashutil.hex_to_hash(revision_hex) build_archive_from_revision(revision_id, type_archive, directory_dest) diff --git a/swh/loader/tar/tarball.py b/swh/loader/tar/tarball.py index becd654..a7275fc 100644 --- a/swh/loader/tar/tarball.py +++ b/swh/loader/tar/tarball.py @@ -1,215 +1,224 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tarfile import zipfile from os.path import abspath, realpath, join, dirname from swh.loader.tar import utils def canonical_abspath(path): """Resolve all paths to an absolute and real one. Args: path: to resolve Returns: canonical absolute path to path """ return realpath(abspath(path)) def badpath(path, basepath): """Determine if a path is outside basepath. Args: path: a relative or absolute path of a file or directory basepath: the basepath path must be in Returns: True if path is outside basepath, false otherwise. """ return not canonical_abspath(join(basepath, path)).startswith(basepath) def badlink(info, basepath): """Determine if the tarinfo member is outside basepath. Args: info: TarInfo member representing a symlink or hardlink of tar archive basepath: the basepath the info member must be in Returns: True if info is outside basepath, false otherwise. """ tippath = canonical_abspath(join(basepath, dirname(info.name))) return badpath(info.linkname, basepath=tippath) def is_tarball(filepath): """Given a filepath, determine if it represents an archive. Args: filepath: file to test for tarball property Returns: Bool, True if it's a tarball, False otherwise """ return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) def _uncompress_zip(tarpath, dirpath): """Uncompress zip archive safely. As per zipfile is concerned (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa Args: tarpath: path to the archive dirpath: directory to uncompress the archive to """ with zipfile.ZipFile(tarpath) as z: z.extractall(path=dirpath) def _uncompress_tar(tarpath, dirpath): """Uncompress tarpath if the tarpath is safe. Safe means, no file will be uncompressed outside of dirpath. Args: tarpath: path to the archive dirpath: directory to uncompress the archive to Raises: ValueError when a member would be extracted outside dirpath. """ def safemembers(tarpath, members, basepath): """Given a list of archive members, yield the members (directory, file, hard-link) that stays in bounds with basepath. Note that symbolic link are authorized to point outside the basepath though. Args: tarpath: Name of the tarball members: Archive members for such tarball basepath: the basepath sandbox Yields: Safe TarInfo member Raises: ValueError when a member would be extracted outside basepath """ errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) for finfo in members: if finfo.isdir() and badpath(finfo.name, basepath): raise ValueError(errormsg % ('directory', finfo.name)) elif finfo.isfile() and badpath(finfo.name, basepath): raise ValueError(errormsg % ('file', finfo.name)) elif finfo.islnk() and badlink(finfo, basepath): raise ValueError(errormsg % ('hard-link', finfo.linkname)) # Authorize symlinks to point outside basepath # elif finfo.issym() and badlink(finfo, basepath): # raise ValueError(errormsg % ('symlink', finfo.linkname)) else: yield finfo with tarfile.open(tarpath) as t: members = t.getmembers() t.extractall(path=dirpath, members=safemembers(tarpath, members, dirpath)) def uncompress(tarpath, dest): """Uncompress tarpath to dest folder if tarball is supported and safe. Safe means, no file will be uncompressed outside of dirpath. Note that this fixes permissions after successfully uncompressing the archive. Args: tarpath: path to tarball to uncompress dest: the destination folder where to uncompress the tarball Returns: The nature of the tarball, zip or tar. Raises: ValueError when: - an archive member would be extracted outside basepath - the archive is not supported """ if tarfile.is_tarfile(tarpath): _uncompress_tar(tarpath, dest) nature = 'tar' elif zipfile.is_zipfile(tarpath): _uncompress_zip(tarpath, dest) nature = 'zip' else: raise ValueError('File %s is not a supported archive.' % tarpath) # Fix permissions for dirpath, _, fnames in os.walk(dest): os.chmod(dirpath, 0o755) for fname in fnames: fpath = os.path.join(dirpath, fname) if not os.path.islink(fpath): os.chmod(fpath, 0o644) return nature def ls(rootdir): """Generator of filepath, filename from rootdir. """ for dirpath, dirnames, fnames in os.walk(rootdir): for fname in (dirnames+fnames): fpath = os.path.join(dirpath, fname) fname = utils.commonname(rootdir, fpath) yield fpath, fname -def _compress_zip(tarpath, dirpath): +def _compress_zip(tarpath, files): """Compress dirpath's content as tarpath. """ with zipfile.ZipFile(tarpath, 'w') as z: - for fpath, fname in ls(dirpath): + for fpath, fname in files: z.write(fpath, arcname=fname) -def _compress_tar(tarpath, dirpath): +def _compress_tar(tarpath, files): """Compress dirpath's content as tarpath. """ with tarfile.open(tarpath, 'w:bz2') as t: - for fpath, fname in ls(dirpath): + for fpath, fname in files: t.add(fpath, arcname=fname, recursive=False) -def compress(tarpath, dirpath, nature): - """Compress the directory dirpath's content to a tarball. +def compress(tarpath, nature, dirpath_or_files): + """Create a tarball tarpath with nature nature. + The content of the tarball is either dirpath's content (if representing + a directory path) or dirpath's iterable contents. + + Compress the directory dirpath's content to a tarball. The tarball being dumped at tarpath. The nature of the tarball is determined by the nature argument. """ + if isinstance(dirpath_or_files, str): + files = ls(dirpath_or_files) + else: # iterable of 'filepath, filename' + files = dirpath_or_files + if nature == 'zip': - _compress_zip(tarpath, dirpath) + _compress_zip(tarpath, files) else: - _compress_tar(tarpath, dirpath) + _compress_tar(tarpath, files) return tarpath