# File: swh/loader/tar/loader.py
#
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os
import tempfile
import shutil
import sys
import traceback

from swh.core import hashutil
from swh.loader.dir import loader
from swh.loader.tar import tarball, utils


class TarLoader(loader.DirLoader):
    """A tarball loader.

    Uncompresses an archive into a temporary directory and delegates the
    actual loading of that directory to the parent directory loader.

    """
    def __init__(self, config):
        super().__init__(config)
        self.log = logging.getLogger('swh.loader.tar.TarLoader')

    def process(self, tarpath, origin, revision, release, occurrences):
        """Load a tarball in backend.

        This will:
        - persist the origin if it does not exist.
        - write an entry in fetch_history to mark the loading tarball start
        - uncompress locally the tarballs in a temporary location
        - process the content of the tarballs to persist on swh storage
        - clean up the temporary location
        - write an entry in fetch_history to mark the loading tarball end

        The last two steps are performed whether the loading succeeds or
        fails; on failure the original exception is re-raised afterwards.

        Args:
            - tarpath: path to the tarball to uncompress
            - origin: Dictionary origin
              - url: url origin we fetched
              - type: type of the origin
            - revision: Dictionary of information needed, keys are:
              - author_name: revision's author name
              - author_email: revision's author email
              - author_date: timestamp (e.g. 1444054085)
              - author_offset: date offset e.g. -0220, +0100
              - committer_name: revision's committer name
              - committer_email: revision's committer email
              - committer_date: timestamp
              - committer_offset: date offset e.g. -0220, +0100
              - type: type of revision dir, tar
              - message: synthetic message for the revision
            - release: Dictionary of information needed, keys are:
              - name: release name
              - date: release timestamp (e.g. 1444054085)
              - offset: release date offset e.g. -0220, +0100
              - author_name: release author's name
              - author_email: release author's email
              - comment: release's comment message
            - occurrences: List of occurrence dictionary.
              Information needed, keys are:
              - branch: occurrence's branch name
              - authority_id: authority id (e.g. 1 for swh)
              - validity: validity date (e.g. 2015-01-01 00:00:00+00)

        Side effects:
            - origin gains the keys 'type' (when absent) and 'id'.
            - revision gains a 'metadata' key holding the tarball checksums
              ('original-artifact') and the archive nature ('archive-type').

        """
        if 'type' not in origin:  # let the type flow if present
            origin['type'] = 'tar'

        origin['id'] = self.storage.origin_add_one(origin)

        # Mark the start of the loading
        fetch_history_id = self.open_fetch_history(origin['id'])

        # for edge cases (NotImplemented...)
        result = {'status': False, 'stderr': ''}

        # Stays None until the temporary directory really exists, so the
        # cleanup below knows whether there is something to remove.
        dir_path = None

        try:
            # Everything that can fail once the fetch history entry is open
            # lives inside the try block: a failure while preparing the
            # extraction directory or hashing the tarball must still clean
            # up and close the fetch history entry.

            # Prepare the extraction path
            extraction_dir = self.config['extraction_dir']
            os.makedirs(extraction_dir, 0o755, exist_ok=True)
            dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-',
                                        dir=extraction_dir)

            # add checksums in revision
            artifact = utils.convert_to_hex(hashutil.hashfile(tarpath))
            artifact['name'] = os.path.basename(tarpath)

            self.log.info('Uncompress %s to %s', tarpath, dir_path)
            nature = tarball.uncompress(tarpath, dir_path)

            revision['metadata'] = {
                'original-artifact': [artifact],
                'archive-type': nature,
            }

            result = super().process(dir_path, origin, revision, release,
                                     occurrences)
        except BaseException:
            # Same scope as a bare ``except:``; the exception is always
            # re-raised, this clause only records why the loading failed.
            e_info = sys.exc_info()
            if not result['status']:
                # Enrich the error message with the tarball
                result['stderr'] = 'reason:%s\ntrace:%s\n%s' % (
                    e_info[1],
                    ''.join(traceback.format_tb(e_info[2])),
                    result.get('stderr', ''))
            raise
        finally:
            try:
                if dir_path is not None:
                    shutil.rmtree(dir_path)
            finally:
                # A failing cleanup must not prevent the fetch history
                # entry from being closed.
                if not result['status']:
                    result['stderr'] = 'archive:%s\nreason:%s' % (
                        tarpath, result.get('stderr', ''))

                # mark the end of the loading
                self.close_fetch_history(fetch_history_id, result)


# File: swh/loader/tar/tarball.py
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import tarfile
import zipfile

from os.path import abspath, realpath, join, dirname


def canonical_abspath(path):
    """Resolve all paths to an absolute and real one.

    Args:
        path: to resolve

    Returns:
        canonical absolute path to path

    """
    return realpath(abspath(path))


def badpath(path, basepath):
    """Determine if a path is outside basepath.

    Both basepath and the joined path are canonicalized before being
    compared, and the comparison is done on whole path components: with
    basepath '/x/foo', '/x/foobar' is outside even though it shares the
    string prefix.

    Args:
        path: a relative or absolute path of a file or directory
        basepath: the basepath path must be in (may be relative or contain
                  symbolic links)

    Returns:
        True if path is outside basepath, false otherwise.

    """
    root = canonical_abspath(basepath)
    target = canonical_abspath(join(root, path))
    if target == root:
        return False
    # rstrip keeps the check correct when root is the filesystem root.
    return not target.startswith(root.rstrip(os.sep) + os.sep)


def badlink(info, basepath):
    """Determine if the link target of a tarinfo member is out of bounds.

    The link target is joined to the directory holding the link member
    (itself resolved below basepath) and must stay within that directory.

    Args:
        info: TarInfo member representing a symlink or hardlink of tar archive
        basepath: the basepath the info member must be in

    Returns:
        True if the link target is out of bounds, false otherwise.

    """
    tippath = canonical_abspath(join(basepath, dirname(info.name)))
    return badpath(info.linkname, basepath=tippath)


def is_tarball(filepath):
    """Given a filepath, determine if it represents an archive.

    Args:
        filepath: file to test for tarball property

    Returns:
        Bool, True if it's a tarball (tar or zip archive), False otherwise

    """
    return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath)


def _uncompress_zip(tarpath, dirpath):
    """Uncompress zip archive safely.

    As per zipfile is concerned
    (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract)  # noqa

    Args:
        tarpath: path to the archive
        dirpath: directory to uncompress the archive to

    """
    with zipfile.ZipFile(tarpath) as z:
        z.extractall(path=dirpath)


def _uncompress_tar(tarpath, dirpath):
    """Uncompress tarpath if the tarpath is safe.
    Safe means, no file will be uncompressed outside of dirpath.

    Args:
        tarpath: path to the archive
        dirpath: directory to uncompress the archive to

    Raises:
        ValueError when a member would be extracted outside dirpath.

    """
    def safemembers(tarpath, members, basepath):
        """Given a list of archive members, yield the members that stay in
        bounds with basepath.

        Every member, whatever its type, must be *located* inside basepath.
        Hard-links must additionally target an in-bounds path. Note that
        symbolic links are authorized to point outside the basepath though.

        Args:
            tarpath: Name of the tarball
            members: Archive members for such tarball
            basepath: the basepath sandbox

        Yields:
            Safe TarInfo member

        Raises:
            ValueError when a member would be extracted outside basepath

        """
        # Single %-substitution so that an archive path containing '%' or
        # '{}' cannot break the message formatting.
        errormsg = 'Archive %s blocked. Illegal path to %s %s'

        for finfo in members:
            if finfo.isdir():
                kind = 'directory'
            elif finfo.isfile():
                kind = 'file'
            elif finfo.islnk():
                kind = 'hard-link'
            elif finfo.issym():
                kind = 'symlink'
            else:
                kind = 'member'

            # The member's own location is checked for every type: a
            # symlink or special file named '../x' escapes the sandbox
            # just like a regular file would.
            if badpath(finfo.name, basepath):
                raise ValueError(errormsg % (tarpath, kind, finfo.name))

            if finfo.islnk() and badlink(finfo, basepath):
                raise ValueError(
                    errormsg % (tarpath, 'hard-link', finfo.linkname))

            # Authorize symlinks to point outside basepath, so no check on
            # finfo.linkname for symlinks.
            yield finfo

    with tarfile.open(tarpath) as t:
        members = t.getmembers()
        # safemembers is a generator on purpose: members are validated one
        # at a time while extractall consumes them, so the path resolution
        # sees the entries already written to disk.
        t.extractall(path=dirpath,
                     members=safemembers(tarpath, members, dirpath))


def uncompress(tarpath, dest):
    """Uncompress tarpath to dest folder if tarball is supported and safe.
    Safe means, no file will be uncompressed outside of dirpath.

    Note that this fixes permissions after successfully
    uncompressing the archive.

    Args:
        tarpath: path to tarball to uncompress
        dest: the destination folder where to uncompress the tarball

    Returns:
        The nature of the tarball, zip or tar.

    Raises:
        ValueError when:
        - an archive member would be extracted outside basepath
        - the archive is not supported

    """
    if tarfile.is_tarfile(tarpath):
        _uncompress_tar(tarpath, dest)
        nature = 'tar'
    elif zipfile.is_zipfile(tarpath):
        _uncompress_zip(tarpath, dest)
        nature = 'zip'
    else:
        raise ValueError('File %s is not a supported archive.' % tarpath)

    # Fix permissions: directories become 0o755, files 0o644. Symbolic
    # links are skipped since chmod would act on their target.
    for dirpath, _, fnames in os.walk(dest):
        os.chmod(dirpath, 0o755)
        for fname in fnames:
            fpath = os.path.join(dirpath, fname)
            if not os.path.islink(fpath):
                os.chmod(fpath, 0o644)

    return nature