# File: swh/loader/tar/tarball.py
# (post-image of: diff --git a/swh/loader/tar/tarball.py
#  b/swh/loader/tar/tarball.py, index a7275fc..d341180 100644)

# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import stat
import tarfile
import zipfile

from os.path import abspath, realpath, join, dirname

from swh.loader.tar import utils


def canonical_abspath(path):
    """Resolve all paths to an absolute and real one.

    Args:
        path: to resolve

    Returns:
        canonical absolute path to path (symlinks resolved)

    """
    return realpath(abspath(path))


def badpath(path, basepath):
    """Determine if a path is outside basepath.

    Both sides are canonicalized before being compared, and the
    comparison is done on whole path components: '/base-evil/x' is
    *not* considered to be inside '/base'.

    Args:
        path: a relative or absolute path of a file or directory
        basepath: the basepath path must be in

    Returns:
        True if path is outside basepath, false otherwise.

    """
    base = canonical_abspath(basepath)
    target = canonical_abspath(join(base, path))
    if target == base:
        return False
    # join(base, '') yields base with exactly one trailing separator, so
    # the prefix test can only match on a directory boundary.
    return not target.startswith(join(base, ''))


def badlink(info, basepath):
    """Determine if the tarinfo member is outside basepath.

    The link target is resolved relative to the directory holding the
    link itself, and must stay below that directory.

    Args:
        info: TarInfo member representing a symlink or hardlink of tar archive
        basepath: the basepath the info member must be in

    Returns:
        True if info is outside basepath, false otherwise.

    """
    tippath = canonical_abspath(join(basepath, dirname(info.name)))
    return badpath(info.linkname, basepath=tippath)


def is_tarball(filepath):
    """Given a filepath, determine if it represents an archive.

    Args:
        filepath: file to test for tarball property

    Returns:
        Bool, True if it's a tarball (tar or zip), False otherwise

    """
    return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath)


def _uncompress_zip(tarpath, dirpath):
    """Uncompress zip archive safely.

    No extra member check is done here; this relies on zipfile's own
    handling of member names (cf. note on
    https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract)  # noqa

    Args:
        tarpath: path to the archive
        dirpath: directory to uncompress the archive to

    """
    with zipfile.ZipFile(tarpath) as z:
        z.extractall(path=dirpath)


def _uncompress_tar(tarpath, dirpath):
    """Uncompress tarpath if the tarpath is safe.
    Safe means, no file will be uncompressed outside of dirpath.

    Args:
        tarpath: path to the archive
        dirpath: directory to uncompress the archive to

    Raises:
        ValueError when a member would be extracted outside dirpath.

    """
    def safemembers(tarpath, members, basepath):
        """Given a list of archive members, yield the members that stay
        in bounds with basepath.

        Every member must be *created* inside basepath, whatever its
        type. Hard links must additionally pass the badlink check on
        their target. Symbolic links are authorized to *point* outside
        the basepath though.

        Args:
            tarpath: Name of the tarball
            members: Archive members for such tarball
            basepath: the basepath sandbox

        Yields:
            Safe TarInfo member

        Raises:
            ValueError when a member would be extracted outside basepath

        """
        def blocked(kind, name):
            # Single format call: an archive path containing '%' must not
            # be interpreted as a format directive.
            return ValueError(
                'Archive {} blocked. Illegal path to {} {}'.format(
                    tarpath, kind, name))

        for finfo in members:
            if finfo.isdir():
                if badpath(finfo.name, basepath):
                    raise blocked('directory', finfo.name)
            elif finfo.isfile():
                if badpath(finfo.name, basepath):
                    raise blocked('file', finfo.name)
            elif finfo.islnk():
                if badpath(finfo.name, basepath):
                    raise blocked('hard-link', finfo.name)
                if badlink(finfo, basepath):
                    raise blocked('hard-link', finfo.linkname)
            else:
                # Symlinks and special members: only the location where
                # the entry is created is checked. The parent directory is
                # resolved instead of the full name so that an entry which
                # already exists on disk as a symlink pointing outside
                # (authorized) is not mistaken for an escape.
                parent, leaf = os.path.split(finfo.name.rstrip('/'))
                if leaf == '..' or badpath(parent, basepath):
                    kind = 'symlink' if finfo.issym() else 'member'
                    raise blocked(kind, finfo.name)
            yield finfo

    with tarfile.open(tarpath) as t:
        members = t.getmembers()
        t.extractall(path=dirpath,
                     members=safemembers(tarpath, members, dirpath))


def uncompress(tarpath, dest):
    """Uncompress tarpath to dest folder if tarball is supported and safe.
    Safe means, no file will be uncompressed outside of dirpath.

    Note that this fixes permissions after successfully
    uncompressing the archive: directories are set to 0o755 and
    non-executable regular files to 0o644. Files having the user
    executable bit set keep their mode; symlinks are left alone.

    Args:
        tarpath: path to tarball to uncompress
        dest: the destination folder where to uncompress the tarball

    Returns:
        The nature of the tarball, zip or tar.

    Raises:
        ValueError when:
        - an archive member would be extracted outside basepath
        - the archive is not supported

    """
    if tarfile.is_tarfile(tarpath):
        _uncompress_tar(tarpath, dest)
        nature = 'tar'
    elif zipfile.is_zipfile(tarpath):
        _uncompress_zip(tarpath, dest)
        nature = 'zip'
    else:
        raise ValueError('File %s is not a supported archive.' % tarpath)

    # Fix permissions
    for dirpath, _, fnames in os.walk(dest):
        os.chmod(dirpath, 0o755)
        for fname in fnames:
            fpath = os.path.join(dirpath, fname)
            if not os.path.islink(fpath):
                # Preserve the mode of files executable by their owner
                fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR
                if not fpath_exec:
                    os.chmod(fpath, 0o644)

    return nature


def ls(rootdir):
    """Generator of filepath, filename from rootdir.

    Walks rootdir recursively and yields, for each directory and file
    found, its path on disk and its name as computed by
    utils.commonname(rootdir, path).

    """
    for dirpath, dirnames, fnames in os.walk(rootdir):
        for fname in (dirnames+fnames):
            fpath = os.path.join(dirpath, fname)
            fname = utils.commonname(rootdir, fpath)
            yield fpath, fname


def _compress_zip(tarpath, files):
    """Compress files as the zip archive tarpath.

    Args:
        tarpath: path of the archive to create
        files: iterable of (filepath, archive name)

    """
    with zipfile.ZipFile(tarpath, 'w') as z:
        for fpath, fname in files:
            z.write(fpath, arcname=fname)


def _compress_tar(tarpath, files):
    """Compress files as the bzip2-compressed tar archive tarpath.

    Args:
        tarpath: path of the archive to create
        files: iterable of (filepath, archive name)

    """
    with tarfile.open(tarpath, 'w:bz2') as t:
        for fpath, fname in files:
            # Entries are added one by one: no implicit recursion
            t.add(fpath, arcname=fname, recursive=False)


def compress(tarpath, nature, dirpath_or_files):
    """Create a tarball tarpath with nature nature.
    The content of the tarball is either dirpath's content (if representing
    a directory path) or dirpath's iterable contents.

    Compress the directory dirpath's content to a tarball.
    The tarball being dumped at tarpath.
    The nature of the tarball is determined by the nature argument.

    Args:
        tarpath: path of the archive to create
        nature: 'zip' for a zip archive, anything else gives a tar.bz2
        dirpath_or_files: a directory path (str) or an iterable of
            (filepath, filename)

    Returns:
        tarpath

    """
    if isinstance(dirpath_or_files, str):
        files = ls(dirpath_or_files)
    else:  # iterable of 'filepath, filename'
        files = dirpath_or_files

    if nature == 'zip':
        _compress_zip(tarpath, files)
    else:
        _compress_tar(tarpath, files)

    return tarpath


# File: swh/loader/tar/tests/test_loader.py
# (post-image of: diff --git a/swh/loader/tar/tests/test_loader.py
#  b/swh/loader/tar/tests/test_loader.py, index ddf5bc4..60272d1 100644)

# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os

from nose.tools import istest
from unittest import TestCase

from swh.loader.tar.loader import TarLoader


class LoaderNoStorageForTest:
    """Mixin class to inhibit the persistence and keep in memory the data
    sent for storage.

    cf. SWHTarLoaderNoStorage

    """
    def __init__(self):
        super().__init__()
        # Init the state: one in-memory list per kind of object
        self.all_contents = []
        self.all_directories = []
        self.all_revisions = []
        self.all_releases = []
        self.all_occurrences = []

    def send_origin(self, origin):
        """Remember the origin instead of sending it."""
        self.origin = origin

    def send_origin_visit(self, origin_id, ts):
        """Record and return a fake visit (visit id is always 1)."""
        self.origin_visit = {
            'origin': origin_id,
            'ts': ts,
            'visit': 1,
        }
        return self.origin_visit

    def update_origin_visit(self, origin_id, visit, status):
        """Remember the final status and the visit passed in."""
        self.status = status
        self.origin_visit = visit

    def maybe_load_contents(self, all_contents):
        self.all_contents.extend(all_contents)

    def maybe_load_directories(self, all_directories):
        self.all_directories.extend(all_directories)

    def maybe_load_revisions(self, all_revisions):
        self.all_revisions.extend(all_revisions)

    def maybe_load_releases(self, releases):
        self.all_releases.extend(releases)

    def maybe_load_occurrences(self, all_occurrences):
        self.all_occurrences.extend(all_occurrences)

    def open_fetch_history(self):
        # Fixed fake fetch history id
        return 1

    def close_fetch_history_success(self, fetch_history_id):
        pass

    def close_fetch_history_failure(self, fetch_history_id):
        pass


TEST_CONFIG = {
    'extraction_dir': '/tmp/tests/loader-tar/',  # where to extract the tarball
    'storage': {  # we instantiate it but we don't use it in test context
        'cls': 'remote',
        'args': {
            'url': 'http://127.0.0.1:9999',  # somewhere that does not exist
        }
    },
    'send_contents': False,
    'send_directories': False,
    'send_revisions': False,
    'send_releases': False,
    'send_occurrences': False,
    'content_packet_size': 100,
    'content_packet_block_size_bytes': 104857600,
    'content_packet_size_bytes': 1073741824,
    'directory_packet_size': 250,
    'revision_packet_size': 100,
    'release_packet_size': 100,
    'occurrence_packet_size': 100,
}


def parse_config_file(base_filename=None, config_filename=None,
                      additional_configs=None, global_config=True):
    """Replacement for TarLoader.parse_config_file: ignore every argument
    and return the static TEST_CONFIG.

    """
    return TEST_CONFIG


# Inhibit side-effect loading configuration from disk
TarLoader.parse_config_file = parse_config_file


class SWHTarLoaderNoStorage(LoaderNoStorageForTest, TarLoader):
    """A TarLoader with no persistence.

    Context:
        Load a tarball with a persistent-less tarball loader

    """
    pass


# Relative path from this test module to the directory expected to hold
# the test data
PATH_TO_DATA = '../../../../..'
class SWHTarLoaderITTest(TestCase):
    """Integration test: load a sample tarball with the storage-less loader
    and check what would have been sent to the storage.

    """
    def setUp(self):
        super().setUp()
        self.loader = SWHTarLoaderNoStorage()

    @istest
    def load(self):
        """Process a new tarball should be ok

        """
        # given
        start_path = os.path.dirname(__file__)
        tarpath = os.path.join(
            start_path, PATH_TO_DATA,
            'swh-storage-testdata/dir-folders/sample-folder.tgz')

        origin = {
            'url': 'file:///tmp/sample-folder',
            'type': 'dir'
        }

        visit_date = 'Tue, 3 May 2016 17:16:32 +0200'

        import datetime
        commit_time = int(datetime.datetime.now(
            tz=datetime.timezone.utc).timestamp()
        )

        swh_person = {
            'name': 'Software Heritage',
            'fullname': 'Software Heritage',
            'email': 'robot@softwareheritage.org'
        }

        revision_message = 'swh-loader-tar: synthetic revision message'
        revision_type = 'tar'
        revision = {
            'date': {
                'timestamp': commit_time,
                'offset': 0,
            },
            'committer_date': {
                'timestamp': commit_time,
                'offset': 0,
            },
            'author': swh_person,
            'committer': swh_person,
            'type': revision_type,
            'message': revision_message,
            'metadata': {},
            'synthetic': True,
        }

        occurrence = {
            'branch': os.path.basename(tarpath),
        }

        # when
        self.loader.load(tar_path=tarpath, origin=origin,
                         visit_date=visit_date, revision=revision,
                         occurrences=[occurrence])

        # then
        # assertEqual is used throughout: the assertEquals alias is
        # deprecated and no longer exists in recent unittest versions.
        self.assertEqual(len(self.loader.all_contents), 8,
                         "8 contents: 3 files + 5 links")
        self.assertEqual(len(self.loader.all_directories), 6,
                         "6 directories: 4 subdirs + 1 empty + 1 main dir")
        self.assertEqual(len(self.loader.all_revisions), 1,
                         "synthetic revision")

        actual_revision = self.loader.all_revisions[0]
        self.assertTrue(actual_revision['synthetic'])
        self.assertEqual(actual_revision['parents'], [])
        self.assertEqual(actual_revision['type'], 'tar')
        self.assertEqual(actual_revision['message'],
                         b'swh-loader-tar: synthetic revision message')
        self.assertEqual(actual_revision['directory'],
                         b'\xa7A\xfcM\x96\x8c{\x8e<\x94\xff\x86\xe7\x04\x80\xc5\xc7\xe5r\xa9')  # noqa

        self.assertEqual(
            actual_revision['metadata']['original_artifact'][0],
            {
                'sha1_git': 'cc848944a0d3e71d287027347e25467e61b07428',
                'archive_type': 'tar',
                'blake2s256': '5d70923443ad36377cd58e993aff0e3c1b9ef14f796c69569105d3a99c64f075',  # noqa
                'name': 'sample-folder.tgz',
                'sha1': '3ca0d0a5c6833113bd532dc5c99d9648d618f65a',
                'length': 555,
                'sha256': '307ebda0071ca5975f618e192c8417161e19b6c8bf581a26061b76dc8e85321d'  # noqa
            })

        self.assertEqual(len(self.loader.all_releases), 0)
        self.assertEqual(len(self.loader.all_occurrences), 1)