diff --git a/debian/control b/debian/control index 21ce26e..f639d03 100644 --- a/debian/control +++ b/debian/control @@ -1,25 +1,25 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-nose, python3-setuptools, - python3-swh.core (>= 0.0.14~), + python3-swh.core (>= 0.0.36~), python3-swh.model (>= 0.0.15~), python3-swh.scheduler (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), python3-swh.loader.dir (>= 0.0.30~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all -Depends: python3-swh.core (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), +Depends: python3-swh.core (>= 0.0.36~), python3-swh.storage (>= 0.0.83~), python3-swh.loader.dir (>= 0.0.30~), python3-swh.scheduler (>= 0.0.14~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements-swh.txt b/requirements-swh.txt index 85dfb89..649e66d 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ -swh.core >= 0.0.14 +swh.core >= 0.0.36 swh.model >= 0.0.15 swh.scheduler >= 0.0.14 swh.storage >= 0.0.83 swh.loader.dir >= 0.0.30 diff --git a/swh/loader/tar/build.py b/swh/loader/tar/build.py index c5ff02d..47e5415 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,116 +1,116 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -from swh.loader.tar import utils +from swh.core import utils # Static setup EPOCH = 0 UTC_OFFSET = 0 SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } REVISION_MESSAGE = 'synthetic revision message' REVISION_TYPE = 'tar' def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - url_scheme: scheme to build the origin's url - url_type: origin's type - root_dirpath: the top level root directory path - tarpath: file's absolute path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ relative_path = utils.commonname(root_dirpath, tarpath) return { 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } def compute_occurrence(tarpath): """Compute the occurrence using the tarpath's ctime. Args: tarpath: file's path Returns: Occurrence dictionary. """ return { 'branch': os.path.basename(tarpath), } def _time_from_path(tarpath): """Compute the modification time from the tarpath. Args: tarpath (str|bytes): Full path to the archive to extract the date from. Returns: dict representing a timestamp with keys seconds and microseconds keys. """ mtime = os.lstat(tarpath).st_mtime if isinstance(mtime, float): normalized_time = list(map(int, str(mtime).split('.'))) else: # assuming int normalized_time = [mtime, 0] return { 'seconds': normalized_time[0], 'microseconds': normalized_time[1] } def compute_revision(tarpath): """Compute a revision. Args: tarpath: absolute path to the tarball Returns: Revision as dict: - date (dict): the modification timestamp as returned by _time_from_path function - committer_date: the modification timestamp as returned by _time_from_path function - author: cf. SWH_PERSON - committer: cf. SWH_PERSON - type: cf. REVISION_TYPE - message: cf. REVISION_MESSAGE """ ts = _time_from_path(tarpath) return { 'date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'committer_date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } diff --git a/swh/loader/tar/file.py b/swh/loader/tar/file.py index af7fa91..57fd6b5 100644 --- a/swh/loader/tar/file.py +++ b/swh/loader/tar/file.py @@ -1,89 +1,90 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import itertools import os -from swh.loader.tar import tarball, utils +from swh.core import tarball +from swh.loader.tar import utils def archives_from_dir(path): """Given a path to a directory, walk such directory and yield tuple of tarpath, fname. Args: path: top level directory Returns: Generator of tuple tarpath, filename with tarpath a tarball. """ for dirpath, dirnames, filenames in os.walk(path): for fname in filenames: tarpath = os.path.join(dirpath, fname) if not os.path.exists(tarpath): continue if tarball.is_tarball(tarpath): yield tarpath, fname def archives_from_file(mirror_file): """Given a path to a file containing one tarball per line, yield a tuple of tarpath, fname. Args: mirror_file: path to the file containing list of tarpath. Returns: Generator of tuple tarpath, filename with tarpath a tarball. """ with open(mirror_file, 'r') as f: for tarpath in f.readlines(): tarpath = tarpath.strip() if not os.path.exists(tarpath): print('WARN: %s does not exist. Skipped.' % tarpath) continue if tarball.is_tarball(tarpath): yield tarpath, os.path.basename(tarpath) def archives_from(path): """From path, list tuple of tarpath, fname. Args: path: top directory to list archives from or custom file format. Returns: Generator of tuple tarpath, filename with tarpath a tarball. """ if os.path.isfile(path): yield from archives_from_file(path) elif os.path.isdir(path): yield from archives_from_dir(path) else: raise ValueError( 'Input incorrect, %s must be a file or a directory.' % path) def random_archives_from(path, block, limit=None): """Randomize by size block the archives. Returns: Generator of randomized tuple tarpath, filename with tarpath a tarball. """ random_archives = utils.random_blocks(archives_from(path), block, fillvalue=(None, None)) if limit: random_archives = itertools.islice(random_archives, limit) for tarpath, fname in ((t, f) for t, f in random_archives if t and f): yield tarpath, fname diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 9a01379..fb205e8 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,106 +1,107 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile import shutil +from swh.core import tarball from swh.loader.core.loader import SWHLoader from swh.loader.dir import loader -from swh.loader.tar import tarball, utils +from swh.loader.tar import utils from swh.model import hashutil class TarLoader(loader.DirLoader): """Tarball loader implementation. This is a subclass of the :class:DirLoader as the main goal of this class is to first uncompress a tarball, then provide the uncompressed directory/tree to be loaded by the DirLoader. This will: - creates an origin (if it does not exist) - creates a fetch_history entry - creates an origin_visit - uncompress locally the tarball in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end (success or failure) """ CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } def __init__(self, logging_class='swh.loader.tar.TarLoader', config=None): super().__init__(logging_class=logging_class, config=config) def load(self, *, tar_path, origin, visit_date, revision, occurrences): """Load a tarball in `tarpath` in the Software Heritage Archive. Args: tar_path: tarball to import origin (dict): an origin dictionary as returned by :func:`swh.storage.storage.Storage.origin_get_one` visit_date (str): the date the origin was visited (as an isoformatted string) revision (dict): a revision as passed to :func:`swh.storage.storage.Storage.revision_add`, excluding the `id` and `directory` keys (computed from the directory) occurrences (list of dicts): the occurrences to create in the generated origin visit. Each dict contains a 'branch' key with the branch name as value. """ # Shortcut super() as we use different arguments than the DirLoader. SWHLoader.load(self, tar_path=tar_path, origin=origin, visit_date=visit_date, revision=revision, occurrences=occurrences) def prepare(self, *, tar_path, origin, visit_date, revision, occurrences): """1. Uncompress the tarball in a temporary directory. 2. Compute some metadata to update the revision. """ if 'type' not in origin: # let the type flow if present origin['type'] = 'tar' # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision self.log.info('Uncompress %s to %s' % (tar_path, dir_path)) nature = tarball.uncompress(tar_path, dir_path) if 'metadata' not in revision: artifact = utils.convert_to_hex(hashutil.hash_path(tar_path)) artifact['name'] = os.path.basename(tar_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tar_path) revision['metadata'] = { 'original_artifact': [artifact], } super().prepare(dir_path=dir_path, origin=origin, visit_date=visit_date, revision=revision, release=None, occurrences=occurrences) def cleanup(self): """Clean up temporary directory where we uncompress the tarball. """ dir_path = self.dir_path if dir_path and os.path.exists(dir_path): shutil.rmtree(dir_path) diff --git a/swh/loader/tar/tarball.py b/swh/loader/tar/tarball.py deleted file mode 100644 index 6e4f4bf..0000000 --- a/swh/loader/tar/tarball.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import stat -import tarfile -import zipfile - -from os.path import abspath, realpath, join, dirname -from swh.loader.tar import utils - - -def _canonical_abspath(path): - """Resolve all paths to an absolute and real one. - - Args: - path: to resolve - - Returns: - canonical absolute path to path - - """ - return realpath(abspath(path)) - - -def _badpath(path, basepath): - """Determine if a path is outside basepath. - - Args: - path: a relative or absolute path of a file or directory - basepath: the basepath path must be in - - Returns: - True if path is outside basepath, false otherwise. - - """ - return not _canonical_abspath(join(basepath, path)).startswith(basepath) - - -def _badlink(info, basepath): - """Determine if the tarinfo member is outside basepath. - - Args: - info: TarInfo member representing a symlink or hardlink of tar archive - basepath: the basepath the info member must be in - - Returns: - True if info is outside basepath, false otherwise. - - """ - tippath = _canonical_abspath(join(basepath, dirname(info.name))) - return _badpath(info.linkname, basepath=tippath) - - -def is_tarball(filepath): - """Given a filepath, determine if it represents an archive. - - Args: - filepath: file to test for tarball property - - Returns: - Bool, True if it's a tarball, False otherwise - - """ - return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) - - -def _uncompress_zip(tarpath, dirpath): - """Uncompress zip archive safely. - - As per zipfile is concerned - (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa - - Args: - tarpath: path to the archive - dirpath: directory to uncompress the archive to - - """ - with zipfile.ZipFile(tarpath) as z: - z.extractall(path=dirpath) - - -def _uncompress_tar(tarpath, dirpath): - """Uncompress tarpath if the tarpath is safe. - Safe means, no file will be uncompressed outside of dirpath. - - Args: - tarpath: path to the archive - dirpath: directory to uncompress the archive to - - Raises: - ValueError when a member would be extracted outside dirpath. - - """ - def safemembers(tarpath, members, basepath): - """Given a list of archive members, yield the members (directory, - file, hard-link) that stays in bounds with basepath. Note - that symbolic link are authorized to point outside the - basepath though. - - Args: - tarpath: Name of the tarball - members: Archive members for such tarball - basepath: the basepath sandbox - - Yields: - Safe TarInfo member - - Raises: - ValueError when a member would be extracted outside basepath - - """ - errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) - - for finfo in members: - if finfo.isdir() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('directory', finfo.name)) - elif finfo.isfile() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('file', finfo.name)) - elif finfo.islnk() and _badlink(finfo, basepath): - raise ValueError(errormsg % ('hard-link', finfo.linkname)) - # Authorize symlinks to point outside basepath - # elif finfo.issym() and _badlink(finfo, basepath): - # raise ValueError(errormsg % ('symlink', finfo.linkname)) - else: - yield finfo - - with tarfile.open(tarpath) as t: - members = t.getmembers() - t.extractall(path=dirpath, - members=safemembers(tarpath, members, dirpath)) - - -def uncompress(tarpath, dest): - """Uncompress tarpath to dest folder if tarball is supported and safe. - Safe means, no file will be uncompressed outside of dirpath. - - Note that this fixes permissions after successfully - uncompressing the archive. - - Args: - tarpath: path to tarball to uncompress - dest: the destination folder where to uncompress the tarball - - Returns: - The nature of the tarball, zip or tar. - - Raises: - ValueError when: - - an archive member would be extracted outside basepath - - the archive is not supported - - """ - if tarfile.is_tarfile(tarpath): - _uncompress_tar(tarpath, dest) - nature = 'tar' - elif zipfile.is_zipfile(tarpath): - _uncompress_zip(tarpath, dest) - nature = 'zip' - else: - raise ValueError('File %s is not a supported archive.' % tarpath) - - # Fix permissions - for dirpath, _, fnames in os.walk(dest): - os.chmod(dirpath, 0o755) - for fname in fnames: - fpath = os.path.join(dirpath, fname) - if not os.path.islink(fpath): - fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR - if not fpath_exec: - os.chmod(fpath, 0o644) - - return nature - - -def _ls(rootdir): - """Generator of filepath, filename from rootdir. - - """ - for dirpath, dirnames, fnames in os.walk(rootdir): - for fname in (dirnames+fnames): - fpath = os.path.join(dirpath, fname) - fname = utils.commonname(rootdir, fpath) - yield fpath, fname - - -def _compress_zip(tarpath, files): - """Compress dirpath's content as tarpath. - - """ - with zipfile.ZipFile(tarpath, 'w') as z: - for fpath, fname in files: - z.write(fpath, arcname=fname) - - -def _compress_tar(tarpath, files): - """Compress dirpath's content as tarpath. - - """ - with tarfile.open(tarpath, 'w:bz2') as t: - for fpath, fname in files: - t.add(fpath, arcname=fname, recursive=False) - - -def compress(tarpath, nature, dirpath_or_files): - """Create a tarball tarpath with nature nature. - The content of the tarball is either dirpath's content (if representing - a directory path) or dirpath's iterable contents. - - Compress the directory dirpath's content to a tarball. - The tarball being dumped at tarpath. - The nature of the tarball is determined by the nature argument. - - """ - if isinstance(dirpath_or_files, str): - files = _ls(dirpath_or_files) - else: # iterable of 'filepath, filename' - files = dirpath_or_files - - if nature == 'zip': - _compress_zip(tarpath, files) - else: - _compress_tar(tarpath, files) - - return tarpath diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py index 38f222f..05b43fc 100644 --- a/swh/loader/tar/tests/test_utils.py +++ b/swh/loader/tar/tests/test_utils.py @@ -1,59 +1,45 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.loader.tar import utils class TestUtils(unittest.TestCase): - @istest - def commonname(self): - # when - actual_commonname = utils.commonname('/some/where/to/', - '/some/where/to/go/to') - # then - self.assertEquals('go/to', actual_commonname) - - # when - actual_commonname2 = utils.commonname(b'/some/where/to/', - b'/some/where/to/go/to') - # then - self.assertEquals(b'go/to', actual_commonname2) - @istest def convert_to_hex(self): # given input_dict = { 'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1', # noqa 'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb', # noqa 'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb', # noqa 'length': 10, } # noqa expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6' '787d7b944a1', 'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e' '9faceb', 'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3' '8af26ab08fab113ec282e735df65962', 'length': 10} # when actual_dict = utils.convert_to_hex(input_dict) # then self.assertDictEqual(actual_dict, expected_dict) @istest def convert_to_hex_edge_cases(self): # when actual_dict = utils.convert_to_hex({}) # then self.assertDictEqual(actual_dict, {}) self.assertIsNone(utils.convert_to_hex(None)) diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py index e0c9230..c5e9bac 100644 --- a/swh/loader/tar/utils.py +++ b/swh/loader/tar/utils.py @@ -1,81 +1,74 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import itertools import random from swh.model import hashutil -def commonname(path0, path1, as_str=False): - """Compute the commonname between the path0 and path1. - - """ - return path1.split(path0)[1] - - def convert_to_hex(d): """Convert a flat dictionary with bytes in values to the same dictionary with hex as values. Args: dict: flat dictionary with sha bytes in their values. Returns: Mirror dictionary with values as string hex. """ if not d: return d checksums = {} for key, h in d.items(): if isinstance(h, bytes): checksums[key] = hashutil.hash_to_hex(h) else: checksums[key] = h return checksums def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n return itertools.zip_longest(*args, fillvalue=fillvalue) def random_blocks(iterable, block=100, fillvalue=None): """Given an iterable: - slice the iterable in data set of block-sized elements - randomized the data set - yield each element Args: iterable: iterable of data block: number of elements per block fillvalue: a fillvalue for the last block if not enough values in last block Returns: An iterable of randomized per block-size elements. """ count = 0 for iterable in grouper(iterable, block, fillvalue=fillvalue): count += 1 l = list(iterable) random.shuffle(l) for e in l: yield e