diff --git a/debian/control b/debian/control
index 6078985..7a11bdf 100644
--- a/debian/control
+++ b/debian/control
@@ -1,27 +1,28 @@
 Source: swh-loader-tar
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python (>= 2),
                python3-all,
                python3-nose,
                python3-setuptools,
                python3-swh.core (>= 0.0.36~),
                python3-swh.loader.dir (>= 0.0.32~),
-               python3-swh.model (>= 0.0.15~),
+               python3-swh.model (>= 0.0.27~),
                python3-swh.scheduler (>= 0.0.14~),
                python3-swh.storage (>= 0.0.83~),
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/

 Package: python3-swh.loader.tar
 Architecture: all
 Depends: python3-swh.core (>= 0.0.36~),
          python3-swh.loader.dir (>= 0.0.32~),
+         python3-swh.model (>= 0.0.27~),
          python3-swh.scheduler (>= 0.0.14~),
          python3-swh.storage (>= 0.0.83~),
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Tarball Loader
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 26ad0c3..ec55d8a 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 0.0.36
-swh.model >= 0.0.15
+swh.model >= 0.0.27
 swh.scheduler >= 0.0.14
 swh.storage >= 0.0.83
 swh.loader.dir >= 0.0.32
diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py
index 7d4981f..b069343 100644
--- a/swh/loader/tar/loader.py
+++ b/swh/loader/tar/loader.py
@@ -1,154 +1,153 @@
 # Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import tempfile
 import shutil

 from swh.core import tarball
 from swh.loader.core.loader import SWHLoader
 from swh.loader.dir import loader
-from swh.loader.tar import utils
-from swh.model import hashutil
+from swh.model.hashutil import MultiHash


 class TarLoader(loader.DirLoader):
     """Tarball loader implementation.

     This is a subclass of the :class:DirLoader as the main goal of
     this class is to first uncompress a tarball, then provide the
     uncompressed directory/tree to be loaded by the DirLoader.

     This will:

     - creates an origin (if it does not exist)
     - creates a fetch_history entry
     - creates an origin_visit
     - uncompress locally the tarball in a temporary location
     - process the content of the tarballs to persist on swh storage
     - clean up the temporary location
     - write an entry in fetch_history to mark the loading tarball end (success or failure)

     """
     CONFIG_BASE_FILENAME = 'loader/tar'

     ADDITIONAL_CONFIG = {
         'extraction_dir': ('string', '/tmp')
     }

     def __init__(self, logging_class='swh.loader.tar.TarLoader', config=None):
         super().__init__(logging_class=logging_class, config=config)
         self.dir_path = None

     def load(self, *, tar_path, origin, visit_date, revision,
              branch_name=None):
         """Load a tarball in `tarpath` in the Software Heritage Archive.

         Args:
             tar_path: tarball to import
             origin (dict): an origin dictionary as returned by
               :func:`swh.storage.storage.Storage.origin_get_one`
             visit_date (str): the date the origin was visited (as an
               isoformatted string)
             revision (dict): a revision as passed to
               :func:`swh.storage.storage.Storage.revision_add`, excluding the
               `id` and `directory` keys (computed from the directory)
             branch_name (str): the optional branch_name to use for snapshot

         """
         # Shortcut super() as we use different arguments than the DirLoader.
         return SWHLoader.load(self, tar_path=tar_path, origin=origin,
                               visit_date=visit_date, revision=revision,
                               branch_name=branch_name)

     def prepare_origin_visit(self, *, origin, visit_date=None, **kwargs):
         self.origin = origin
         if 'type' not in self.origin:  # let the type flow if present
             self.origin['type'] = 'tar'
         self.visit_date = visit_date

     def prepare(self, *, tar_path, origin, revision, visit_date=None,
                 branch_name=None):
         """1. Uncompress the tarball in a temporary directory.
            2. Compute some metadata to update the revision.

         """
         # Prepare the extraction path
         extraction_dir = self.config['extraction_dir']
         os.makedirs(extraction_dir, 0o755, exist_ok=True)
         self.dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-',
                                          dir=extraction_dir)

         # add checksums in revision
         self.log.info('Uncompress %s to %s' % (tar_path, self.dir_path))
         nature = tarball.uncompress(tar_path, self.dir_path)

         if 'metadata' not in revision:
-            artifact = utils.convert_to_hex(hashutil.hash_path(tar_path))
+            artifact = MultiHash.from_path(tar_path).hexdigest()
             artifact['name'] = os.path.basename(tar_path)
             artifact['archive_type'] = nature
             artifact['length'] = os.path.getsize(tar_path)
             revision['metadata'] = {
                 'original_artifact': [artifact],
             }

         branch = branch_name if branch_name else os.path.basename(tar_path)

         super().prepare(dir_path=self.dir_path,
                         origin=origin,
                         visit_date=visit_date,
                         revision=revision,
                         release=None,
                         branch_name=branch)

     def cleanup(self):
         """Clean up temporary directory where we uncompress the tarball.

         """
         if self.dir_path and os.path.exists(self.dir_path):
             shutil.rmtree(self.dir_path)


 if __name__ == '__main__':
     import click
     import logging
     logging.basicConfig(
         level=logging.DEBUG,
         format='%(asctime)s %(process)d %(message)s'
     )

     @click.command()
     @click.option('--archive-path', required=1, help='Archive path to load')
     @click.option('--origin-url', required=1, help='Origin url to associate')
     @click.option('--visit-date', default=None,
                   help='Visit date time override')
     def main(archive_path, origin_url, visit_date):
         """Loading archive tryout."""
         import datetime
         origin = {'url': origin_url, 'type': 'tar'}
         commit_time = int(datetime.datetime.now(
             tz=datetime.timezone.utc).timestamp())
         swh_person = {
             'name': 'Software Heritage',
             'fullname': 'Software Heritage',
             'email': 'robot@softwareheritage.org'
         }
         revision = {
             'date': {'timestamp': commit_time, 'offset': 0},
             'committer_date': {'timestamp': commit_time, 'offset': 0},
             'author': swh_person,
             'committer': swh_person,
             'type': 'tar',
             'message': 'swh-loader-tar: synthetic revision message',
             'metadata': {},
             'synthetic': True,
         }
         TarLoader().load(tar_path=archive_path,
                          origin=origin,
                          visit_date=visit_date,
                          revision=revision,
                          branch_name='master')

     main()
diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py
deleted file mode 100644
index 05b43fc..0000000
--- a/swh/loader/tar/tests/test_utils.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import unittest
-
-from nose.tools import istest
-
-from swh.loader.tar import utils
-
-
-class TestUtils(unittest.TestCase):
-    @istest
-    def convert_to_hex(self):
-        # given
-        input_dict = {
-            'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1',  # noqa
-            'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb',  # noqa
-            'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb',  # noqa
-            'length': 10,
-        }  # noqa
-
-        expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6'
-                                     '787d7b944a1',
-                         'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e'
-                                 '9faceb',
-                         'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3'
-                                   '8af26ab08fab113ec282e735df65962',
-                         'length': 10}
-
-        # when
-        actual_dict = utils.convert_to_hex(input_dict)
-
-        # then
-        self.assertDictEqual(actual_dict, expected_dict)
-
-    @istest
-    def convert_to_hex_edge_cases(self):
-        # when
-        actual_dict = utils.convert_to_hex({})
-        # then
-        self.assertDictEqual(actual_dict, {})
-
-        self.assertIsNone(utils.convert_to_hex(None))
diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py
index b728b0a..af25e3d 100644
--- a/swh/loader/tar/utils.py
+++ b/swh/loader/tar/utils.py
@@ -1,74 +1,48 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import itertools
 import random

-from swh.model import hashutil
-
-
-def convert_to_hex(d):
-    """Convert a flat dictionary with bytes in values to the same dictionary
-    with hex as values.
-
-    Args:
-        dict: flat dictionary with sha bytes in their values.
-
-    Returns:
-        Mirror dictionary with values as string hex.
-
-    """
-    if not d:
-        return d
-
-    checksums = {}
-    for key, h in d.items():
-        if isinstance(h, bytes):
-            checksums[key] = hashutil.hash_to_hex(h)
-        else:
-            checksums[key] = h
-
-    return checksums
-

 def grouper(iterable, n, fillvalue=None):
     """Collect data into fixed-length chunks or blocks.

     Args:
         iterable: an iterable
         n: size of block
         fillvalue: value to use for the last block

     Returns:
         fixed-length chunks of blocks as iterables

     """
     args = [iter(iterable)] * n
     return itertools.zip_longest(*args, fillvalue=fillvalue)


 def random_blocks(iterable, block=100, fillvalue=None):
     """Given an iterable:
     - slice the iterable in data set of block-sized elements
     - randomized the data set
     - yield each element

     Args:
         iterable: iterable of data
         block: number of elements per block
         fillvalue: a fillvalue for the last block if not enough values in
           last block

     Returns:
         An iterable of randomized per block-size elements.

     """
     count = 0
     for iterable in grouper(iterable, block, fillvalue=fillvalue):
         count += 1
         lst = list(iterable)
         random.shuffle(lst)
         for e in lst:
             yield e
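Reviewer note (not part of the patch): the functional change in loader.py's prepare() swaps utils.convert_to_hex(hashutil.hash_path(tar_path)) for MultiHash.from_path(tar_path).hexdigest(), which is why swh.model is bumped to 0.0.27 and why the convert_to_hex helper and its tests can be dropped. Below is a minimal sketch of the new call, assuming swh.model >= 0.0.27 is installed; the throwaway file it hashes is purely illustrative and stands in for the tarball path handed to TarLoader.prepare().

import os
import tempfile

from swh.model.hashutil import MultiHash

# Create a scratch file to hash; a real run would hash the tarball
# passed to the loader instead.
with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as f:
    f.write(b'not a real tarball, just some bytes to hash')
    tar_path = f.name

# hexdigest() already returns {algorithm: hex string} for the configured
# algorithms (e.g. sha1, sha1_git, sha256), so the old separate
# bytes-to-hex conversion step is no longer needed.
artifact = MultiHash.from_path(tar_path).hexdigest()
artifact['name'] = os.path.basename(tar_path)
artifact['length'] = os.path.getsize(tar_path)
print(artifact)

os.unlink(tar_path)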
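The grouper() and random_blocks() helpers kept in swh/loader/tar/utils.py are untouched by this patch. As a reminder of their behaviour (shuffling happens per block, and the last, short block is padded with fillvalue), here is a self-contained sketch that re-implements them so the example runs without swh installed; names match the originals but the bodies are slightly condensed.

import itertools
import random


def grouper(iterable, n, fillvalue=None):
    # Chunk an iterable into blocks of n items, padding the last block.
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)


def random_blocks(iterable, block=100, fillvalue=None):
    # Shuffle items within each block, then yield them one by one.
    for chunk in grouper(iterable, block, fillvalue=fillvalue):
        lst = list(chunk)
        random.shuffle(lst)
        yield from lst


print(list(random_blocks(range(10), block=4)))
# e.g. [3, 0, 2, 1, 7, 5, 4, 6, None, 9, None, 8]: the last block held
# only 8 and 9, so it was padded with two None fillvalues before shuffling.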