diff --git a/PKG-INFO b/PKG-INFO index 69d55c8..81eb713 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.32 +Version: 0.0.33 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index f639d03..8993181 100644 --- a/debian/control +++ b/debian/control @@ -1,25 +1,27 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.36~), + python3-swh.loader.dir (>= 0.0.31~), python3-swh.model (>= 0.0.15~), python3-swh.scheduler (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), - python3-swh.loader.dir (>= 0.0.30~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all -Depends: python3-swh.core (>= 0.0.36~), python3-swh.storage (>= 0.0.83~), - python3-swh.loader.dir (>= 0.0.30~), python3-swh.scheduler (>= 0.0.14~), +Depends: python3-swh.core (>= 0.0.36~), + python3-swh.loader.dir (>= 0.0.31~), + python3-swh.scheduler (>= 0.0.14~), + python3-swh.storage (>= 0.0.83~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements-swh.txt b/requirements-swh.txt index 649e66d..baacbb0 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.36 swh.model >= 0.0.15 swh.scheduler >= 0.0.14 swh.storage >= 0.0.83 -swh.loader.dir >= 0.0.30 +swh.loader.dir >= 0.0.31 diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index 69d55c8..81eb713 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ b/swh.loader.tar.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.32 +Version: 0.0.33 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.tar.egg-info/requires.txt b/swh.loader.tar.egg-info/requires.txt index e0ec4d4..04e02aa 100644 --- a/swh.loader.tar.egg-info/requires.txt +++ b/swh.loader.tar.egg-info/requires.txt @@ -1,9 +1,9 @@ click python-dateutil retrying swh.core>=0.0.36 -swh.loader.dir>=0.0.30 +swh.loader.dir>=0.0.31 swh.model>=0.0.15 swh.scheduler>=0.0.14 swh.storage>=0.0.83 vcversioner diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index fb205e8..d173d3c 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,107 +1,107 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile import shutil from swh.core import tarball from swh.loader.core.loader import SWHLoader from swh.loader.dir import loader from swh.loader.tar import utils from swh.model import hashutil class TarLoader(loader.DirLoader): """Tarball loader implementation. This is a subclass of the :class:DirLoader as the main goal of this class is to first uncompress a tarball, then provide the uncompressed directory/tree to be loaded by the DirLoader. This will: - creates an origin (if it does not exist) - creates a fetch_history entry - creates an origin_visit - uncompress locally the tarball in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end (success or failure) """ CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } def __init__(self, logging_class='swh.loader.tar.TarLoader', config=None): super().__init__(logging_class=logging_class, config=config) def load(self, *, tar_path, origin, visit_date, revision, occurrences): """Load a tarball in `tarpath` in the Software Heritage Archive. Args: tar_path: tarball to import origin (dict): an origin dictionary as returned by :func:`swh.storage.storage.Storage.origin_get_one` visit_date (str): the date the origin was visited (as an isoformatted string) revision (dict): a revision as passed to :func:`swh.storage.storage.Storage.revision_add`, excluding the `id` and `directory` keys (computed from the directory) occurrences (list of dicts): the occurrences to create in the generated origin visit. Each dict contains a 'branch' key with the branch name as value. """ # Shortcut super() as we use different arguments than the DirLoader. - SWHLoader.load(self, tar_path=tar_path, origin=origin, - visit_date=visit_date, revision=revision, - occurrences=occurrences) + return SWHLoader.load(self, tar_path=tar_path, origin=origin, + visit_date=visit_date, revision=revision, + occurrences=occurrences) def prepare(self, *, tar_path, origin, visit_date, revision, occurrences): """1. Uncompress the tarball in a temporary directory. 2. Compute some metadata to update the revision. """ if 'type' not in origin: # let the type flow if present origin['type'] = 'tar' # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision self.log.info('Uncompress %s to %s' % (tar_path, dir_path)) nature = tarball.uncompress(tar_path, dir_path) if 'metadata' not in revision: artifact = utils.convert_to_hex(hashutil.hash_path(tar_path)) artifact['name'] = os.path.basename(tar_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tar_path) revision['metadata'] = { 'original_artifact': [artifact], } super().prepare(dir_path=dir_path, origin=origin, visit_date=visit_date, revision=revision, release=None, occurrences=occurrences) def cleanup(self): """Clean up temporary directory where we uncompress the tarball. """ dir_path = self.dir_path if dir_path and os.path.exists(dir_path): shutil.rmtree(dir_path) diff --git a/swh/loader/tar/tasks.py b/swh/loader/tar/tasks.py index eb45604..5429fcb 100644 --- a/swh/loader/tar/tasks.py +++ b/swh/loader/tar/tasks.py @@ -1,25 +1,26 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from swh.loader.tar.loader import TarLoader class LoadTarRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_tar' def run_task(self, *, tar_path, origin, visit_date, revision, occurrences): """Import a tarball into swh. Args: see :func:`TarLoader.load`. """ loader = TarLoader() loader.log = self.log - loader.load(tar_path=tar_path, origin=origin, visit_date=visit_date, - revision=revision, occurrences=occurrences) + return loader.load(tar_path=tar_path, origin=origin, + visit_date=visit_date, revision=revision, + occurrences=occurrences) diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py index c5e9bac..b728b0a 100644 --- a/swh/loader/tar/utils.py +++ b/swh/loader/tar/utils.py @@ -1,74 +1,74 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import itertools import random from swh.model import hashutil def convert_to_hex(d): """Convert a flat dictionary with bytes in values to the same dictionary with hex as values. Args: dict: flat dictionary with sha bytes in their values. Returns: Mirror dictionary with values as string hex. """ if not d: return d checksums = {} for key, h in d.items(): if isinstance(h, bytes): checksums[key] = hashutil.hash_to_hex(h) else: checksums[key] = h return checksums def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n return itertools.zip_longest(*args, fillvalue=fillvalue) def random_blocks(iterable, block=100, fillvalue=None): """Given an iterable: - slice the iterable in data set of block-sized elements - randomized the data set - yield each element Args: iterable: iterable of data block: number of elements per block fillvalue: a fillvalue for the last block if not enough values in last block Returns: An iterable of randomized per block-size elements. """ count = 0 for iterable in grouper(iterable, block, fillvalue=fillvalue): count += 1 - l = list(iterable) - random.shuffle(l) - for e in l: + lst = list(iterable) + random.shuffle(lst) + for e in lst: yield e diff --git a/version.txt b/version.txt index e86f201..560cb5e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.32-0-gc957097 \ No newline at end of file +v0.0.33-0-g9bcc7f1 \ No newline at end of file