diff --git a/PKG-INFO b/PKG-INFO index ebc0730..11c148e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 0bec5e7..8851d19 100644 --- a/debian/control +++ b/debian/control @@ -1,24 +1,25 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), + python3-swh.model (>= 0.0.15~), python3-swh.scheduler, - python3-swh.storage (>= 0.0.76~), - python3-swh.loader.dir (>= 0.0.24~), + python3-swh.storage (>= 0.0.83~), + python3-swh.loader.dir (>= 0.0.25~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all -Depends: python3-swh.core (>= 0.0.14~), python3-swh.storage (>= 0.0.76~), - python3-swh.loader.dir (>= 0.0.24~), python3-swh.scheduler, +Depends: python3-swh.core (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), + python3-swh.loader.dir (>= 0.0.25~), python3-swh.scheduler, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements-swh.txt b/requirements-swh.txt index cd41b69..15e51e3 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,5 @@ swh.core >= 0.0.14 +swh.model >= 0.0.15 swh.scheduler -swh.storage >= 0.0.76 -swh.loader.dir >= 0.0.24 +swh.storage >= 0.0.83 +swh.loader.dir >= 0.0.25 diff --git a/resources/create-tarball.ini b/resources/create-tarball.ini deleted file mode 100644 index d4f6d6c..0000000 --- a/resources/create-tarball.ini +++ /dev/null @@ -1,4 +0,0 @@ -[main] -storage_class = remote_storage -storage_args = http://localhost:5002/ - diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index ebc0730..11c148e 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ b/swh.loader.tar.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.tar.egg-info/SOURCES.txt b/swh.loader.tar.egg-info/SOURCES.txt index d59bf44..bb77cb1 100644 --- a/swh.loader.tar.egg-info/SOURCES.txt +++ b/swh.loader.tar.egg-info/SOURCES.txt @@ -1,36 +1,35 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README requirements-swh.txt requirements.txt setup.py version.txt debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format -resources/create-tarball.ini resources/loader/tar.ini resources/producer/tar-gnu.ini resources/producer/tar-old-gnu.ini swh.loader.tar.egg-info/PKG-INFO swh.loader.tar.egg-info/SOURCES.txt swh.loader.tar.egg-info/dependency_links.txt swh.loader.tar.egg-info/requires.txt swh.loader.tar.egg-info/top_level.txt swh/loader/tar/__init__.py swh/loader/tar/build.py swh/loader/tar/db.py swh/loader/tar/file.py swh/loader/tar/loader.py swh/loader/tar/producer.py swh/loader/tar/tarball.py swh/loader/tar/tasks.py swh/loader/tar/utils.py swh/loader/tar/tests/test_build.py swh/loader/tar/tests/test_utils.py \ No newline at end of file diff --git a/swh.loader.tar.egg-info/requires.txt b/swh.loader.tar.egg-info/requires.txt index f64313a..a58e0a5 100644 --- a/swh.loader.tar.egg-info/requires.txt +++ b/swh.loader.tar.egg-info/requires.txt @@ -1,8 +1,9 @@ click python-dateutil retrying swh.core>=0.0.14 -swh.loader.dir>=0.0.24 +swh.loader.dir>=0.0.25 +swh.model>=0.0.15 swh.scheduler -swh.storage>=0.0.76 +swh.storage>=0.0.83 vcversioner diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 7d320f4..9d2ac5d 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,100 +1,100 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile import shutil -from swh.core import hashutil from swh.loader.dir import loader from swh.loader.tar import tarball, utils +from swh.model import hashutil class TarLoader(loader.DirLoader): """A tarball loader: - creates an origin if it does not exist - creates a fetch_history entry - creates an origin_visit - uncompress locally the tarballs in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end (success or failure) Args: - tarpath: path to the tarball to uncompress - origin: Dictionary origin - url: url origin we fetched - type: type of the origin - visit_date (str): To override the visit date - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - occurrences: List of occurrence dictionary. Information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 2015-01-01 00:00:00+00) """ CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } def __init__(self): super().__init__(logging_class='swh.loader.tar.TarLoader') def prepare(self, *args, **kwargs): """1. Uncompress the tarball in a temporary directory. 2. Compute some metadata to update the revision. """ tarpath, origin, visit_date, revision, occs = args if 'type' not in origin: # let the type flow if present origin['type'] = 'tar' # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision - artifact = utils.convert_to_hex(hashutil.hashfile(tarpath)) + artifact = utils.convert_to_hex(hashutil.hash_path(tarpath)) artifact['name'] = os.path.basename(tarpath) self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) nature = tarball.uncompress(tarpath, dir_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tarpath) revision['metadata'] = { 'original_artifact': [artifact], } self.dir_path = dir_path super().prepare(dir_path, origin, visit_date, revision, None, occs) def cleanup(self): """Clean up temporary directory where we uncompress the tarball. """ dir_path = self.dir_path if dir_path and os.path.exists(dir_path): shutil.rmtree(dir_path) diff --git a/swh/loader/tar/producer.py b/swh/loader/tar/producer.py index e692b7f..a13765c 100755 --- a/swh/loader/tar/producer.py +++ b/swh/loader/tar/producer.py @@ -1,101 +1,101 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import dateutil.parser from swh.scheduler.utils import get_task from swh.core import config from swh.loader.tar import build, file TASK_QUEUE = 'swh.loader.tar.tasks.LoadTarRepository' def produce_archive_messages_from( conf, root_dir, visit_date, mirror_file=None, dry_run=False): """From root_dir, produce archive tarball messages to celery. Will print error message when some computation arise on archive and continue. Args: conf: dictionary holding static metadata root_dir: top directory to list archives from. visit_date: override origin's visit date of information mirror_file: a filtering file of tarballs to load dry_run: will compute but not send messages Returns: Number of messages generated """ - limit = conf['limit'] + limit = conf.get('limit') block = int(conf['block_messages']) count = 0 path_source_tarballs = mirror_file if mirror_file else root_dir visit_date = dateutil.parser.parse(visit_date) if not dry_run: task = get_task(TASK_QUEUE) for tarpath, _ in file.random_archives_from( path_source_tarballs, block, limit): try: origin = build.compute_origin( conf['url_scheme'], conf['type'], root_dir, tarpath) revision = build.compute_revision(tarpath) occurrence = build.compute_occurrence(tarpath) if not dry_run: task.delay(tarpath, origin, visit_date, revision, [occurrence]) count += 1 except ValueError: print('Problem with the following archive: %s' % tarpath) return count @click.command() @click.option('--config-file', required=1, help='Configuration file path') @click.option('--dry-run/--no-dry-run', default=False, help='Dry run (print repo only)') @click.option('--limit', default=None, help='Number of origins limit to send') def main(config_file, dry_run, limit): """Tarball producer of local fs tarballs. """ conf = config.read(config_file) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': conf['url_scheme'] = url_scheme[0:-1] if mirror_dir[-1] == '/': conf['mirror_root_directory'] = mirror_dir[0:-1] if limit: conf['limit'] = int(limit) nb_tarballs = produce_archive_messages_from( conf=conf, root_dir=conf['mirror_root_directory'], visit_date=conf['date'], mirror_file=conf.get('mirror_subset_archives'), dry_run=dry_run) print('%s tarball(s) sent to worker.' % nb_tarballs) if __name__ == '__main__': main() diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py index 80d4da7..38f222f 100644 --- a/swh/loader/tar/tests/test_utils.py +++ b/swh/loader/tar/tests/test_utils.py @@ -1,56 +1,59 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.loader.tar import utils class TestUtils(unittest.TestCase): @istest def commonname(self): # when actual_commonname = utils.commonname('/some/where/to/', '/some/where/to/go/to') # then self.assertEquals('go/to', actual_commonname) # when actual_commonname2 = utils.commonname(b'/some/where/to/', b'/some/where/to/go/to') # then self.assertEquals(b'go/to', actual_commonname2) @istest def convert_to_hex(self): # given input_dict = { 'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1', # noqa 'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb', # noqa - 'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb'} # noqa + 'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb', # noqa + 'length': 10, + } # noqa expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6' '787d7b944a1', 'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e' '9faceb', 'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3' - '8af26ab08fab113ec282e735df65962'} + '8af26ab08fab113ec282e735df65962', + 'length': 10} # when actual_dict = utils.convert_to_hex(input_dict) # then self.assertDictEqual(actual_dict, expected_dict) @istest def convert_to_hex_edge_cases(self): # when actual_dict = utils.convert_to_hex({}) # then self.assertDictEqual(actual_dict, {}) self.assertIsNone(utils.convert_to_hex(None)) diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py index 67706d6..e0c9230 100644 --- a/swh/loader/tar/utils.py +++ b/swh/loader/tar/utils.py @@ -1,78 +1,81 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import itertools import random -from swh.core import hashutil +from swh.model import hashutil def commonname(path0, path1, as_str=False): """Compute the commonname between the path0 and path1. """ return path1.split(path0)[1] def convert_to_hex(d): """Convert a flat dictionary with bytes in values to the same dictionary with hex as values. Args: dict: flat dictionary with sha bytes in their values. Returns: Mirror dictionary with values as string hex. """ if not d: return d checksums = {} for key, h in d.items(): - checksums[key] = hashutil.hash_to_hex(h) + if isinstance(h, bytes): + checksums[key] = hashutil.hash_to_hex(h) + else: + checksums[key] = h return checksums def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n return itertools.zip_longest(*args, fillvalue=fillvalue) def random_blocks(iterable, block=100, fillvalue=None): """Given an iterable: - slice the iterable in data set of block-sized elements - randomized the data set - yield each element Args: iterable: iterable of data block: number of elements per block fillvalue: a fillvalue for the last block if not enough values in last block Returns: An iterable of randomized per block-size elements. """ count = 0 for iterable in grouper(iterable, block, fillvalue=fillvalue): count += 1 l = list(iterable) random.shuffle(l) for e in l: yield e diff --git a/version.txt b/version.txt index 1683b16..0755384 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.23-0-gd4fe87e \ No newline at end of file +v0.0.24-0-g86e4e26 \ No newline at end of file