diff --git a/bin/test-archive.sh b/bin/test-archive.sh deleted file mode 100755 index cb3c03f..0000000 --- a/bin/test-archive.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -generated_ar=./emacs-19.34.6-src.tar.gz -original_ar=/home/storage/space/mirrors/gnu.org/old-gnu/emacs/windows/19.34/src/emacs-19.34.6-src.tar.gz - -tmpdir=$(mktemp -d) -tmpdirgnu=$(mktemp -d) - -tar xvf $generated_ar -C $tmpdir -tar xvf $original_ar -C $tmpdirgnu - -echo "diff -r ar:$tmpdir gnu:$tmpdirgnu" -diff -r $tmpdir $tmpdirgnu - - -rm -rf $tmpdir $tmpdirgnu diff --git a/swh/loader/tar/build.py b/swh/loader/tar/build.py index 83f9372..9869990 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,136 +1,101 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.loader.tar import utils # Static setup EPOCH = 0 UTC_OFFSET = 0 SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } REVISION_MESSAGE = 'synthetic revision message' -RELEASE_MESSAGE = 'synthetic release message' REVISION_TYPE = 'tar' def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - url_scheme: scheme to build the origin's url - url_type: origin's type - root_dirpath: the top level root directory path - tarpath: file's absolute path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ relative_path = utils.commonname(root_dirpath, tarpath) return { 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } def occurrence_with_date(date, tarpath): """Compute the occurrence using the tarpath's ctime. Args: authority: the authority's uuid tarpath: file's path Returns: Occurrence dictionary (cf. _build_occurrence) """ return { 'branch': os.path.basename(tarpath), 'date': date } def _time_from_path(tarpath): """Compute the modification time from the tarpath. """ return os.lstat(tarpath).st_mtime def compute_revision(tarpath): """Compute a revision. Args: tarpath: absolute path to the tarball Returns: Revision as dict: - date: the modification timestamp as returned by a fstat call - committer_date: the modification timestamp as returned by a fstat call - author: cf. SWH_PERSON - committer: cf. SWH_PERSON - type: cf. REVISION_TYPE - message: cf. REVISION_MESSAGE """ ts = _time_from_path(tarpath) return { 'date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'committer_date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } - - -def compute_release(filename, tarpath): - """Compute a release from a given tarpath, filename. - If the tarpath does not contain a recognizable release number, the release - can be skipped. - - Args: - filename: file's name without path - tarpath: file's absolute path - - Returns: - None if the release number cannot be extracted from the filename. - Otherwise a synthetic release is computed with the following keys: - - name: the release computed from the filename - - date: the modification timestamp as returned by a fstat call - - offset: 0 - - author_name: '' - - author_email: '' - - comment: '' - - """ - release_number = utils.release_number(filename) - if release_number: - return { - 'name': release_number, - 'date': { - 'timestamp': _time_from_path(tarpath), - 'offset': UTC_OFFSET, - }, - 'author': SWH_PERSON, - 'message': RELEASE_MESSAGE, - } - return None diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 88a9f9b..97b6f2f 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,130 +1,131 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import tempfile import shutil from swh.core import hashutil from swh.loader.dir import loader from swh.loader.tar import tarball, utils class TarLoader(loader.DirLoader): """A tarball loader. """ CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } def __init__(self): super().__init__(logging_class='swh.loader.tar.TarLoader') - def load(self, tarpath, origin, visit, revision, release, occurrences): + def load(self, tarpath, origin, visit, revision, occurrences): """ Load a tarball in backend. This will: - uncompress locally the tarballs in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end Args: - tarpath: path to the tarball to uncompress - origin: Dictionary origin - url: url origin we fetched - type: type of the origin - visit: Numbered visit - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - release: Dictionary of information needed, keys are: - name: release name - date: release timestamp (e.g. 1444054085) - offset: release date offset e.g. -0220, +0100 - author_name: release author's name - author_email: release author's email - comment: release's comment message - occurrences: List of occurrence dictionary. Information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 2015-01-01 00:00:00+00) """ # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision artifact = utils.convert_to_hex(hashutil.hashfile(tarpath)) artifact['name'] = os.path.basename(tarpath) try: self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) nature = tarball.uncompress(tarpath, dir_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tarpath) revision['metadata'] = { 'original_artifact': [artifact], } return super().load( - dir_path, origin, visit, revision, release, occurrences) + dir_path, origin, visit, revision, + release={}, occurrences=occurrences) finally: shutil.rmtree(dir_path) def prepare_and_load(self, - tarpath, origin, revision, release, occurrences): + tarpath, origin, revision, occurrences): """ Prepare origin, fetch_origin, origin_visit Then load a tarball 'tarpath'. Then close origin_visit, fetch_history First: - creates an origin if it does not exist - creates a fetch_history entry - creates an origin_visit - Then loads the tarball """ if 'type' not in origin: # let the type flow if present origin['type'] = 'tar' self.origin_id = self.storage.origin_add_one(origin) origin['id'] = self.origin_id date_visit = datetime.datetime.now(tz=datetime.timezone.utc) origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) visit = origin_visit['visit'] fetch_history_id = self.open_fetch_history() try: - self.load(tarpath, origin, visit, revision, release, occurrences) + self.load(tarpath, origin, visit, revision, occurrences) self.close_fetch_history_success(fetch_history_id) self.storage.origin_visit_update( self.origin_id, origin_visit['visit'], status='full') except: self.close_fetch_history_failure(fetch_history_id) self.storage.origin_visit_update( self.origin_id, origin_visit['visit'], status='partial') raise diff --git a/swh/loader/tar/producer.py b/swh/loader/tar/producer.py index de21c40..7dc0835 100755 --- a/swh/loader/tar/producer.py +++ b/swh/loader/tar/producer.py @@ -1,179 +1,174 @@ #!/usr/bin/env python3 -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import sys from swh.core import config from swh.loader.tar import build, file task_queue = 'swh.loader.tar.tasks.LoadTarRepository' def compute_message_from(app, conf, root_dir, tarpath, filename, retrieval_date, dry_run=False): """Compute and post the message to worker for the archive tarpath. Args: app: instance of the celery app conf: dictionary holding static metadata root_dir: root directory tarball: the archive's representation retrieval_date: retrieval date of information dry_run: will compute but not send messages Returns: None - Raises: - ValueError when release number computation error arise. - """ origin = build.compute_origin(conf['url_scheme'], conf['type'], root_dir, tarpath) revision = build.compute_revision(tarpath) occurrence = build.occurrence_with_date(retrieval_date, tarpath) - release = build.compute_release(filename, tarpath) if not dry_run: app.tasks[task_queue].delay(tarpath, origin, revision, - release, [occurrence]) def produce_archive_messages_from(app, conf, path, retrieval_date, mirror_file=None, dry_run=False): """From path, produce archive tarball messages to celery. Will print error message when some computation arise on archive and continue. Args: app: instance of the celery app conf: dictionary holding static metadata path: top directory to list archives from. retrieval_date: retrieval date of information mirror_file: a filtering file of tarballs to load dry_run: will compute but not send messages Returns: None Raises: None """ limit = conf['limit'] block = int(conf['block_messages']) count = 0 path_source_tarballs = mirror_file if mirror_file else path for tarpath, fname in file.random_archives_from(path_source_tarballs, block, limit): try: compute_message_from(app, conf, path, tarpath, fname, retrieval_date, dry_run) count += 1 except ValueError: print('Problem with the following archive: %s' % tarpath) return count def load_config(conf_file): """Load the configuration from file. Args: conf_file: path to a configuration file with the following content: [main] # mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ # origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/gnu/ # origin type used for those tarballs type = ftp # For tryouts purposes (no limit if not specified) limit = 1 Returns: dictionary of data present in the configuration file. """ conf = config.read(conf_file, default_conf={'limit': ('int', None)}) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': conf.update({ 'url_scheme': url_scheme[0:-1] }) if mirror_dir[-1] == '/': conf.update({ 'mirror_root_directory': mirror_dir[0:-1] }) return conf def parse_args(): """Parse the configuration from the cli. """ cli = argparse.ArgumentParser( description='Tarball producer of local fs tarballs.') cli.add_argument('--dry-run', '-n', action='store_true', help='Dry run (print repo only)') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': args = parse_args() config_file = args.config if not config_file: print('Missing configuration file option.') sys.exit(1) # instantiate celery app with its configuration from swh.scheduler.celery_backend.config import app from swh.loader.tar import tasks # noqa conf = load_config(config_file) retrieval_date = conf['date'] nb_tarballs = produce_archive_messages_from( app, conf, conf['mirror_root_directory'], retrieval_date, conf.get('mirror_subset_archives'), args.dry_run) print('%s tarball(s) sent to worker.' % nb_tarballs) diff --git a/swh/loader/tar/tasks.py b/swh/loader/tar/tasks.py index c6ea825..34a5558 100644 --- a/swh/loader/tar/tasks.py +++ b/swh/loader/tar/tasks.py @@ -1,27 +1,28 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from swh.loader.tar.loader import TarLoader class LoadTarRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_tar' - def run(self, tarpath, origin, revision, release, occurrences): + def run(self, tarpath, origin, revision, occurrences): """Import a tarball into swh. Args: - tarpath: path to a tarball file - origin, revision, release, occurrences: cf. swh.loader.dir.loader.run docstring """ - TarLoader().prepare_and_load( - tarpath, origin, revision, release, occurrences) + loader = TarLoader() + loader.log = self.log + loader.prepare_and_load(tarpath, origin, revision, occurrences) diff --git a/swh/loader/tar/tests/test_build.py b/swh/loader/tar/tests/test_build.py index e4a7763..b06bed3 100644 --- a/swh/loader/tar/tests/test_build.py +++ b/swh/loader/tar/tests/test_build.py @@ -1,106 +1,71 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from unittest.mock import patch from swh.loader.tar import build class TestBuildUtils(unittest.TestCase): @istest def compute_origin(self): # given expected_origin = { 'url': 'rsync://some/url/package-foo', 'type': 'rsync', } # when actual_origin = build.compute_origin( 'rsync://some/url/', 'rsync', '/some/root/path/', '/some/root/path/package-foo/package-foo-1.2.3.tgz') # then self.assertEquals(actual_origin, expected_origin) @istest def occurrence_with_date(self): # given expected_occurrence = { 'branch': b'package-bar.tgz', 'date': '2015-10-22 08:44:47.422384+00' } # when actual_occurrence = build.occurrence_with_date( '2015-10-22 08:44:47.422384+00', b'/path/to/package-bar.tgz',) # then self.assertEquals(actual_occurrence, expected_occurrence) - @istest - def compute_release__no_release(self): - # given - - # when - actual_release = build.compute_release( - 'pack-without-version.tgz', - '/some/path/to/pack-without-version.tgz') - - # then - self.assertIsNone(actual_release) - - @istest - def compute_release(self): - # given - expected_release = { - 'name': '1.2.3rc1', - 'date': { - 'timestamp': 'some-time', - 'offset': build.UTC_OFFSET, - }, - 'author': build.SWH_PERSON, - 'message': build.RELEASE_MESSAGE, - } - - # when - with patch('swh.loader.tar.build._time_from_path', - return_value='some-time'): - actual_release = build.compute_release( - 'foobar-1.2.3rc1.tgz', - '/some/path/to/path-without-version.tgz') - - # then - self.assertEquals(expected_release, actual_release) - @istest def compute_revision(self): # when with patch('swh.loader.tar.build._time_from_path', return_value='some-other-time'): actual_revision = build.compute_revision('/some/path') expected_revision = { 'date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'committer_date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'author': build.SWH_PERSON, 'committer': build.SWH_PERSON, 'type': build.REVISION_TYPE, 'message': build.REVISION_MESSAGE, } # then self.assertEquals(actual_revision, expected_revision)