diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer index 5f944c0..dc9bf16 100755 --- a/bin/swh-loader-tar-producer +++ b/bin/swh-loader-tar-producer @@ -1,174 +1,176 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import sys from swh.core import config from swh.loader.tar import build, file def compute_message_from(app, conf, root_dir, tarpath, filename, - dry_run=False): + retrieval_date, dry_run=False): """Compute and post the message to worker for the archive tarpath. Args: app: instance of the celery app conf: dictionary holding static metadata root_dir: root directory tarball: the archive's representation + retrieval_date: retrieval date of information dry_run: will compute but not send messages Returns: None Raises: ValueError when release number computation error arise. """ origin = build.compute_origin(conf['url_scheme'], conf['type'], root_dir, tarpath) revision = build.compute_revision(tarpath) - occurrences = [build.occurrence_with_mtime(GNU_AUTHORITY, tarpath), - build.occurrence_with_ctime(SWH_AUTHORITY, tarpath)] + occurrence = build.occurrence_with_date(retrieval_date, tarpath) release = build.compute_release(filename, tarpath) if not dry_run: app.tasks['swh.loader.tar.tasks.LoadTarRepository'].delay(tarpath, origin, revision, release, - occurrences) + [occurrence]) def produce_archive_messages_from(app, conf, path, + retrieval_date, mirror_file=None, dry_run=False): """From path, produce archive tarball messages to celery. Will print error message when some computation arise on archive and continue. Args: app: instance of the celery app conf: dictionary holding static metadata path: top directory to list archives from. 
+ retrieval_date: retrieval date of information mirror_file: a filtering file of tarballs to load dry_run: will compute but not send messages Returns: None Raises: None """ limit = conf['limit'] block = int(conf['block_messages']) count = 0 path_source_tarballs = mirror_file if mirror_file else path for tarpath, fname in file.random_archives_from(path_source_tarballs, block, limit): try: - compute_message_from(app, conf, path, tarpath, fname, dry_run) + compute_message_from(app, conf, path, tarpath, fname, + retrieval_date, dry_run) count += 1 except ValueError: print('Problem with the following archive: %s' % tarpath) return count def load_config(conf_file): """Load the configuration from file. Args: conf_file: path to a configuration file with the following content: [main] # mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ # origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/gnu/ # origin type used for those tarballs type = ftp # For tryouts purposes (no limit if not specified) limit = 1 Returns: dictionary of data present in the configuration file. """ conf = config.read(conf_file, default_conf={'limit': ('int', None)}) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': conf.update({ 'url_scheme': url_scheme[0:-1] }) if mirror_dir[-1] == '/': conf.update({ 'mirror_root_directory': mirror_dir[0:-1] }) return conf def parse_args(): """Parse the configuration from the cli. 
""" cli = argparse.ArgumentParser( description='Tarball producer of local fs tarballs.') cli.add_argument('--dry-run', '-n', action='store_true', help='Dry run (print repo only)') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': args = parse_args() config_file = args.config if not config_file: print('Missing configuration file option.') sys.exit(1) # instantiate celery app with its configuration - from swh.core.worker import app + from swh.scheduler.celery_backend.config import app from swh.loader.tar import tasks # noqa conf = load_config(config_file) - # state... - SWH_AUTHORITY = conf['swh_authority'] - GNU_AUTHORITY = conf['gnu_authority'] + retrieval_date = conf['date'] nb_tarballs = produce_archive_messages_from( app, conf, conf['mirror_root_directory'], + retrieval_date, conf.get('mirror_subset_archives'), args.dry_run) print('%s tarball(s) sent to worker.' % nb_tarballs) diff --git a/resources/producer/tar-gnu.ini b/resources/producer/tar-gnu.ini index a13f954..98e01c6 100644 --- a/resources/producer/tar-gnu.ini +++ b/resources/producer/tar-gnu.ini @@ -1,31 +1,30 @@ [main] # Mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ # Origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/gnu/ # Origin type used for tarballs type = ftp # File containing a subset list tarballs from mirror_root_directory to load. # The file's format is one absolute path name to a tarball per line. # NOTE: # - This file must contain data consistent with the mirror_root_directory # - if this option is not provided, the mirror_root_directory is scanned # completely as usual # mirror_subset_archives = /home/storage/missing-archives -# Authorities -gnu_authority = 4706c92a-8173-45d9-93d7-06523f249398 -swh_authority = 5f4d4c51-498a-4e28-88b3-b3e4e8396cba +# Retrieval date information (rsync, etc...) 
+date = Fri, 28 Aug 2015 13:13:26 +0200 # Randomize blocks of messages and send for consumption block_messages = 250 # DEV options # Tryouts purposes (no limit if not specified) # limit = 10 diff --git a/resources/producer/tar-old-gnu.ini b/resources/producer/tar-old-gnu.ini index 4906232..db5cec1 100644 --- a/resources/producer/tar-old-gnu.ini +++ b/resources/producer/tar-old-gnu.ini @@ -1,31 +1,30 @@ [main] # Mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/old-gnu/ # Origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/old-gnu/ # Origin type used for tarballs type = ftp # File containing a subset list tarballs from mirror_root_directory to load. # The file's format is one absolute path name to a tarball per line. # NOTE: # - This file must contain data consistent with the mirror_root_directory # - if this option is not provided, the mirror_root_directory is scanned # completely as usual -# mirror_subset_archives = /home/storage/missing-archives +# mirror_subset_archives = /home/tony/work/inria/repo/swh-environment/swh-loader-tar/old-gnu-missing -# Authorities -gnu_authority = 4706c92a-8173-45d9-93d7-06523f249398 -swh_authority = 5f4d4c51-498a-4e28-88b3-b3e4e8396cba +# Retrieval date information (rsync, etc...) 
+date = Fri, 28 Aug 2015 13:13:26 +0200 # Randomize blocks of messages and send for consumption -block_messages = 250 +block_messages = 100 # DEV options # Tryouts purposes (no limit if not specified) -# limit = 10 +#limit = 10 diff --git a/swh/loader/tar/build.py b/swh/loader/tar/build.py index 84d2a8c..1ee03b7 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,172 +1,134 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - import os -import datetime from swh.loader.tar import utils # Static setup EPOCH = 0 UTC_OFFSET = '+0000' SWH_PERSON = 'Software Heritage' SWH_MAIL = 'robot@softwareheritage.org' REVISION_MESSAGE = 'synthetic revision message' RELEASE_MESSAGE = 'synthetic release message' REVISION_TYPE = 'tar' def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - url_scheme: scheme to build the origin's url - url_type: origin's type - root_dirpath: the top level root directory path - tarpath: file's absolute path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ relative_path = utils.commonname(root_dirpath, tarpath) return { 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } -def _build_occurrence(tarpath, authority_id, validity_ts): - """Build an occurrence from branch_name, authority_id and validity_ts. - - Args: - - tarpath: file's path - - authority_id: swh authority id (as per swh's storage values in - organization table) - - validity_ts: validity timestamp - - Returns: - Occurrence dictionary - - tarpath: file's path - - authority: swh authority - - validity: validity date (e.g. 
2015-01-01 00:00:00+00) - """ - validity = '%s+00' % datetime.datetime.utcfromtimestamp(validity_ts) - return { - 'branch': os.path.basename(tarpath), - 'authority': authority_id, - 'validity': validity - } - - -def occurrence_with_ctime(authority, tarpath): +def occurrence_with_date(date, tarpath): - """Compute the occurrence using the tarpath's ctime. + """Compute the occurrence of the tarpath at the given date. Args: - authority: the authority's uuid + date: the occurrence's date, stored unchanged under 'date' tarpath: file's path Returns: - Occurrence dictionary (cf. _build_occurrence) + Occurrence dictionary with keys 'branch' and 'date' """ - validity_ts = os.lstat(tarpath).st_ctime - return _build_occurrence(tarpath, authority, validity_ts) + return { + 'branch': os.path.basename(tarpath), + 'date': date + } def _time_from_path(tarpath): """Compute the modification time from the tarpath. """ return os.lstat(tarpath).st_mtime -def occurrence_with_mtime(authority, tarpath): - """Compute the occurrence from the tarpath using the tarpath's mtime. - - Args: - authority: the authority's uuid - tarpath: file's path - - Return: - Occurrence dictionary (cf. _build_occurrence) - - """ - validity_ts = _time_from_path(tarpath) - return _build_occurrence(tarpath, authority, validity_ts) - - def compute_revision(tarpath): """Compute a revision. Args: tarpath: absolute path to the tarball Returns: Revision as dict: - author_date: the modification timestamp as returned by a fstat call - author_offset: +0000 - committer_date: the modification timestamp as returned by a fstat call - committer_offset: +0000 - author_name: cf. SWH_PERSON - author_email: cf. SWH_MAIL - committer_name: cf. SWH_MAIL - committer_email: cf. SWH_MAIL - type: cf. REVISION_TYPE - message: cf. 
REVISION_MESSAGE """ ts = _time_from_path(tarpath) return { 'author_date': ts, 'author_offset': UTC_OFFSET, 'committer_date': ts, 'committer_offset': UTC_OFFSET, 'author_name': SWH_PERSON, 'author_email': SWH_MAIL, 'committer_name': SWH_PERSON, 'committer_email': SWH_MAIL, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } def compute_release(filename, tarpath): """Compute a release from a given tarpath, filename. If the tarpath does not contain a recognizable release number, the release can be skipped. Args: filename: file's name without path tarpath: file's absolute path Returns: None if the release number cannot be extracted from the filename. Otherwise a synthetic release is computed with the following keys: - name: the release computed from the filename - date: the modification timestamp as returned by a fstat call - offset: +0000 - author_name: '' - author_email: '' - comment: '' """ release_number = utils.release_number(filename) if release_number: return { 'name': release_number, 'date': _time_from_path(tarpath), 'offset': UTC_OFFSET, 'author_name': SWH_PERSON, 'author_email': SWH_MAIL, 'comment': RELEASE_MESSAGE, } return None diff --git a/swh/loader/tar/tests/test_build.py b/swh/loader/tar/tests/test_build.py index e677ac8..d51afbc 100644 --- a/swh/loader/tar/tests/test_build.py +++ b/swh/loader/tar/tests/test_build.py @@ -1,106 +1,103 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from unittest.mock import patch from swh.loader.tar import build class TestBuildUtils(unittest.TestCase): @istest def compute_origin(self): # given expected_origin = { 'url': 'rsync://some/url/package-foo', 'type': 'rsync', } # when actual_origin = build.compute_origin( 'rsync://some/url/', 'rsync', '/some/root/path/', 
'/some/root/path/package-foo/package-foo-1.2.3.tgz') # then self.assertEquals(actual_origin, expected_origin) @istest - def _build_occurrence(self): + def occurrence_with_date(self): # given expected_occurrence = { 'branch': b'package-bar.tgz', - 'authority': 3, - 'validity': '2015-10-22 08:44:47.422384+00' + 'date': '2015-10-22 08:44:47.422384+00' } # when - actual_occurrence = build._build_occurrence( - b'/path/to/package-bar.tgz', - authority_id=3, - validity_ts=1445503487.4223838) + actual_occurrence = build.occurrence_with_date( + '2015-10-22 08:44:47.422384+00', b'/path/to/package-bar.tgz',) # then self.assertEquals(actual_occurrence, expected_occurrence) @istest def compute_release__no_release(self): # given # when actual_release = build.compute_release( 'pack-without-version.tgz', '/some/path/to/pack-without-version.tgz') # then self.assertIsNone(actual_release) @istest def compute_release(self): # given expected_release = { 'name': '1.2.3rc1', 'date': 'some-time', 'offset': build.UTC_OFFSET, 'author_name': build.SWH_PERSON, 'author_email': build.SWH_MAIL, 'comment': build.RELEASE_MESSAGE, } # when with patch('swh.loader.tar.build._time_from_path', return_value='some-time'): actual_release = build.compute_release( 'foobar-1.2.3rc1.tgz', '/some/path/to/path-without-version.tgz') # then self.assertEquals(expected_release, actual_release) @istest def compute_revision(self): # when with patch('swh.loader.tar.build._time_from_path', return_value='some-other-time'): actual_revision = build.compute_revision('/some/path') expected_revision = { 'author_date': 'some-other-time', 'author_offset': build.UTC_OFFSET, 'committer_date': 'some-other-time', 'committer_offset': build.UTC_OFFSET, 'author_name': build.SWH_PERSON, 'author_email': build.SWH_MAIL, 'committer_name': build.SWH_PERSON, 'committer_email': build.SWH_MAIL, 'type': build.REVISION_TYPE, 'message': build.REVISION_MESSAGE, } # then self.assertEquals(actual_revision, expected_revision)