diff --git a/swh/loader/tar/build.py b/swh/loader/tar/build.py index a94bc26..c5ff02d 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,118 +1,116 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.loader.tar import utils # Static setup EPOCH = 0 UTC_OFFSET = 0 SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } REVISION_MESSAGE = 'synthetic revision message' REVISION_TYPE = 'tar' def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - url_scheme: scheme to build the origin's url - url_type: origin's type - root_dirpath: the top level root directory path - tarpath: file's absolute path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ relative_path = utils.commonname(root_dirpath, tarpath) return { 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } -def occurrence_with_date(date, tarpath): +def compute_occurrence(tarpath): """Compute the occurrence using the tarpath's ctime. Args: - authority: the authority's uuid tarpath: file's path Returns: - Occurrence dictionary (cf. _build_occurrence) + Occurrence dictionary. """ return { 'branch': os.path.basename(tarpath), - 'date': date } def _time_from_path(tarpath): """Compute the modification time from the tarpath. Args: tarpath (str|bytes): Full path to the archive to extract the date from. Returns: dict representing a timestamp with keys seconds and microseconds keys. """ mtime = os.lstat(tarpath).st_mtime if isinstance(mtime, float): normalized_time = list(map(int, str(mtime).split('.'))) else: # assuming int normalized_time = [mtime, 0] return { 'seconds': normalized_time[0], 'microseconds': normalized_time[1] } def compute_revision(tarpath): """Compute a revision. Args: tarpath: absolute path to the tarball Returns: Revision as dict: - date (dict): the modification timestamp as returned by _time_from_path function - committer_date: the modification timestamp as returned by _time_from_path function - author: cf. SWH_PERSON - committer: cf. SWH_PERSON - type: cf. REVISION_TYPE - message: cf. REVISION_MESSAGE """ ts = _time_from_path(tarpath) return { 'date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'committer_date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } diff --git a/swh/loader/tar/producer.py b/swh/loader/tar/producer.py index f491f6f..e692b7f 100755 --- a/swh/loader/tar/producer.py +++ b/swh/loader/tar/producer.py @@ -1,102 +1,101 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import dateutil.parser from swh.scheduler.utils import get_task from swh.core import config from swh.loader.tar import build, file TASK_QUEUE = 'swh.loader.tar.tasks.LoadTarRepository' def produce_archive_messages_from( conf, root_dir, visit_date, mirror_file=None, dry_run=False): """From root_dir, produce archive tarball messages to celery. Will print error message when some computation arise on archive and continue. Args: conf: dictionary holding static metadata root_dir: top directory to list archives from. visit_date: override origin's visit date of information mirror_file: a filtering file of tarballs to load dry_run: will compute but not send messages Returns: Number of messages generated """ limit = conf['limit'] block = int(conf['block_messages']) count = 0 path_source_tarballs = mirror_file if mirror_file else root_dir - parsed_visit_date = dateutil.parser.parse(visit_date) + visit_date = dateutil.parser.parse(visit_date) if not dry_run: task = get_task(TASK_QUEUE) for tarpath, _ in file.random_archives_from( path_source_tarballs, block, limit): try: origin = build.compute_origin( conf['url_scheme'], conf['type'], root_dir, tarpath) revision = build.compute_revision(tarpath) - occurrence = build.occurrence_with_date(visit_date, tarpath) + occurrence = build.compute_occurrence(tarpath) if not dry_run: - task.delay(tarpath, origin, parsed_visit_date, revision, - [occurrence]) + task.delay(tarpath, origin, visit_date, revision, [occurrence]) count += 1 except ValueError: print('Problem with the following archive: %s' % tarpath) return count @click.command() @click.option('--config-file', required=1, help='Configuration file path') @click.option('--dry-run/--no-dry-run', default=False, help='Dry run (print repo only)') @click.option('--limit', default=None, help='Number of origins limit to send') def main(config_file, dry_run, limit): """Tarball producer of local fs tarballs. """ conf = config.read(config_file) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': conf['url_scheme'] = url_scheme[0:-1] if mirror_dir[-1] == '/': conf['mirror_root_directory'] = mirror_dir[0:-1] if limit: conf['limit'] = int(limit) nb_tarballs = produce_archive_messages_from( conf=conf, root_dir=conf['mirror_root_directory'], visit_date=conf['date'], mirror_file=conf.get('mirror_subset_archives'), dry_run=dry_run) print('%s tarball(s) sent to worker.' % nb_tarballs) if __name__ == '__main__': main() diff --git a/swh/loader/tar/tests/test_build.py b/swh/loader/tar/tests/test_build.py index 0054997..94e2235 100644 --- a/swh/loader/tar/tests/test_build.py +++ b/swh/loader/tar/tests/test_build.py @@ -1,107 +1,106 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from unittest.mock import patch from swh.loader.tar import build class TestBuildUtils(unittest.TestCase): @istest def compute_origin(self): # given expected_origin = { 'url': 'rsync://some/url/package-foo', 'type': 'rsync', } # when actual_origin = build.compute_origin( 'rsync://some/url/', 'rsync', '/some/root/path/', '/some/root/path/package-foo/package-foo-1.2.3.tgz') # then self.assertEquals(actual_origin, expected_origin) @istest - def occurrence_with_date(self): + def compute_occurrence(self): # given expected_occurrence = { 'branch': b'package-bar.tgz', - 'date': '2015-10-22 08:44:47.422384+00' } # when - actual_occurrence = build.occurrence_with_date( - '2015-10-22 08:44:47.422384+00', b'/path/to/package-bar.tgz',) + actual_occurrence = build.compute_occurrence( + b'/path/to/package-bar.tgz') # then self.assertEquals(actual_occurrence, expected_occurrence) @patch('swh.loader.tar.build._time_from_path') @istest def compute_revision(self, mock_time_from_path): mock_time_from_path.return_value = 'some-other-time' # when actual_revision = build.compute_revision('/some/path') expected_revision = { 'date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'committer_date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'author': build.SWH_PERSON, 'committer': build.SWH_PERSON, 'type': build.REVISION_TYPE, 'message': build.REVISION_MESSAGE, } # then self.assertEquals(actual_revision, expected_revision) mock_time_from_path.assert_called_once_with('/some/path') @patch('swh.loader.tar.build.os') @istest def time_from_path_with_float(self, mock_os): class MockStat: st_mtime = 1445348286.8308342 mock_os.lstat.return_value = MockStat() actual_time = build._time_from_path('some/path') self.assertEquals(actual_time, { 'seconds': 1445348286, 'microseconds': 8308342 }) mock_os.lstat.assert_called_once_with('some/path') @patch('swh.loader.tar.build.os') @istest def time_from_path_with_int(self, mock_os): class MockStat: st_mtime = 1445348286 mock_os.lstat.return_value = MockStat() actual_time = build._time_from_path('some/path') self.assertEquals(actual_time, { 'seconds': 1445348286, 'microseconds': 0 }) mock_os.lstat.assert_called_once_with('some/path')