diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
index 7834224..545480b 100644
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -1,59 +1,61 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import shutil
 
 from os.path import basename
 
 from swh.scheduler.task import Task
 
 from .loader import SWHSvnLoader
 from . import utils
 
 
 class LoadSWHSvnRepositoryTsk(Task):
     """Import one svn repository to Software Heritage.
 
     """
     task_queue = 'swh_loader_svn'
 
     def run(self, *args, **kwargs):
         """Import a svn repository with swh policy.
 
         Args:
             args: ordered arguments (expected None)
             kwargs: Dictionary with the following expected keys:
               - svn_url: (mandatory) svn's repository url
               - destination_path: (mandatory) root directory to
                 locally retrieve svn's data
               - swh_revision: (optional) extra SWH revision hex to start from.
                 cf. swh.loader.svn.SvnLoader.process docstring
 
         """
         SWHSvnLoader().load(*args, **kwargs)
 
 
 class MountAndLoadSvnRepositoryTsk(Task):
     task_queue = 'swh_mount_and_load_loader_svn'
 
     def run(self, archive_path):
         """1. Mount an svn dump from archive as a local svn repository.
            2. Load it through the svn loader.
            3. Clean up mounted svn repository archive.
 
         """
-        self.log.info('Archive to mount and load %s' % archive_path)
-        temp_dir, repo_path = utils.init_svn_repo_from_archive_dump(
-            archive_path)
-        self.log.debug('Mounted svn repository to %s' % repo_path)
+        temp_dir = None
         try:
+            self.log.info('Archive to mount and load %s' % archive_path)
+            temp_dir, repo_path = utils.init_svn_repo_from_archive_dump(
+                archive_path)
+            self.log.debug('Mounted svn repository to %s' % repo_path)
             SWHSvnLoader().load(svn_url='file://%s' % repo_path,
                                 destination_path=None)
-        except Exception as e:
-            raise e
         finally:
-            self.log.debug('Clean up temp directory %s for project %s' % (
-                temp_dir, basename(repo_path)))
-            shutil.rmtree(temp_dir)
+            # temp_dir is still None when mounting failed: the helper already
+            # removed its own temporary directory, nothing is left to clean
+            if temp_dir:
+                self.log.debug('Clean up temp directory %s for project %s' % (
+                    temp_dir, basename(repo_path)))
+                shutil.rmtree(temp_dir)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
index 73161e3..80709ba 100644
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -1,167 +1,193 @@
 # Copyright (C) 2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import tempfile
 import shutil
 
 from dateutil import parser
-from subprocess import PIPE, Popen, check_call
+from subprocess import PIPE, CalledProcessError, Popen, check_call
 
 from swh.model import git
 
 
 def strdate_to_timestamp(strdate):
     """Convert a string date to an int timestamp.
 
     Args:
         strdate: A string representing a date with format like
         'YYYY-mm-DDTHH:MM:SS.800722Z'
 
     Returns:
         A timestamp in float
 
     """
     if strdate:
         dt = parser.parse(strdate)
         ts_float = dt.timestamp()
     else:  # epoch
         ts_float = 0
 
     return ts_float
 
 
 def convert_hashes_with_relative_path(hashes, rootpath):
     """A function to ease the transformation of absolute path to relative ones.
 
     This is an implementation detail:
     - swh.loader.svn.ra compute hashes and store keys with relative paths
     - swh.model.git compute hashes and store keys with full paths
 
     """
     if rootpath.endswith(b'/'):
         rootpath = rootpath[:-1]
 
     root_value = hashes.pop(rootpath)
 
     if not rootpath.endswith(b'/'):
         rootpath = rootpath + b'/'
 
     def _replace_slash(s, rootpath=rootpath):
         return s.replace(rootpath, b'')
 
     def _update_children(children):
         return set((_replace_slash(c) for c in children))
 
     h = {
         b'': {
             'checksums': root_value['checksums'],
             'children': _update_children(root_value['children'])
         }
     }
 
     for path, v in hashes.items():
         p = _replace_slash(path)
         if 'children' in v:
             v['children'] = _update_children(v['children'])
         h[p] = v
 
     return h
 
 
 def hashtree(path, ignore_empty_folder=False, ignore=None):
     """Given a path and options, compute the hash's upper tree.
 
     This is not for production use.
     It's merely a helper function used mainly in bin/swh-hashtree
 
     Args:
         - path: The path to hash
         - ignore_empty_folder: An option to ignore empty folder
         - ignore: An option to ignore patterns in directory names.
 
     Returns:
         The path's checksums respecting the options passed as parameters.
 
     """
     if os.path.exists(path):
         if not os.path.isdir(path):
             raise ValueError('%s should be a directory!' % path)
     else:
         raise ValueError('%s should exist!' % path)
 
     if isinstance(path, str):
         path = path.encode('utf-8')
 
     if ignore:
         patterns = []
         for exc in ignore:
             patterns.append(exc.encode('utf-8'))
 
         def dir_ok_fn_basic(dirpath, patterns=patterns):
             dname = os.path.basename(dirpath)
             for pattern_to_ignore in patterns:
                 if pattern_to_ignore == dname:
                     return False
                 if (pattern_to_ignore + b'/') in dirpath:
                     return False
             return True
 
         if ignore_empty_folder:
             def dir_ok_fn(dirpath, patterns=patterns):
                 if not dir_ok_fn_basic(dirpath):
                     return False
                 return os.listdir(dirpath) != []
         else:
             dir_ok_fn = dir_ok_fn_basic
     else:
         if ignore_empty_folder:
             def dir_ok_fn(dirpath):
                 return os.listdir(dirpath) != []
         else:
             dir_ok_fn = git.default_validation_dir
 
     objects = git.compute_hashes_from_directory(
         path,
         dir_ok_fn=dir_ok_fn)
 
     h = objects[path]['checksums']
 
     return h
 
 
 def init_svn_repo_from_archive_dump(archive_path, root_temp_dir='/tmp'):
     """Given a path to an archive containing an svn dump.
     Initialize an svn repository with the content of said dump.
 
+    Args:
+        archive_path: path to the compressed svn dump (read with `pigz -dc`);
+          the name of its parent directory is used as the project name
+        root_temp_dir: directory in which the temporary folder is created
+
     Returns:
         A tuple:
         - temporary folder: containing the mounted repository
         - repo_path, path to the mounted repository inside the temporary folder
 
     Raises:
-        ValueError in case of failure to run the command to uncompress
-        and load the dump.
+        ValueError when creating the empty repository, uncompressing the
+        archive or loading the dump fails. The temporary folder is removed
+        before any error is propagated.
 
     """
     project_name = os.path.basename(os.path.dirname(archive_path))
     temp_dir = tempfile.mkdtemp(suffix='.swh.loader.svn',
                                 prefix='tmp.',
                                 dir=root_temp_dir)
 
-    repo_path = os.path.join(temp_dir, project_name)
-    # create the repository that will be loaded with the dump
-    cmd = ['svnadmin', 'create', repo_path]
-    check_call(cmd)
-
-    with Popen(['pigz', '-dc', archive_path], stdout=PIPE) as dump:
-        cmd = ['svnadmin', 'load', '-q', repo_path]
-        r = check_call(cmd, stdin=dump.stdout)
-        if r == 0:
-            return temp_dir, repo_path
+    try:
+        repo_path = os.path.join(temp_dir, project_name)
+
+        # create the repository that will be loaded with the dump
+        try:
+            check_call(['svnadmin', 'create', repo_path])
+        except CalledProcessError as e:
+            # check_call raises on a non-zero exit status, it never returns it
+            raise ValueError(
+                'Failed to initialize an empty svn repository for project %s'
+                % project_name) from e
+
+        try:
+            with Popen(['pigz', '-dc', archive_path], stdout=PIPE) as dump:
+                check_call(['svnadmin', 'load', '-q', repo_path],
+                           stdin=dump.stdout)
+        except CalledProcessError as e:
+            raise ValueError(
+                'Failed to mount the svn dump for project %s' %
+                project_name) from e
+
+        # leaving the `with` block waited for pigz: a truncated or corrupted
+        # archive must not be mistaken for a successfully mounted dump
+        if dump.returncode != 0:
+            raise ValueError(
+                'Failed to uncompress the svn dump for project %s' %
+                project_name)
+
+        return temp_dir, repo_path
+    except Exception:
         # failure, so we clean up
         shutil.rmtree(temp_dir)
-        raise ValueError('Failed to mount the svn dump for project %s' %
-                         project_name)
+        # bare raise keeps the original traceback
+        raise