diff --git a/debian/control b/debian/control
index 486e9d2..e4e9fe5 100644
--- a/debian/control
+++ b/debian/control
@@ -1,29 +1,29 @@
 Source: swh-loader-svn
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python,
                python3-all,
                python3-nose,
                python3-setuptools,
                python3-swh.core (>= 0.0.19~),
                python3-swh.storage (>= 0.0.50~),
                python3-swh.model (>= 0.0.11~),
                python3-swh.scheduler (>= 0.0.7~),
                python3-swh.loader.core (>= 0.0.11~),
                python3-subvertpy (>= 0.9.4~),
                python3-dateutil, python3-retrying, python3-click, python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDSVN/
 
 Package: python3-swh.loader.svn
 Architecture: all
 Depends: python3-swh.core (>= 0.0.19~),
          python3-swh.storage (>= 0.0.50~),
          python3-swh.model (>= 0.0.11~),
          python3-swh.scheduler (>= 0.0.7~),
          python3-swh.loader.core (>= 0.0.11~),
-         ${misc:Depends}, ${python3:Depends}
+         subversion, pigz, ${misc:Depends}, ${python3:Depends}
 Description: Software Heritage Loader Svn
  Module in charge of loading svn repositories into swh storage.
diff --git a/swh/loader/svn/producer.py b/swh/loader/svn/producer.py
index 70554ea..eb82fbb 100644
--- a/swh/loader/svn/producer.py
+++ b/swh/loader/svn/producer.py
@@ -1,62 +1,104 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import click
 import sys
 
 
-task_name = 'swh.loader.svn.tasks.LoadSWHSvnRepositoryTsk'
+def get_task(task_name):
+    """Retrieve task object in the application by its fully qualified name.
 
-
-def libproduce(svn_url, original_svn_url, original_svn_uuid,
-               destination_path=None, synchroneous=False):
+    """
     from swh.scheduler.celery_backend.config import app
     for module in app.conf.CELERY_IMPORTS:
         __import__(module)
 
-    task = app.tasks[task_name]
+    return app.tasks[task_name]
+
+
+def _produce_svn_to_load(
+        svn_url, original_svn_url, original_svn_uuid,
+        destination_path=None, synchroneous=False,
+        task_name='swh.loader.svn.tasks.LoadSWHSvnRepositoryTsk'):
+    """Produce svn urls on the message queue.
+
+    Those urls can either be read from stdin or directly passed as argument.
+
+    """
+    task = get_task(task_name)
 
     if not synchroneous and svn_url:
         task.delay(svn_url=svn_url,
                    original_svn_url=original_svn_url,
                    original_svn_uuid=original_svn_uuid,
                    destination_path=destination_path)
     elif synchroneous and svn_url:  # for debug purpose
         task(svn_url=svn_url,
              original_svn_url=original_svn_url,
              original_svn_uuid=original_svn_uuid,
              destination_path=destination_path)
     else:  # synchroneous flag is ignored in that case
         for svn_url in sys.stdin:
             svn_url = svn_url.rstrip()
             if svn_url:
                 print(svn_url)
                 task.delay(svn_url=svn_url,
                            original_svn_url=original_svn_url,
                            original_svn_uuid=original_svn_uuid,
                            destination_path=destination_path)
 
 
-@click.command()
-@click.option('--svn-url',
+def _produce_archive_to_mount_and_load(
+        archive_path,
+        task_name='swh.loader.svn.tasks.MountAndLoadSvnRepositoryTsk'):
+    """Produce svn dump archive paths on the message queue.
+
+    Those paths can either be read from stdin or directly passed as argument.
+
+    """
+    task = get_task(task_name)
+    if archive_path:
+        task.delay(archive_path)
+    else:
+        for archive_path in sys.stdin:
+            archive_path = archive_path.rstrip()
+            if archive_path:
+                print(archive_path)
+                task.delay(archive_path)
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command('svn', help='Default svn urls producer')
+@click.option('--url',
               help="svn repository's mirror url.")
-@click.option('--original-svn-url', default=None,
+@click.option('--original-url', default=None,
               help='svn repository\'s original remote url '
-                   '(if different than --svn-url).')
+                   '(if different than --url).')
-@click.option('--original-svn-uuid', default=None,
+@click.option('--original-uuid', default=None,
               help='svn repository\'s original uuid '
-                   '(to provide when using --original-svn-url)')
+                   '(to provide when using --original-url)')
 @click.option('--destination-path',
               help="(optional) svn checkout destination.")
 @click.option('--synchroneous', is_flag=True,
               help="To execute directly the svn loading.")
-def produce(svn_url, original_svn_url, original_svn_uuid, destination_path,
-            synchroneous):
-    libproduce(svn_url, original_svn_url, original_svn_uuid, destination_path,
-               synchroneous)
+def produce_svn_to_load(url, original_url, original_uuid, destination_path,
+                        synchroneous):
+    _produce_svn_to_load(url, original_url, original_uuid,
+                         destination_path=destination_path,
+                         synchroneous=synchroneous)
+
+
+@cli.command('svn-archive', help='Default svndump archive producer')
+@click.option('--path', help="Archive's Path to load and mount")
+def produce_archive_to_mount_and_load(path):
+    _produce_archive_to_mount_and_load(path)
 
 
 if __name__ == '__main__':
-    produce()
+    cli()
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
index 6f38346..7834224 100644
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -1,31 +1,60 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import shutil
+
+from os.path import basename
+
 from swh.scheduler.task import Task
 
 from .loader import SWHSvnLoader
+from . import utils
 
 
 class LoadSWHSvnRepositoryTsk(Task):
     """Import one svn repository to Software Heritage.
 
     """
     task_queue = 'swh_loader_svn'
 
     def run(self, *args, **kwargs):
         """Import a svn repository with swh policy.
 
         Args:
             args: ordered arguments (expected None)
             kwargs: Dictionary with the following expected keys:
               - svn_url: (mandatory) svn's repository url
               - destination_path: (mandatory) root directory to
                 locally retrieve svn's data
               - swh_revision: (optional) extra SWH revision hex to
                 start from. cf. swh.loader.svn.SvnLoader.process
                 docstring
 
         """
         SWHSvnLoader().load(*args, **kwargs)
+
+
+class MountAndLoadSvnRepositoryTsk(Task):
+    """Mount an svn dump archive as a local repository, then load it.
+
+    """
+    task_queue = 'swh_mount_and_load_loader_svn'
+
+    def run(self, archive_path):
+        """1. Mount an svn dump from archive as a local svn repository.
+           2. Load it through the svn loader.
+           3. Clean up mounted svn repository archive.
+        """
+        self.log.info('Archive to mount and load %s' % archive_path)
+        temp_dir, repo_path = utils.init_svn_repo_from_archive_dump(
+            archive_path)
+        self.log.debug('Mounted svn repository to %s' % repo_path)
+        try:
+            SWHSvnLoader().load(svn_url='file://%s' % repo_path,
+                                destination_path=None)
+        finally:
+            self.log.debug('Clean up temp directory %s for project %s' % (
+                temp_dir, basename(repo_path)))
+            shutil.rmtree(temp_dir)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
index 22842fa..73161e3 100644
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -1,128 +1,174 @@
 # Copyright (C) 2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
+import tempfile
+import shutil
 
 from dateutil import parser
+from subprocess import PIPE, CalledProcessError, Popen, check_call
+
 
 from swh.model import git
 
 
 def strdate_to_timestamp(strdate):
     """Convert a string date to an int timestamp.
 
     Args:
         strdate: A string representing a date with format like
         'YYYY-mm-DDTHH:MM:SS.800722Z'
 
     Returns:
         A timestamp in float
 
     """
     if strdate:
         dt = parser.parse(strdate)
         ts_float = dt.timestamp()
     else:  # epoch
         ts_float = 0
     return ts_float
 
 
 def convert_hashes_with_relative_path(hashes, rootpath):
     """A function to ease the transformation of absolute path to relative ones.
 
     This is an implementation detail:
     - swh.loader.svn.ra compute hashes and store keys with relative paths
     - swh.model.git compute hashes and store keys with full paths
 
     """
     if rootpath.endswith(b'/'):
         rootpath = rootpath[:-1]
 
     root_value = hashes.pop(rootpath)
 
     if not rootpath.endswith(b'/'):
         rootpath = rootpath + b'/'
 
     def _replace_slash(s, rootpath=rootpath):
         return s.replace(rootpath, b'')
 
     def _update_children(children):
         return set((_replace_slash(c) for c in children))
 
     h = {
         b'': {
             'checksums': root_value['checksums'],
             'children': _update_children(root_value['children'])
         }
     }
 
     for path, v in hashes.items():
         p = _replace_slash(path)
         if 'children' in v:
             v['children'] = _update_children(v['children'])
         h[p] = v
 
     return h
 
 
 def hashtree(path, ignore_empty_folder=False, ignore=None):
     """Given a path and options, compute the hash's upper tree.
 
     This is not for production use.
     It's merely a helper function used mainly in bin/swh-hashtree
 
     Args:
         - path: The path to hash
         - ignore_empty_folder: An option to ignore empty folder
         - ignore: An option to ignore patterns in directory names.
 
     Returns:
         The path's checksums respecting the options passed as parameters.
 
     """
     if os.path.exists(path):
         if not os.path.isdir(path):
             raise ValueError('%s should be a directory!' % path)
     else:
         raise ValueError('%s should exist!' % path)
 
     if isinstance(path, str):
         path = path.encode('utf-8')
 
     if ignore:
         patterns = []
         for exc in ignore:
             patterns.append(exc.encode('utf-8'))
 
         def dir_ok_fn_basic(dirpath, patterns=patterns):
             dname = os.path.basename(dirpath)
             for pattern_to_ignore in patterns:
                 if pattern_to_ignore == dname:
                     return False
                 if (pattern_to_ignore + b'/') in dirpath:
                     return False
             return True
 
         if ignore_empty_folder:
             def dir_ok_fn(dirpath, patterns=patterns):
                 if not dir_ok_fn_basic(dirpath):
                     return False
                 return os.listdir(dirpath) != []
         else:
             dir_ok_fn = dir_ok_fn_basic
     else:
         if ignore_empty_folder:
             def dir_ok_fn(dirpath):
                 return os.listdir(dirpath) != []
         else:
             dir_ok_fn = git.default_validation_dir
 
     objects = git.compute_hashes_from_directory(
         path,
         dir_ok_fn=dir_ok_fn)
 
     h = objects[path]['checksums']
 
     return h
+
+
+def init_svn_repo_from_archive_dump(archive_path, root_temp_dir='/tmp'):
+    """Given a path to an archive containing an svn dump.
+    Initialize an svn repository with the content of said dump.
+
+    Returns:
+        A tuple:
+        - temporary folder: containing the mounted repository
+        - repo_path, path to the mounted repository inside the temporary folder
+
+    Raises:
+        ValueError in case of failure to run the commands to create the
+        repository, uncompress or load the dump. The temporary folder
+        is removed before raising.
+
+    """
+    project_name = os.path.basename(os.path.dirname(archive_path))
+    temp_dir = tempfile.mkdtemp(suffix='.swh.loader.svn',
+                                prefix='tmp.',
+                                dir=root_temp_dir)
+    repo_path = os.path.join(temp_dir, project_name)
+
+    try:
+        # create the repository that will be loaded with the dump
+        check_call(['svnadmin', 'create', repo_path])
+
+        with Popen(['pigz', '-dc', archive_path], stdout=PIPE) as dump:
+            check_call(['svnadmin', 'load', '-q', repo_path],
+                       stdin=dump.stdout)
+
+        # leaving the with-block waited for pigz: a failed decompression
+        # must not pass for a successfully mounted repository
+        if dump.returncode != 0:
+            raise CalledProcessError(dump.returncode, dump.args)
+    except (CalledProcessError, OSError) as e:
+        # failure, so we clean up
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        raise ValueError('Failed to mount the svn dump for project %s' %
+                         project_name) from e
+
+    return temp_dir, repo_path