diff --git a/resources/svn.ini b/resources/svn.ini index 7cfeda0..f834380 100644 --- a/resources/svn.ini +++ b/resources/svn.ini @@ -1,28 +1,24 @@ [main] storage_class = remote_storage storage_args = http://localhost:5000/ send_contents = True send_directories = True send_revisions = True send_releases = True send_occurrences = True # nb of max contents to send for storage (if size threshold not reached before) content_packet_size = 10000 # 100 Mib of content data (size threshold of data before sending for storage) content_packet_block_size_bytes = 104857600 # limit for swh content storage for one blob (beyond that limit, the # content's data is not sent for storage) content_packet_size_bytes = 1073741824 # packet of directories to send for storage directory_packet_size = 25000 # packet of revisions to send for storage revision_packet_size = 10000 # packet of releases to send for storage release_packet_size = 100000 # packet of occurrences to send for storage occurrence_packet_size = 100000 -### for git-svn comparison - -# policy possible: swh, gitsvn -with_policy = swh diff --git a/swh/loader/svn/producer.py b/swh/loader/svn/producer.py index aef1bba..70554ea 100644 --- a/swh/loader/svn/producer.py +++ b/swh/loader/svn/producer.py @@ -1,62 +1,62 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import sys -task_name = 'swh.loader.svn.tasks.LoadSvnRepositoryTsk' +task_name = 'swh.loader.svn.tasks.LoadSWHSvnRepositoryTsk' def libproduce(svn_url, original_svn_url, original_svn_uuid, destination_path=None, synchroneous=False): from swh.scheduler.celery_backend.config import app for module in app.conf.CELERY_IMPORTS: __import__(module) task = app.tasks[task_name] if not synchroneous and svn_url: task.delay(svn_url=svn_url, original_svn_url=original_svn_url, 
original_svn_uuid=original_svn_uuid, destination_path=destination_path) elif synchroneous and svn_url: # for debug purpose task(svn_url=svn_url, original_svn_url=original_svn_url, original_svn_uuid=original_svn_uuid, destination_path=destination_path) else: # synchroneous flag is ignored in that case for svn_url in sys.stdin: svn_url = svn_url.rstrip() if svn_url: print(svn_url) task.delay(svn_url=svn_url, original_svn_url=original_svn_url, original_svn_uuid=original_svn_uuid, destination_path=destination_path) @click.command() @click.option('--svn-url', help="svn repository's mirror url.") @click.option('--original-svn-url', default=None, help='svn repository\'s original remote url ' '(if different than --svn-url).') @click.option('--original-svn-uuid', default=None, help='svn repository\'s original uuid ' '(to provide when using --original-svn-url)') @click.option('--destination-path', help="(optional) svn checkout destination.") @click.option('--synchroneous', is_flag=True, help="To execute directly the svn loading.") def produce(svn_url, original_svn_url, original_svn_uuid, destination_path, synchroneous): libproduce(svn_url, original_svn_url, original_svn_uuid, destination_path, synchroneous) if __name__ == '__main__': produce() diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py index f4a473d..355a1a6 100644 --- a/swh/loader/svn/tasks.py +++ b/swh/loader/svn/tasks.py @@ -1,107 +1,194 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from swh.core import hashutil from swh.loader.core import tasks from .loader import GitSvnSvnLoader, SWHSvnLoader -class LoadSvnRepositoryTsk(tasks.LoaderCoreTask): - """Import one svn repository to Software Heritage. 
+class LoadGitSvnRepositoryTsk(tasks.LoaderCoreTask): + """ + Import one svn repository to Software Heritage with git-svn policy. + + Note: NOT FOR PRODUCTION """ CONFIG_BASE_FILENAME = 'loader/svn.ini' ADDITIONAL_CONFIG = { 'storage_class': ('str', 'remote_storage'), 'storage_args': ('list[str]', ['http://localhost:5000/']), - 'with_policy': ('string', 'swh'), # Default, other possible - # value is 'gitsvn' } - task_queue = 'swh_loader_svn' + task_queue = 'swh_loader_gitsvn' def run(self, *args, **kwargs): """Import a svn repository. Args: - - svn_url: svn's repository url - - destination_path: root directory to locally retrieve svn's data - - swh_revision: Optional extra swh revision to start from. - cf. swh.loader.svn.SvnLoader.process docstring + args: ordered arguments (expected None) + kwargs: Dictionary with the following expected keys: + - svn_url: (mandatory) svn's repository url + - destination_path: (mandatory) root directory to + locally retrieve svn's data + - original_svn_url: (optional) original svn url (if + svn_url is a local mirror for example) + - original_svn_uuid: (optional) original svn uuid (same + use case as previous line) """ destination_path = kwargs['destination_path'] # local svn url svn_url = kwargs['svn_url'] # if original_svn_url is mentioned, this means we load a local mirror original_svn_url = kwargs.get('original_svn_url') # potential uuid overwrite original_svn_uuid = kwargs.get('original_svn_uuid') # Make sure we have all that's needed if original_svn_url and not original_svn_uuid: msg = "When loading a local mirror, you must specify the original repository's uuid." # noqa self.log.error('%s.
Skipping mirror %s' % (msg, svn_url)) return # Determine the origin url origin_url = original_svn_url if original_svn_url else svn_url if 'origin' not in kwargs: # first time, we'll create the origin origin = { 'url': origin_url, 'type': 'svn', } origin['id'] = self.storage.origin_add_one(origin) else: origin = { 'id': kwargs['origin'], 'url': origin_url, 'type': 'svn' } date_visit = datetime.datetime.now(tz=datetime.timezone.utc) origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) origin_visit.update({ 'date': date_visit }) - # Determine which loader to trigger - if self.config['with_policy'] == 'gitsvn': - # this one compute hashes but do not store anywhere - loader = GitSvnSvnLoader(svn_url, destination_path, origin, - svn_uuid=original_svn_uuid) - elif self.config['with_policy'] == 'swh': - # the real production use case with storage and all - loader = SWHSvnLoader(svn_url, destination_path, origin, - svn_uuid=original_svn_uuid) + # this one computes hashes but does not store them anywhere + loader = GitSvnSvnLoader(svn_url, destination_path, origin, + svn_uuid=original_svn_uuid) + + result = loader.load(origin_visit) + + # Check for partial completion to complete state data + if 'completion' in result and result['completion'] == 'partial': + state = result['state'] + state.update({ + 'destination_path': destination_path, + 'svn_url': svn_url, + 'original_svn_url': origin_url, + 'original_svn_uuid': original_svn_uuid, + }) + result['state'] = state + + return result + + +class LoadSWHSvnRepositoryTsk(tasks.LoaderCoreTask): + """Import one svn repository to Software Heritage. + + """ + CONFIG_BASE_FILENAME = 'loader/svn.ini' + + ADDITIONAL_CONFIG = { + 'storage_class': ('str', 'remote_storage'), + 'storage_args': ('list[str]', ['http://localhost:5000/']), + } + + task_queue = 'swh_loader_svn' + + def run(self, *args, **kwargs): + """Import a svn repository with swh policy.
+ + Args: + args: ordered arguments (expected None) + kwargs: Dictionary with the following expected keys: + - svn_url: (mandatory) svn's repository url + - destination_path: (mandatory) root directory to + locally retrieve svn's data + - original_svn_url: (optional) original svn url (if + svn_url is a local mirror for example) + - original_svn_uuid: (optional) original svn uuid (same + use case as previous line) + - swh_revision: (optional) extra SWH revision hex to + start from. cf. swh.loader.svn.SvnLoader.process + docstring + + """ + destination_path = kwargs['destination_path'] + # local svn url + svn_url = kwargs['svn_url'] + # if original_svn_url is mentioned, this means we load a local mirror + original_svn_url = kwargs.get('original_svn_url') + # potential uuid overwrite + original_svn_uuid = kwargs.get('original_svn_uuid') + + # Make sure we have all that's needed + if original_svn_url and not original_svn_uuid: + msg = "When loading a local mirror, you must specify the original repository's uuid." # noqa + self.log.error('%s. Skipping mirror %s' % (msg, svn_url)) + return + + # Determine the origin url + origin_url = original_svn_url if original_svn_url else svn_url + + if 'origin' not in kwargs: # first time, we'll create the origin + origin = { + 'url': origin_url, + 'type': 'svn', + } + origin['id'] = self.storage.origin_add_one(origin) else: - raise ValueError('Only gitsvn or swh policies are supported in' - '\'with_policy\' entry.
' - 'Please adapt your svn.ini file accordingly') + origin = { + 'id': kwargs['origin'], + 'url': origin_url, + 'type': 'svn' + } + + date_visit = datetime.datetime.now(tz=datetime.timezone.utc) + origin_visit = self.storage.origin_visit_add(origin['id'], + date_visit) + + origin_visit.update({ + 'date': date_visit + }) + + # the real production use case with storage and all + loader = SWHSvnLoader(svn_url, destination_path, origin, + svn_uuid=original_svn_uuid) + if 'swh_revision' in kwargs: swh_revision = hashutil.hex_to_hash(kwargs['swh_revision']) else: swh_revision = None result = loader.load(origin_visit, swh_revision) # Check for partial completion to complete state data if 'completion' in result and result['completion'] == 'partial': state = result['state'] state.update({ + 'destination_path': destination_path, 'svn_url': svn_url, 'original_svn_url': origin_url, 'original_svn_uuid': original_svn_uuid, }) result['state'] = state return result