# NOTE: this content was recovered from a whitespace-collapsed git diff that
# touched two modules.  The post-patch state of each module is reproduced
# below, one section per file, in the order the diff listed them.

# ---------------------------------------------------------------------------
# swh/loader/svn/loader.py
# ---------------------------------------------------------------------------

# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime

from swh.core import utils
from swh.model import git
from swh.model.git import GitType

from swh.loader.svn import libloader, svn, converters


class SvnLoader(libloader.SWHLoader):
    """Svn loader to load one svn repository.

    """

    def __init__(self, config):
        super().__init__(config,
                         revision_type='svn',
                         logging_class='swh.loader.svn.SvnLoader')

    def check_history_not_altered(self, svnrepo, revision_start, swh_rev):
        """Given a svn repository, check if the history was not tampered with.

        The swh revision matching svn revision `revision_start` is
        recomputed from the repository and its identifier is compared with
        the identifier of the already known revision `swh_rev`.

        Args:
            - svnrepo: the svn repository wrapper to read from
            - revision_start: svn revision to recompute
            - swh_rev: Dictionary of the known swh revision, with keys
              - id: identifier the recomputed revision must match
              - parents: parents used to rebuild the revision

        Returns:
            True if the recomputed identifier equals swh_rev['id'], False
            otherwise.

        """
        revision_id = swh_rev['id']
        parents = swh_rev['parents']
        # Same start and end: only `revision_start` is requested.
        hash_data_per_revs = svnrepo.swh_hash_data_per_revision(
            revision_start, revision_start)
        rev, _, commit, objects_per_path = list(hash_data_per_revs)[0]

        dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
        swh_revision = converters.build_swh_revision(svnrepo.uuid,
                                                     commit,
                                                     rev,
                                                     dir_id,
                                                     parents)
        swh_revision_id = git.compute_revision_sha1_git(swh_revision)

        return swh_revision_id == revision_id

    def process_revisions(self, svnrepo, revision_start, revision_end,
                          revision_parents):
        """Process revisions from revision_start to revision_end and send to
        swh for storage.

        At each svn revision, checkout the repository, compute the tree
        hash and blobs and send for swh storage to store.
        Then computes and yields the swh revision.

        Args:
            - svnrepo: the svn repository wrapper to read from
            - revision_start: first svn revision to process
            - revision_end: last svn revision to process
            - revision_parents: Dictionary mapping a svn revision to the
              list of its swh parent revision ids.  It must hold an entry
              for revision_start and is updated in place with the parents
              of each following revision.

        Yields:
            swh revision

        """
        revisions_data = svnrepo.swh_hash_data_per_revision(
            revision_start, revision_end)

        for rev, nextrev, commit, objects_per_path in revisions_data:
            objects_per_type = {
                GitType.BLOB: [],
                GitType.TREE: [],
                GitType.COMM: [],
                GitType.RELE: [],
                GitType.REFS: [],
            }

            # compute the fs tree's checksums
            dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
            swh_revision = converters.build_swh_revision(
                svnrepo.uuid, commit, rev, dir_id, revision_parents[rev])
            swh_revision['id'] = git.compute_revision_sha1_git(swh_revision)

            # the revision just built is the sole parent of the next one
            if nextrev:
                revision_parents[nextrev] = [swh_revision['id']]

            # group every object of this revision by its type before sending
            for tree_path in objects_per_path:
                objs = objects_per_path[tree_path]
                for obj in objs:
                    objects_per_type[obj['type']].append(obj)

            self.load(objects_per_type, objects_per_path, svnrepo.origin_id)

            yield swh_revision

    def process(self, svn_url, origin, destination_path):
        """Load a svn repository in swh.

        Checkout the svn repository locally in destination_path.

        Args:
            - svn_url: svn repository url to import
            - origin: Dictionary origin
              - id: origin's id
              - url: url origin we fetched
              - type: type of the origin
            - destination_path: local path handed to the svn repository
              wrapper for the checkout

        Returns:
            Dictionary with the following keys:
            - status: mandatory, the status result as a boolean
            - stderr: optional when status is True, mandatory otherwise

        """
        svnrepo = svn.SvnRepo(svn_url, origin['id'], self.storage,
                              destination_path)

        try:
            swh_rev = svnrepo.swh_previous_revision()

            if swh_rev:
                # resume from the svn revision recorded on the last known
                # swh revision
                extra_headers = dict(swh_rev['metadata']['extra_headers'])
                revision_start = extra_headers['svn_revision']
                revision_parents = {
                    revision_start: swh_rev['parents']
                }
            else:
                revision_start = 1
                revision_parents = {
                    revision_start: []
                }

            svnrepo.fork(revision_start)
            self.log.debug('svn co %s@%s' % (svn_url, revision_start))

            if swh_rev and not self.check_history_not_altered(
                    svnrepo, revision_start, swh_rev):
                msg = 'History of svn %s@%s history modified. Skipping...' % (
                    svn_url, revision_start)
                self.log.warning(msg)
                return {'status': False, 'stderr': msg}

            revision_end = svnrepo.head_revision()

            self.log.debug('[revision_start-revision_end]: [%s-%s]' % (
                revision_start, revision_end))

            # Value comparison (`!=`), not identity (`is not`): identity on
            # an int literal only works by accident of the interpreter's
            # small-integer caching.
            if revision_start == revision_end and revision_start != 1:
                self.log.info('%s@%s already injected.' % (
                    svn_url, revision_end))
                return {'status': True}

            self.log.info('Repo %s ready to be processed.' % svnrepo)

            # process and store revision to swh (sent by blocks of
            # 'revision_packet_size')
            latest_revision = None
            for revisions in utils.grouper(
                    self.process_revisions(svnrepo,
                                           revision_start,
                                           revision_end,
                                           revision_parents),
                    self.config['revision_packet_size']):
                revs = list(revisions)
                self.maybe_load_revisions(revs)
                if revs:
                    latest_revision = revs[-1]

            # create occurrence pointing to the latest revision (the last
            # one); nothing to point to when no revision was produced
            if latest_revision is not None:
                # NOTE(review): utcnow() returns a naive datetime; confirm
                # what the occurrence converter expects before changing it.
                occ = converters.build_swh_occurrence(
                    latest_revision['id'],
                    origin['id'],
                    datetime.datetime.utcnow())
                self.log.debug('occ: %s' % occ)
                self.maybe_load_occurrences([occ])
        finally:
            svnrepo.cleanup()

        return {'status': True}


# ---------------------------------------------------------------------------
# swh/loader/svn/tasks.py
# ---------------------------------------------------------------------------

# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import traceback

from swh.core.config import load_named_config
from swh.scheduler.task import Task
from swh.storage import get_storage

from swh.model.git import GitType
from swh.loader.svn.loader import SvnLoader


DEFAULT_CONFIG = {
    'storage_class': ('str', 'remote_storage'),
    'storage_args': ('list[str]', ['http://localhost:5000/']),
    'send_contents': ('bool', True),
    'send_directories': ('bool', True),
    'send_revisions': ('bool', True),
    'send_releases': ('bool', True),
    'send_occurrences': ('bool', True),
    'content_packet_size': ('int', 10000),
    'content_packet_size_bytes': ('int', 1073741824),
    'directory_packet_size': ('int', 25000),
    'revision_packet_size': ('int', 100),
    'release_packet_size': ('int', 100000),
    'occurrence_packet_size': ('int', 100000),
}


class LoadSvnRepositoryTsk(Task):
    """Import a svn repository to Software Heritage

    """
    task_queue = 'swh_loader_svn'

    @property
    def config(self):
        """Configuration of the task, loaded once and then cached.

        The first access reads 'loader/svn.ini' (falling back on
        DEFAULT_CONFIG) and raises the level of the urllib3 connection
        pool logger to WARNING.

        """
        # `self.__config` is name-mangled inside the class body, so a
        # `hasattr(self, '__config')` check never matches and would reload
        # the configuration on every access.  Reading the attribute itself
        # keeps the lookup and the assignment on the same mangled name.
        try:
            return self.__config
        except AttributeError:
            pass

        self.__config = load_named_config(
            'loader/svn.ini',
            DEFAULT_CONFIG)

        pool_logger = logging.getLogger(
            'requests.packages.urllib3.connectionpool')
        pool_logger.setLevel(logging.WARNING)

        return self.__config

    def open_fetch_history(self, storage, origin_id):
        """Start a fetch history entry for origin_id.

        Returns:
            whatever storage.fetch_history_start returns (used by `run` as
            the fetch history identifier)

        """
        return storage.fetch_history_start(origin_id)

    def close_fetch_history(self, storage, fetch_history_id, res):
        """Close the fetch history entry fetch_history_id.

        Args:
            - storage: storage holding the fetch history
            - fetch_history_id: identifier of the entry to close
            - res: Dictionary result of the loading
              - status: mandatory
              - stderr: optional
              - objects: optional, objects loaded indexed by GitType; when
                present, the number of objects per type is recorded

        Returns:
            whatever storage.fetch_history_end returns

        """
        result = None
        if 'objects' in res:
            result = {
                'contents': len(res['objects'].get(GitType.BLOB, [])),
                'directories': len(res['objects'].get(GitType.TREE, [])),
                'revisions': len(res['objects'].get(GitType.COMM, [])),
                'releases': len(res['objects'].get(GitType.RELE, [])),
                'occurrences': len(res['objects'].get(GitType.REFS, [])),
            }

        data = {
            'status': res['status'],
            'result': result,
            'stderr': res.get('stderr')
        }
        return storage.fetch_history_end(fetch_history_id, data)

    def run(self, svn_url, local_path):
        """Import a svn repository.

        Args:
            cf. swh.loader.svn.SvnLoader.process docstring

        Raises:
            any exception raised by the loading; the failure is recorded
            in the fetch history before the exception is re-raised.

        """
        config = self.config
        storage = get_storage(config['storage_class'], config['storage_args'])

        origin = {'type': 'svn', 'url': svn_url}
        origin['id'] = storage.origin_add_one(origin)

        fetch_history_id = self.open_fetch_history(storage, origin['id'])

        try:
            result = SvnLoader(config).process(svn_url, origin, local_path)
        except Exception as exc:
            # Do not leave the fetch history entry open on failure: record
            # the error, then let the exception propagate as before.
            result = {
                'status': False,
                'stderr': 'reason:%s\ntrace:%s' % (
                    exc,
                    ''.join(traceback.format_tb(exc.__traceback__))),
            }
            self.close_fetch_history(storage, fetch_history_id, result)
            raise

        self.close_fetch_history(storage, fetch_history_id, result)