Restore the line structure of this collapsed patch (swh-loader-svn).

What the patch does:
  * loader.py, svn.py: extend the copyright years to 2015-2016.
  * svn.py: drop the cwd() context manager and SvnRepo.read_uuid(), which
    shelled out to `svn info | grep UUID | cut -f2 -d:`; SvnRepo.fork() now
    reads the uuid through self.client.info(self.local_url).uuid.
  * loader.py (review fix): `revision_start is not 1` becomes
    `revision_start != 1`; identity comparison against an int literal is not
    guaranteed to behave like a value comparison.

NOTE(review): blank lines and line wraps were lost when the patch was
collapsed; they are reconstructed here to fit the hunk headers and must be
checked against the base files before applying.  The `index` line of
loader.py still carries the original blob ids, which no longer describe the
post-image now that the review fix is included.
NOTE(review): SvnLoader.process_swh_revisions reads `revs` after its loop;
if the loop body never runs the name is unbound.  Left untouched here.

diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
index 341e932..e033014 100644
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -1,211 +1,211 @@
-# Copyright (C) 2015 The Software Heritage developers
+# Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 
 from swh.core import utils
 from swh.model import git, hashutil
 from swh.model.git import GitType
 
 from swh.loader.vcs import loader
 from swh.loader.svn import svn, converters
 
 
 def objects_per_type(objects_per_path):
     """Given an object dictionary returned by
     `swh.model.git.walk_and_compute_sha1_from_directory`, return a map
     grouped by type.
 
     Returns:
         Dictionary with keys:
         - GitType.BLOB: list of blobs
         - GitType.TREE: list of directories
 
     """
     objects = {
         GitType.BLOB: [],
         GitType.TREE: [],
     }
     for tree_path in objects_per_path:
         objs = objects_per_path[tree_path]
         for obj in objs:
             objects[obj['type']].append(obj)
 
     return objects
 
 
 class SvnLoader(loader.SWHLoader):
     """Svn loader to load one svn repository.
 
     """
     def __init__(self, config, origin_id):
         super().__init__(config,
                          origin_id,
                          logging_class='swh.loader.svn.SvnLoader')
 
     def check_history_not_altered(self, svnrepo, revision_start, swh_rev):
         """Given a svn repository, check if the history was not tampered with.
 
         """
         revision_id = swh_rev['id']
         parents = swh_rev['parents']
         hash_data_per_revs = svnrepo.swh_hash_data_per_revision(revision_start,
                                                                 revision_start)
 
         rev, _, commit, objects_per_path = list(hash_data_per_revs)[0]
 
         dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
         swh_revision = converters.build_swh_revision(svnrepo.uuid,
                                                      commit,
                                                      rev,
                                                      dir_id,
                                                      parents)
         swh_revision_id = git.compute_revision_sha1_git(swh_revision)
 
         return swh_revision_id == revision_id
 
     def process_svn_revisions(self, svnrepo, revision_start, revision_end,
                               revision_parents):
         """Process revisions from revision_start to revision_end and send to
         swh for storage.
 
         At each svn revision, checkout the repository, compute the
         tree hash and blobs and send for swh storage to store.
         Then computes and yields the swh revision.
 
         Yields:
             swh revision
 
         """
         gen_revs = svnrepo.swh_hash_data_per_revision(revision_start,
                                                       revision_end)
         for rev, nextrev, commit, objects_per_path in gen_revs:
             # compute the fs tree's checksums
             dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
             swh_revision = converters.build_swh_revision(svnrepo.uuid,
                                                          commit,
                                                          rev,
                                                          dir_id,
                                                          revision_parents[rev])
             swh_revision['id'] = git.compute_revision_sha1_git(swh_revision)
             self.log.debug('rev: %s, swhrev: %s' % (
                 rev, hashutil.hash_to_hex(swh_revision['id'])))
 
             if nextrev:
                 revision_parents[nextrev] = [swh_revision['id']]
 
             objects = objects_per_type(objects_per_path)
 
             self.maybe_load_contents(objects[GitType.BLOB])
             self.maybe_load_directories(objects[GitType.TREE],
                                         objects_per_path)
 
             yield swh_revision
 
     def process_swh_revisions(self, svnrepo, revision_start, revision_end,
                               revision_parents):
         """Process and store revision to swh (sent by by blocks of
         'revision_packet_size')
 
         Returns:
             The latest revision stored.
 
         """
         for revisions in utils.grouper(
                 self.process_svn_revisions(svnrepo,
                                            revision_start,
                                            revision_end,
                                            revision_parents),
                 self.config['revision_packet_size']):
             revs = list(revisions)
             self.maybe_load_revisions(revs)
 
         return revs[-1]
 
     def process_swh_occurrence(self, revision, origin):
         """Process and load the occurrence pointing to the latest revision.
 
         """
         occ = converters.build_swh_occurrence(revision['id'],
                                               origin['id'],
                                               datetime.datetime.utcnow())
         self.log.debug('occ: %s' % occ)
         self.maybe_load_occurrences([occ])
 
     def process(self, svn_url, origin, destination_path):
         """Load a svn repository in swh.
 
         Checkout the svn repository locally in destination_path.
 
         Args:
             - svn_url: svn repository url to import
             - origin: Dictionary origin
               - id: origin's id
               - url: url origin we fetched
               - type: type of the origin
 
         Returns:
             Dictionary with the following keys:
             - status: mandatory, the status result as a boolean
             - stderr: optional when status is True, mandatory otherwise
 
         """
         svnrepo = svn.SvnRepo(svn_url,
                               origin['id'],
                               self.storage,
                               destination_path)
 
         try:
             swh_rev = svnrepo.swh_previous_revision()
 
             if swh_rev:
                 extra_headers = dict(swh_rev['metadata']['extra_headers'])
                 revision_start = extra_headers['svn_revision']
                 revision_parents = {
                     revision_start: swh_rev['parents']
                 }
             else:
                 revision_start = 1
                 revision_parents = {
                     revision_start: []
                 }
 
             svnrepo.fork(revision_start)
             self.log.debug('svn co %s@%s' % (svn_url, revision_start))
 
             if swh_rev and not self.check_history_not_altered(svnrepo,
                                                               revision_start,
                                                               swh_rev):
                 msg = 'History of svn %s@%s history modified. Skipping...' % (
                     svn_url, revision_start)
                 self.log.warn(msg)
                 return {'status': False, 'stderr': msg}
 
             revision_end = svnrepo.head_revision()
 
             self.log.info('[revision_start-revision_end]: [%s-%s]' % (
                 revision_start, revision_end))
 
-            if revision_start == revision_end and revision_start is not 1:
+            if revision_start == revision_end and revision_start != 1:
                 self.log.info('%s@%s already injected.' % (svn_url,
                                                            revision_end))
                 return {'status': True}
 
             self.log.info('Repo %s ready to be processed.' % svnrepo)
 
             # process and store revision to swh (sent by by blocks of
             # 'revision_packet_size')
             latest_rev = self.process_swh_revisions(svnrepo,
                                                     revision_start,
                                                     revision_end,
                                                     revision_parents)
             self.process_swh_occurrence(latest_rev, origin)
 
             # flush eventual remaining data
             self.flush()
         finally:
             svnrepo.clean_fs()
 
         return {'status': True}
diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
index 8230f01..2331f03 100644
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -1,301 +1,279 @@
-# Copyright (C) 2015 The Software Heritage developers
+# Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import pysvn
 import tempfile
-import subprocess
 import shutil
 
-from contextlib import contextmanager
 from pysvn import Revision, opt_revision_kind
 from retrying import retry
 
 from swh.model import git
 
 
-@contextmanager
-def cwd(path):
-    """Contextually change the working directory to do thy bidding.
-    Then gets back to the original location.
-
-    """
-    prev_cwd = os.getcwd()
-    os.chdir(path)
-    try:
-        yield
-    finally:
-        os.chdir(prev_cwd)
-
-
 # When log message contains empty data
 DEFAULT_AUTHOR_NAME = b''
 DEFAULT_AUTHOR_DATE = b''
 DEFAULT_AUTHOR_MESSAGE = b''
 
 
 class SvnRepoException(ValueError):
     def __init__(self, svnrepo, e):
         super().__init__(e)
         self.svnrepo = svnrepo
 
 
 def retry_with_cleanup(exception):
     """Clean the repository from locks before retrying.
 
     """
     exception.svnrepo.cleanup()
     return True
 
 
 class SvnRepo():
     """Swh representation of a svn repository.
 
     """
     def __init__(self, remote_url, origin_id, storage, destination_path=None):
         self.remote_url = remote_url
         self.storage = storage
         self.origin_id = origin_id
 
         if destination_path:
             os.makedirs(destination_path, exist_ok=True)
             root_dir = destination_path
         else:
             root_dir = '/tmp'
 
         local_dirname = tempfile.mkdtemp(suffix='.swh.loader',
                                          prefix='tmp.',
                                          dir=root_dir)
 
         name = os.path.basename(remote_url)
         local_repo_url = os.path.join(local_dirname, name)
 
         self.client = pysvn.Client()
         self.local_url = local_repo_url
-        self.uuid = None
+        self.uuid = None  # Cannot know it yet since we need a working copy
 
     def __str__(self):
         return str({'remote_url': self.remote_url,
                     'local_url': self.local_url,
                     'uuid': self.uuid,
                     'swh-origin': self.origin_id})
 
-    def read_uuid(self):
-        with cwd(self.local_url):
-            cmd = 'svn info | grep UUID | cut -f2 -d:'
-            uuid = subprocess.check_output(cmd, shell=True)
-            return uuid.strip().decode('utf-8')
-
     def cleanup(self):
         """Clean up any locks in the working copy at path.
 
         """
         self.client.cleanup(self.local_url)
 
     @retry(retry_on_exception=retry_with_cleanup,
            stop_max_attempt_number=3)
     def checkout(self, revision):
         """Checkout repository repo at revision.
 
         Args:
             revision: the revision number to checkout the repo to.
 
         """
         try:
             self.client.checkout(
                 self.remote_url,
                 self.local_url,
                 revision=Revision(opt_revision_kind.number, revision))
         except Exception as e:
             raise SvnRepoException(self, e)
 
     def fork(self, svn_revision=None):
         """Checkout remote repository to a local working copy (at revision 1
         if the svn revision is not specified).
 
         This will also update the repository's uuid.
 
         """
         self.checkout(1 if not svn_revision else svn_revision)
-        self.uuid = self.read_uuid()
+        self.uuid = self.client.info(self.local_url).uuid
 
     def head_revision(self):
         """Retrieve current revision of the repository's working copy.
 
         """
         head_rev = Revision(opt_revision_kind.head)
         info = self.client.info2(self.local_url,
                                  revision=head_rev,
                                  recurse=False)
         return info[0][1]['rev'].number
 
     def initial_revision(self):
         """Retrieve the initial revision from which the remote url appeared.
         Note: This should always be 1 since we won't be dealing with in-depth
         url.
 
         """
         return self.client.log(self.remote_url)[-1].data.get(
             'revision').number
 
     def _to_change_paths(self, log_entry):
         """Convert changed paths to dict if any.
 
         """
         try:
             changed_paths = log_entry.changed_paths
         except AttributeError:
             changed_paths = []
 
         for paths in changed_paths:
             path = os.path.join(self.local_url, paths.path.lstrip('/'))
             yield {
                 'path': path.encode('utf-8'),
                 'action': paths.action  # A(dd), M(odified), D(eleted)
             }
 
     def _to_entry(self, log_entry):
         try:
             author_date = log_entry.date or DEFAULT_AUTHOR_DATE
         except AttributeError:
             author_date = DEFAULT_AUTHOR_DATE
 
         try:
             author = log_entry.author.encode('utf-8') or DEFAULT_AUTHOR_NAME
         except AttributeError:
             author = DEFAULT_AUTHOR_NAME
 
         try:
             message = log_entry.message.encode('utf-8') \
                       or DEFAULT_AUTHOR_MESSAGE
         except AttributeError:
             message = DEFAULT_AUTHOR_MESSAGE
 
         return {
             'rev': log_entry.revision.number,
             'author_date': author_date,
             'author_name': author,
             'message': message,
             'changed_paths': self._to_change_paths(log_entry),
         }
 
     @retry(stop_max_attempt_number=3)
     def _logs(self, revision_start, revision_end):
         rev_start = Revision(opt_revision_kind.number, revision_start)
         rev_end = Revision(opt_revision_kind.number, revision_end)
         return self.client.log(url_or_path=self.local_url,
                                revision_start=rev_start,
                                revision_end=rev_end,
                                discover_changed_paths=True)
 
     def logs(self, revision_start, revision_end, block_size=100):
         """Stream svn logs between revision_start and revision_end by chunks of
         block_size logs.
 
         Yields revision and associated revision information between the
         revision start and revision_end.
 
         Args:
             revision_start: the svn revision starting bound
             revision_end: the svn revision ending bound
             block_size: block size of revisions to fetch
 
         Yields:
             tuple of revisions and logs.
             revisions: list of revisions in order
             logs: Dictionary with key revision number and value the log entry.
                   The log entry is a dictionary with the following keys:
                      - author_date: date of the commit
                      - author_name: name of the author
                      - message: commit message
 
         """
         r1 = revision_start
         r2 = r1 + block_size - 1
 
         done = False
         if r2 >= revision_end:
             r2 = revision_end
             done = True
 
         for log_entry in self._logs(r1, r2):
             # determine the full diff between (rev - 1) and rev
             # diff = self.client.diff(url_or_path=self.local_url,
             #                         tmp_path='/tmp',
             #                         url_or_path2=self.local_url,
             #                         revision1=Revision(
             #                             opt_revision_kind.number, rev-1),
             #                         revision2=Revision(
             #                             opt_revision_kind.number, rev),
             #                         ignore_content_type=True)
 
             yield self._to_entry(log_entry)
 
         if not done:
             yield from self.logs(r2 + 1, revision_end, block_size)
 
     def swh_previous_revision(self):
         """Look for possible existing revision in swh.
 
         Returns:
             The previous swh revision if found, None otherwise.
 
         """
         storage = self.storage
         occ = storage.occurrence_get(self.origin_id)
         if occ:
             revision_id = occ[0]['target']
             revisions = storage.revision_get([revision_id])
 
             if revisions:
                 return revisions[0]
 
     def swh_hash_data_per_revision(self, start_revision, end_revision):
         """Compute swh hash data per each revision between start_revision and
         end_revision.
 
         Args:
             start_revision: starting revision
             end_revision: ending revision
 
         Yields:
             tuple (rev, nextrev, commit, objects_per_path)
             - rev: current revision
             - nextrev: next revision
             - commit: commit data (author, date, message) for such revision
             - objects_per_path: dictionary of path, swh hash data with type
 
         """
         def ignore_svn_folder(dirpath):
             return b'.svn' not in dirpath
 
         local_url = self.local_url.encode('utf-8')
         for commit in self.logs(start_revision, end_revision):
             rev = commit['rev']
 
             # checkout to the revision rev
             self.checkout(revision=rev)
 
             if rev == start_revision:  # first time we walk the complete tree
                 objects_per_path = git.walk_and_compute_sha1_from_directory(
                     local_url, dir_ok_fn=ignore_svn_folder)
             else:  # then we update only what needs to be
                 objects_per_path = git.update_checksums_from(
                     commit['changed_paths'],
                     objects_per_path,
                     dir_ok_fn=ignore_svn_folder)
 
             if rev == end_revision:
                 nextrev = None
             else:
                 nextrev = rev + 1
 
             yield rev, nextrev, commit, objects_per_path
 
     def clean_fs(self):
         """Clean up the local url checkout.
 
         """
         shutil.rmtree(os.path.dirname(self.local_url))