diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py index f632d45..323edb1 100644 --- a/swh/loader/svn/converters.py +++ b/swh/loader/svn/converters.py @@ -1,81 +1,66 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import email.utils - - -def uid_to_person(uid, encode=True): - """Convert an uid to a person suitable for insertion. +def svn_author_to_person(author): + """Convert an svn author to a person suitable for insertion. Args: - uid: an uid of the form "Name " - encode: whether to convert the output to bytes or not + author: the svn author (in bytes) + Returns: a dictionary with keys: - name: the name associated to the uid - email: the mail associated to the uid - """ + fullname: the name associated to the author + name: the name associated to the author + email: None (no email in svn) - ret = { - 'name': '', - 'email': '', + """ + return { + 'fullname': author, + 'name': author, + 'email': None, } - name, mail = email.utils.parseaddr(uid) - - if name and email: - ret['name'] = name - ret['email'] = mail - else: - ret['name'] = uid - - if encode: - for key in ('name', 'email'): - ret[key] = ret[key].encode('utf-8') - - return ret - def build_swh_revision(repo_uuid, commit, rev, dir_id, parents): """Given a svn revision, build a swh revision. 
""" - author = uid_to_person(commit['author_name']) + author = svn_author_to_person(commit['author_name']) - msg = commit['message'].encode('utf-8') + msg = commit['message'] date = { 'timestamp': commit['author_date'], 'offset': 0, } return { 'date': date, 'committer_date': date, 'type': 'svn', 'directory': dir_id, 'message': msg, 'author': author, 'committer': author, 'synthetic': True, 'metadata': { 'extra_headers': [ ['svn_repo_uuid', repo_uuid], ['svn_revision', rev] ] }, 'parents': parents, } def build_swh_occurrence(revision_id, origin_id, date): """Build a swh occurrence from the revision id, origin id, and date. """ return {'branch': 'master', 'target': revision_id, 'target_type': 'revision', 'origin': origin_id, 'date': date} diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py index a469485..873debc 100644 --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -1,323 +1,324 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pysvn import tempfile import subprocess import shutil from contextlib import contextmanager from pysvn import Revision, opt_revision_kind from retrying import retry from swh.model import git @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def init_repo(remote_repo_url, destination_path=None): """Initialize a repository without any svn action on disk. There may be temporary folder creation on disk as side effect (if destination_path is not provided) Args: remote_repo_url: The remote svn url destination_path: The optional local parent folder to checkout the repository to. 
Returns: Dictionary with the following keys: - client: client instance to manipulate the repository - remote_url: remote url (same as input) - local_url: local url which has been computed """ name = os.path.basename(remote_repo_url) if destination_path: os.makedirs(destination_path, exist_ok=True) local_dirname = destination_path else: local_dirname = tempfile.mkdtemp(suffix='.swh.loader', prefix='tmp.', dir='/tmp') local_repo_url = os.path.join(local_dirname, name) client = pysvn.Client() return {'client': client, 'remote_url': remote_repo_url, 'local_url': local_repo_url} # When log message contains empty data -DEFAULT_AUTHOR_NAME = '' -DEFAULT_AUTHOR_DATE = '' -DEFAULT_AUTHOR_MESSAGE = '' +DEFAULT_AUTHOR_NAME = b'' +DEFAULT_AUTHOR_DATE = b'' +DEFAULT_AUTHOR_MESSAGE = b'' class SvnRepoException(ValueError): def __init__(self, svnrepo, e): super().__init__(e) self.svnrepo = svnrepo def retry_with_cleanup(exception): """Clean the repository from locks before retrying. """ exception.svnrepo.cleanup() return True class SvnRepo(): """Swh representation of a svn repository. """ def __init__(self, remote_url, origin_id, storage, local_url=None): self.remote_url = remote_url self.storage = storage self.origin_id = origin_id r = init_repo(remote_url, local_url) self.client = r['client'] self.local_url = r['local_url'] self.uuid = None def __str__(self): return str({'remote_url': self.remote_url, 'local_url': self.local_url, 'uuid': self.uuid, 'swh-origin': self.origin_id}) def read_uuid(self): with cwd(self.local_url): cmd = 'svn info | grep UUID | cut -f2 -d:' uuid = subprocess.check_output(cmd, shell=True) return uuid.strip().decode('utf-8') def cleanup(self): """Clean up any locks in the working copy at path. """ self.client.cleanup(self.local_url) @retry(retry_on_exception=retry_with_cleanup, stop_max_attempt_number=3) def checkout(self, revision): """Checkout repository repo at revision. Args: revision: the revision number to checkout the repo to. 
""" try: self.client.checkout( self.remote_url, self.local_url, revision=Revision(opt_revision_kind.number, revision)) except Exception as e: raise SvnRepoException(self, e) def fork(self, svn_revision=None): """Checkout remote repository to a local working copy (at revision 1 if the svn revision is not specified). This will also update the repository's uuid. """ self.checkout(1 if not svn_revision else svn_revision) self.uuid = self.read_uuid() def head_revision(self): """Retrieve current revision of the repository's working copy. """ head_rev = Revision(opt_revision_kind.head) info = self.client.info2(self.local_url, revision=head_rev, recurse=False) return info[0][1]['rev'].number def initial_revision(self): """Retrieve the initial revision from which the remote url appeared. Note: This should always be 1 since we won't be dealing with in-depth url. """ return self.client.log(self.remote_url)[-1].data.get( 'revision').number def _to_change_paths(self, log_entry): """Convert changed paths to dict if any. 
""" try: changed_paths = log_entry.changed_paths except AttributeError: changed_paths = [] for paths in changed_paths: path = os.path.join(self.local_url, paths.path.lstrip('/')) yield { 'path': path.encode('utf-8'), 'action': paths.action # A(dd), M(odified), D(eleted) } def _to_entry(self, log_entry): try: author_date = log_entry.date or DEFAULT_AUTHOR_DATE except AttributeError: author_date = DEFAULT_AUTHOR_DATE try: - author = log_entry.author or DEFAULT_AUTHOR_NAME + author = log_entry.author.encode('utf-8') or DEFAULT_AUTHOR_NAME except AttributeError: author = DEFAULT_AUTHOR_NAME try: - message = log_entry.message or DEFAULT_AUTHOR_MESSAGE + message = log_entry.message.encode('utf-8') \ + or DEFAULT_AUTHOR_MESSAGE except AttributeError: message = DEFAULT_AUTHOR_MESSAGE return { 'rev': log_entry.revision.number, 'author_date': author_date, 'author_name': author, 'message': message, 'changed_paths': self._to_change_paths(log_entry), } @retry(stop_max_attempt_number=3) def _logs(self, revision_start, revision_end): rev_start = Revision(opt_revision_kind.number, revision_start) rev_end = Revision(opt_revision_kind.number, revision_end) return self.client.log(url_or_path=self.local_url, revision_start=rev_start, revision_end=rev_end, discover_changed_paths=True) def logs(self, revision_start, revision_end, block_size=100): """Stream svn logs between revision_start and revision_end by chunks of block_size logs. Yields revision and associated revision information between the revision start and revision_end. Args: revision_start: the svn revision starting bound revision_end: the svn revision ending bound block_size: block size of revisions to fetch Yields: tuple of revisions and logs. revisions: list of revisions in order logs: Dictionary with key revision number and value the log entry. 
The log entry is a dictionary with the following keys: - author_date: date of the commit - author_name: name of the author - message: commit message """ r1 = revision_start r2 = r1 + block_size - 1 done = False if r2 >= revision_end: r2 = revision_end done = True for log_entry in self._logs(r1, r2): # determine the full diff between (rev - 1) and rev # diff = self.client.diff(url_or_path=self.local_url, # tmp_path='/tmp', # url_or_path2=self.local_url, # revision1=Revision( # opt_revision_kind.number, rev-1), # revision2=Revision( # opt_revision_kind.number, rev), # ignore_content_type=True) yield self._to_entry(log_entry) if not done: yield from self.logs(r2 + 1, revision_end, block_size) def swh_previous_revision(self): """Look for possible existing revision in swh. Returns: The previous swh revision if found, None otherwise. """ storage = self.storage occ = storage.occurrence_get(self.origin_id) if occ: revision_id = occ[0]['target'] revisions = storage.revision_get([revision_id]) if revisions: return revisions[0] def swh_hash_data_per_revision(self, start_revision, end_revision): """Compute swh hash data per each revision between start_revision and end_revision. 
Args: start_revision: starting revision end_revision: ending revision Yields: tuple (rev, nextrev, commit, objects_per_path) - rev: current revision - nextrev: next revision - commit: commit data (author, date, message) for such revision - objects_per_path: dictionary of path, swh hash data with type """ def ignore_svn_folder(dirpath): return b'.svn' not in dirpath local_url = self.local_url.encode('utf-8') for commit in self.logs(start_revision, end_revision): rev = commit['rev'] # checkout to the revision rev self.checkout(revision=rev) if rev == start_revision: # first time we walk the complete tree objects_per_path = git.walk_and_compute_sha1_from_directory( local_url, dir_ok_fn=ignore_svn_folder) else: # then we update only what needs to be objects_per_path = git.update_checksums_from( commit['changed_paths'], objects_per_path, dir_ok_fn=ignore_svn_folder) if rev == end_revision: nextrev = None else: nextrev = rev + 1 yield rev, nextrev, commit, objects_per_path def clean_fs(self): """Clean up the local url checkout. 
""" shutil.rmtree(self.local_url) diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py index f136e5b..bbf9162 100644 --- a/swh/loader/svn/tests/test_converters.py +++ b/swh/loader/svn/tests/test_converters.py @@ -1,107 +1,115 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.loader.svn import converters class TestConverters(unittest.TestCase): @istest - def uid_to_person(self): - actual_person1 = converters.uid_to_person('tony ', - encode=False) + def svn_author_to_person_as_bytes(self): + actual_person1 = converters.svn_author_to_person( + b'tony ') self.assertEquals(actual_person1, { - 'name': 'tony', - 'email': 'ynot@dagobah' + 'fullname': b'tony ', + 'name': b'tony ', + 'email': None, }) - actual_person2 = converters.uid_to_person('ardumont ', - encode=True) - self.assertEquals(actual_person2, { - 'name': b'ardumont', - 'email': b'ard@dagobah' + @istest + def svn_author_to_person_as_str(self): + # should not happen - input is bytes but nothing prevents it + actual_person1 = converters.svn_author_to_person('tony ') + self.assertEquals(actual_person1, { + 'fullname': 'tony ', + 'name': 'tony ', + 'email': None, }) - actual_person3 = converters.uid_to_person('someone') - self.assertEquals(actual_person3, { - 'name': b'someone', - 'email': b'' + @istest + def svn_author_to_person_None(self): + # should not happen - nothing prevents it though + actual_person2 = converters.svn_author_to_person(None) + self.assertEquals(actual_person2, { + 'fullname': None, + 'name': None, + 'email': None, }) @istest def build_swh_revision(self): actual_swh_revision = converters.build_swh_revision( repo_uuid='uuid', dir_id='dir-id', - commit={'author_name': 'theo', - 'message': 'commit message', 
+ commit={'author_name': b'theo', + 'message': b'commit message', 'author_date': '2009-04-18 06:55:53 +0200'}, rev=10, parents=['123']) self.assertEquals(actual_swh_revision, { 'date': {'timestamp': '2009-04-18 06:55:53 +0200', 'offset': 0}, 'committer_date': {'timestamp': '2009-04-18 06:55:53 +0200', 'offset': 0}, 'type': 'svn', 'directory': 'dir-id', 'message': b'commit message', - 'author': {'name': b'theo', 'email': b''}, - 'committer': {'name': b'theo', 'email': b''}, + 'author': {'name': b'theo', 'email': None, 'fullname': b'theo'}, + 'committer': {'name': b'theo', 'email': None, 'fullname': b'theo'}, 'synthetic': True, 'metadata': { 'extra_headers': [ ['svn_repo_uuid', 'uuid'], ['svn_revision', 10], ] }, 'parents': ['123'], }) @istest def build_swh_revision_empty_data(self): actual_swh_revision = converters.build_swh_revision( repo_uuid='uuid', dir_id='dir-id', - commit={'author_name': '', - 'message': '', + commit={'author_name': b'', + 'message': b'', 'author_date': '2009-04-10 06:55:53'}, rev=8, parents=[]) self.assertEquals(actual_swh_revision, { 'date': {'timestamp': '2009-04-10 06:55:53', 'offset': 0}, 'committer_date': {'timestamp': '2009-04-10 06:55:53', 'offset': 0}, 'type': 'svn', 'directory': 'dir-id', 'message': b'', - 'author': {'name': b'', 'email': b''}, - 'committer': {'name': b'', 'email': b''}, + 'author': {'name': b'', 'email': None, 'fullname': b''}, + 'committer': {'name': b'', 'email': None, 'fullname': b''}, 'synthetic': True, 'metadata': { 'extra_headers': [ ['svn_repo_uuid', 'uuid'], ['svn_revision', 8], ] }, 'parents': [], }) @istest def build_swh_occurrence(self): actual_occ = converters.build_swh_occurrence('revision-id', 'origin-id', 'some-date') self.assertEquals(actual_occ, { 'branch': 'master', 'target': 'revision-id', 'target_type': 'revision', 'origin': 'origin-id', 'date': 'some-date'})