diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py
index 1e82959..336a82f 100644
--- a/swh/loader/svn/converters.py
+++ b/swh/loader/svn/converters.py
@@ -1,192 +1,260 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from email import utils
 
+from swh.core import hashutil
+
 from .utils import strdate_to_timestamp
 
 
 def svn_date_to_gitsvn_date(strdate):
     """Convert a string date to an swh one.
 
     Args:
         strdate: A string formatted for .utils.strdate_to_timestamp
         to do its jobs
 
     Returns:
         An swh date format with an integer timestamp.
 
     """
     return {
         'timestamp': int(strdate_to_timestamp(strdate)),
         'offset': 0
     }
 
 
 def svn_date_to_swh_date(strdate):
     """Convert a string date to an swh one.
 
     Args:
         strdate: A string formatted for .utils.strdate_to_timestamp
         to do its jobs
 
     Returns:
         An swh date format
 
     """
     return {
         'timestamp': strdate_to_timestamp(strdate),
         'offset': 0
     }
 
 
 def svn_author_to_swh_person(author):
     """Convert an svn author to an swh person.
     Default policy: No information is added.
 
     Args:
         author (string): the svn author (in bytes)
 
     Returns: a dictionary with keys:
         fullname: the author's associated fullname
         name: the author's associated name
         email: None (no email in svn)
 
     """
     if not author:
         return {'fullname': b'', 'name': None, 'email': None}
 
     author = author.encode('utf-8')
     if b'<' in author and b'>' in author:
         name, email = utils.parseaddr(author.decode('utf-8'))
         return {
             'fullname': author,
             'name': name.encode('utf-8'),
             'email': email.encode('utf-8')
         }
 
     return {'fullname': author, 'email': None, 'name': author}
 
 
 def svn_author_to_gitsvn_person(author, repo_uuid):
     """Convert an svn author to a person suitable for insertion.
 
     Default policy: If no email is found, the email is created using
     the author and the repo_uuid.
 
     Args:
         author (string): the svn author (in bytes)
         repo_uuid (bytes): the repository's uuid
 
     Returns: a dictionary with keys:
         fullname: the author's associated fullname
         name: the author's associated name
         email: None (no email in svn)
 
     """
     if not author:
         author = '(no author)'
 
     author = author.encode('utf-8')
     if b'<' in author and b'>' in author:
         name, email = utils.parseaddr(author.decode('utf-8'))
         return {
             'fullname': author,
             'name': name.encode('utf-8'),
             'email': email.encode('utf-8')
         }
 
     # we'll construct the author's fullname the same way git svn does
     # 'user <user@repo_uuid>'
     email = b'@'.join([author, repo_uuid])
     return {
         'fullname': b''.join([author, b' ', b'<', email, b'>']),
         'name': author,
         'email': email,
     }
 
 
 def build_swh_revision(rev, commit, repo_uuid, dir_id, parents):
     """Given a svn revision, build a swh revision.
 
     This adds an ['metadata']['extra-headers'] entry with the
     repository's uuid and the svn revision.
 
     Args:
         - rev: the svn revision number
         - commit: the commit metadata
         - repo_uuid: The repository's uuid
         - dir_id: the tree's hash identifier
         - parents: the revision's parents identifier
 
     Returns:
         The swh revision dictionary.
 
     """
     author = commit['author_name']
     msg = commit['message']
     date = commit['author_date']
 
     metadata = {
         'extra_headers': [
             ['svn_repo_uuid', repo_uuid],
             ['svn_revision', str(rev).encode('utf-8')]
         ]
     }
 
     return {
         'date': date,
         'committer_date': date,
         'type': 'svn',
         'directory': dir_id,
         'message': msg,
         'author': author,
         'committer': author,
         'synthetic': True,
         'metadata': metadata,
         'parents': parents,
     }
 
 
 def build_gitsvn_swh_revision(rev, commit, dir_id, parents):
     """Given a svn revision, build a swh revision.
 
     Args:
         - rev: the svn revision number
         - commit: the commit metadata
         - dir_id: the tree's hash identifier
         - parents: the revision's parents identifier
 
     Returns:
         The swh revision dictionary.
 
     """
     author = commit['author_name']
     msg = commit['message']
     date = commit['author_date']
 
     return {
         'date': date,
         'committer_date': date,
         'type': 'svn',
         'directory': dir_id,
         'message': msg,
         'author': author,
         'committer': author,
         'synthetic': True,
         'metadata': None,
         'parents': parents,
     }
 
 
 def build_swh_occurrence(revision_id, origin_id, date):
     """Build a swh occurrence from the revision id, origin id, and date.
 
     """
     return {'branch': 'master',
             'target': revision_id,
             'target_type': 'revision',
             'origin': origin_id,
             'date': date}
+
+
+def loader_to_scheduler_revision(swh_revision):
+    """Convert a loader revision to a scheduler-serializable one.
+
+    The 'id' and 'parents' hashes are hex-encoded and the values of the
+    metadata's 'extra_headers' are decoded from utf-8 bytes to str.
+    The input revision is not modified.
+
+    FIXME: Should be more generically dealt with in swh-scheduler's
+    side. The advantage to having it here is that we know what we
+    store.
+
+    Args:
+        swh_revision (dict): revision with keys 'id', 'parents' and
+        'metadata' (None, or a dict holding an 'extra_headers' list
+        of [key, value] pairs)
+
+    Returns:
+        A new dictionary with keys 'id', 'parents' and 'metadata'.
+
+    """
+    metadata = swh_revision['metadata']
+    if metadata is not None:
+        # Build new containers rather than decoding in place: the
+        # caller still owns swh_revision, and a second conversion of
+        # the same revision must not fail on already-decoded values.
+        headers = []
+        for key, value in metadata['extra_headers']:
+            if isinstance(value, bytes):
+                value = value.decode('utf-8')
+            headers.append([key, value])
+        metadata = dict(metadata, extra_headers=headers)
+
+    return {
+        'id': hashutil.hash_to_hex(swh_revision['id']),
+        'parents': [hashutil.hash_to_hex(parent) for parent
+                    in swh_revision['parents']],
+        'metadata': metadata
+    }
+
+
+def scheduler_to_loader_revision(swh_revision):
+    """Convert a scheduler-serialized revision back to a loader one.
+
+    The hex-encoded 'id' and 'parents' are converted back to hashes;
+    'metadata' is passed through unchanged.
+
+    FIXME: Should be more generically dealt with in swh-scheduler's
+    side. The advantage to having it here is that we know what we
+    store.
+
+    Args:
+        swh_revision (dict): revision with keys 'id', 'parents' and
+        'metadata', as built by loader_to_scheduler_revision
+
+    Returns:
+        A new dictionary with keys 'id', 'parents' and 'metadata'.
+
+    """
+    return {
+        'id': hashutil.hex_to_hash(swh_revision['id']),
+        'parents': [hashutil.hex_to_hash(parent) for parent
+                    in swh_revision['parents']],
+        'metadata': swh_revision['metadata']
+    }
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
index 8b3588f..dd926cb 100644
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -1,131 +1,92 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.loader.core import tasks
-from swh.core import hashutil
 
 from .loader import GitSvnSvnLoader, SWHSvnLoader, SvnLoaderException
-
-
-def loader_to_scheduler_revision(swh_revision):
-    """To avoid serialization or scheduler storage problem, transform
-    adequately the revision.
-
-    FIXME: Should be more generically dealt with in swh-scheduler's
-    side. The advantage to having it here is that we known what we
-    store.
-
-    """
-    metadata = swh_revision['metadata']
-    for entry in metadata['extra_headers']:
-        entry[1] = entry[1].decode('utf-8')
-
-    return {
-        'id': hashutil.hash_to_hex(swh_revision['id']),
-        'parents': [hashutil.hash_to_hex(parent) for parent
-                    in swh_revision['parents']],
-        'metadata': metadata
-    }
-
-
-def scheduler_to_loader_revision(swh_revision):
-    """If the known state (a revision) is already passed, it will be
-    serializable ready but not loader ready.
-
-    FIXME: Should be more generically dealt with in swh-scheduler's
-    side. The advantage to having it here is that we known what we
-    store.
-
-    """
-    return {
-        'id': hashutil.hex_to_hash(swh_revision['id']),
-        'parents': [hashutil.hex_to_hash(parent) for parent
-                    in swh_revision['parents']],
-        'metadata': swh_revision['metadata']
-    }
+from . import converters
 
 
 class LoadSvnRepositoryTsk(tasks.LoaderCoreTask):
     """Import one svn repository to Software Heritage.
 
     """
     CONFIG_BASE_FILENAME = 'loader/svn.ini'
 
     ADDITIONAL_CONFIG = {
         'storage_class': ('str', 'remote_storage'),
         'storage_args': ('list[str]', ['http://localhost:5000/']),
         'with_policy': ('string', 'swh'),  # Default, other possible
                                            # value is 'gitsvn'
     }
 
     task_queue = 'swh_loader_svn'
 
     def run(self, *args, **kwargs):
         """Import a svn repository.
 
         Args:
             - svn_url: svn's repository url
             - destination_path: root directory to locally retrieve svn's data
             - swh_revision: Optional extra swh revision to start from.
             cf. swh.loader.svn.SvnLoader.process docstring
 
         """
         destination_path = kwargs['destination_path']
         svn_url = kwargs['svn_url']
 
         if 'origin' not in kwargs:  # first time, we'll create the origin
             origin = {
                 'type': 'svn',
                 'url': svn_url,
             }
             origin['id'] = self.storage.origin_add_one(origin)
             retry = False
         else:
             origin = {
                 'id': kwargs['origin'],
                 'url': kwargs['svn_url'],
                 'type': 'svn'
             }
             retry = True
 
         fetch_history_id = self.open_fetch_history(origin['id'])
 
         try:
             # Determine which loader to trigger
             if self.config['with_policy'] == 'gitsvn':
                 # this one compute hashes but do not store anywhere
                 loader = GitSvnSvnLoader(svn_url, destination_path, origin)
             elif self.config['with_policy'] == 'swh':
                 # the real production use case with storage and all
                 loader = SWHSvnLoader(svn_url, destination_path, origin)
             else:
                 raise ValueError('Only gitsvn or swh policies are supported in'
-                                 '\'with_policy\' entry. '
+                                 ' \'with_policy\' entry. '
                                  'Please adapt your svn.ini file accordingly')
             if retry:
-                swh_revision = scheduler_to_loader_revision(
+                swh_revision = converters.scheduler_to_loader_revision(
                     kwargs['swh_revision'])
                 result = loader.load(swh_revision)
             else:
                 result = loader.load()
         except SvnLoaderException as e:
             # reschedule a task
-            print(e)
-            swh_rev = loader_to_scheduler_revision(e.swh_revision)
+            swh_rev = converters.loader_to_scheduler_revision(e.swh_revision)
             self.scheduler_backend.create_task({
                 'type': 'svn-loader',
                 'arguments': {
                     'args': None,
                     'kwargs': {
                         'origin': origin['id'],
                         'svn_url': svn_url,
                         'destination_path': destination_path,
                         'swh_revision': swh_rev,
                     }
                 }
             })
             result = {'status': False}
 
         self.close_fetch_history(fetch_history_id, result)
diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py
index 939a7cd..404e4c8 100644
--- a/swh/loader/svn/tests/test_converters.py
+++ b/swh/loader/svn/tests/test_converters.py
@@ -1,262 +1,332 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 
 from nose.tools import istest
 
 from swh.loader.svn import converters
 
 
 class TestAuthorGitSvnConverters(unittest.TestCase):
     @istest
     def svn_author_to_gitsvn_person(self):
         """The author should have name, email and fullname filled.
 
         """
         actual_person = converters.svn_author_to_gitsvn_person(
             'tony <ynot@dagobah>', repo_uuid=None)
 
         self.assertEquals(actual_person, {
             'fullname': b'tony <ynot@dagobah>',
             'name': b'tony',
             'email': b'ynot@dagobah',
         })
 
     @istest
     def svn_author_to_gitsvn_person_no_email(self):
         """The author should see his/her email filled with author@<repo_uuid>.
 
         """
         actual_person = converters.svn_author_to_gitsvn_person(
             'tony', repo_uuid=b'some-uuid')
 
         self.assertEquals(actual_person, {
             'fullname': b'tony <tony@some-uuid>',
             'name': b'tony',
             'email': b'tony@some-uuid',
         })
 
     @istest
     def svn_author_to_gitsvn_person_empty_person(self):
         """The empty person should see name, fullname and email filled.
 
         """
         actual_person = converters.svn_author_to_gitsvn_person(
             '', repo_uuid=b'some-uuid')
 
         self.assertEqual(actual_person, {
             'fullname': b'(no author) <(no author)@some-uuid>',
             'name': b'(no author)',
             'email': b'(no author)@some-uuid'
         })
 
 
 class TestAuthorSWHConverters(unittest.TestCase):
     @istest
     def svn_author_to_swh_person(self):
         """The author should have name, email and fullname filled.
 
         """
         actual_person = converters.svn_author_to_swh_person(
             'tony <ynot@dagobah>')
 
         self.assertEquals(actual_person, {
             'fullname': b'tony <ynot@dagobah>',
             'name': b'tony',
             'email': b'ynot@dagobah',
         })
 
     @istest
     def svn_author_to_swh_person_no_email(self):
         """The author and fullname should be the same as the input (author).
 
         """
         actual_person = converters.svn_author_to_swh_person('tony')
 
         self.assertEquals(actual_person, {
             'fullname': b'tony',
             'name': b'tony',
             'email': None,
         })
 
     @istest
     def svn_author_to_swh_person_empty_person(self):
         """Empty person has only its fullname filled with the empty
         byte-string.
 
         """
         actual_person = converters.svn_author_to_swh_person('')
 
         self.assertEqual(actual_person, {
             'fullname': b'',
             'name': None,
             'email': None,
         })
 
 
 class TestSWHRevisionConverters(unittest.TestCase):
     @istest
     def build_swh_revision_default(self):
         """This should build the swh revision with the swh revision's extra
         headers about the repository.
 
         """
         actual_swh_revision = converters.build_swh_revision(
             repo_uuid=b'uuid',
             dir_id='dir-id',
             commit={
                 'author_name': {
                     'name': b'theo',
                     'email': b'theo@uuid',
                     'fullname': b'theo '
                 },
                 'message': b'commit message',
                 'author_date': {
                     'timestamp': 1088108379,
                     'offset': 0
                 }
             },
             rev=10,
             parents=['123'])
 
         date = {'timestamp': 1088108379, 'offset': 0}
 
         self.assertEquals(actual_swh_revision, {
             'date': date,
             'committer_date': date,
             'type': 'svn',
             'directory': 'dir-id',
             'message': b'commit message',
             'author': {
                 'name': b'theo',
                 'email': b'theo@uuid',
                 'fullname': b'theo '
             },
             'committer': {
                 'name': b'theo',
                 'email': b'theo@uuid',
                 'fullname': b'theo '
             },
             'synthetic': True,
             'metadata': {
                 'extra_headers': [
                     ['svn_repo_uuid', b'uuid'],
                     ['svn_revision', b'10'],
                 ]
             },
             'parents': ['123'],
         })
 
 
 class TestGitSvnRevisionConverters(unittest.TestCase):
     @istest
     def build_gitsvn_swh_revision_default(self):
         """This should build the swh revision without the swh revision's
         extra headers about the repository.
 
         """
         actual_swh_revision = converters.build_gitsvn_swh_revision(
             dir_id='dir-id',
             commit={
                 'author_name': {
                     'name': b'theo',
                     'email': b'theo@uuid',
                     'fullname': b'theo '
                 },
                 'message': b'commit message',
                 'author_date': {
                     'timestamp': 1088108379,
                     'offset': 0
                 }
             },
             rev=10,
             parents=['123'])
 
         date = {'timestamp': 1088108379, 'offset': 0}
 
         self.assertEquals(actual_swh_revision, {
             'date': date,
             'committer_date': date,
             'type': 'svn',
             'directory': 'dir-id',
             'message': b'commit message',
             'author': {
                 'name': b'theo',
                 'email': b'theo@uuid',
                 'fullname': b'theo '
             },
             'committer': {
                 'name': b'theo',
                 'email': b'theo@uuid',
                 'fullname': b'theo '
             },
             'synthetic': True,
             'metadata': None,
             'parents': ['123'],
         })
 
 
 class TestSWHOccurrence(unittest.TestCase):
     @istest
     def build_swh_occurrence(self):
         actual_occ = converters.build_swh_occurrence('revision-id',
                                                      'origin-id',
                                                      'some-date')
 
         self.assertEquals(actual_occ, {
             'branch': 'master',
             'target': 'revision-id',
             'target_type': 'revision',
             'origin': 'origin-id',
             'date': 'some-date'})
 
 
 class ConvertSWHDate(unittest.TestCase):
     @istest
     def svn_date_to_swh_date(self):
         """The timestamp should not be tampered with and include the
         decimals.
 
         """
         self.assertEquals(
             converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z'), {
                 'timestamp': 1306821879.5009,
                 'offset': 0
             })
 
         self.assertEquals(
             converters.svn_date_to_swh_date('2011-05-31T06:04:39.800722Z'), {
                 'timestamp': 1306821879.800722,
                 'offset': 0
             })
 
     @istest
     def svn_date_to_swh_date_epoch(self):
         """Empty date should be EPOCH (timestamp and offset at 0)."""
         # It should return 0, epoch
         self.assertEquals({'timestamp': 0, 'offset': 0},
                           converters.svn_date_to_swh_date(''))
         self.assertEquals({'timestamp': 0, 'offset': 0},
                           converters.svn_date_to_swh_date(None))
 
 
 class ConvertGitSvnDate(unittest.TestCase):
     @istest
     def svn_date_to_gitsvn_date(self):
         """The timestamp should be truncated to be an integer."""
         actual_ts = converters.svn_date_to_gitsvn_date(
             '2011-05-31T06:04:39.800722Z')
 
         self.assertEquals(actual_ts, {'timestamp': 1306821879, 'offset': 0})
 
     @istest
     def svn_date_to_gitsvn_date_epoch(self):
         """Empty date should be EPOCH (timestamp and offset at 0)."""
         # It should return 0, epoch
         self.assertEquals({'timestamp': 0, 'offset': 0},
                           converters.svn_date_to_gitsvn_date(''))
         self.assertEquals({'timestamp': 0, 'offset': 0},
                           converters.svn_date_to_gitsvn_date(None))
+
+
+class ConvertSWHRevision(unittest.TestCase):
+    @istest
+    def loader_to_scheduler_revision(self):
+        actual_rev = converters.loader_to_scheduler_revision({
+            'parents': [b'e\n\xbe\xe9\xc0\x87y\xfeG\xf7\xcfG\x82h\xa8i\xe8\xfe\xe2\x13'],  # noqa
+            'id': b'\xedd\x92w\xab\xb2\x16,\xea*\x90O8\x0f\x96/\xfb\xd4\x16`',
+            'metadata': {
+                'extra_headers': [
+                    ['svn_repo_uuid', b'bc7d6c17-68a5-4917-9c54-c565d7424229'],
+                    ['svn_revision', b'4']
+                ]
+            }
+        })
+
+        self.assertEqual(actual_rev, {
+            'id': 'ed649277abb2162cea2a904f380f962ffbd41660',
+            'parents': ['650abee9c08779fe47f7cf478268a869e8fee213'],
+            'metadata': {
+                'extra_headers': [
+                    ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'],
+                    ['svn_revision', '4']
+                ]
+            }
+        })
+
+    @istest
+    def scheduler_to_loader_revision(self):
+        actual_rev = converters.scheduler_to_loader_revision({
+            'id': 'ed649277abb2162cea2a904f380f962ffbd41660',
+            'parents': ['650abee9c08779fe47f7cf478268a869e8fee213'],
+            'metadata': {
+                'extra_headers': [
+                    ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'],
+                    ['svn_revision', '4']
+                ]
+            }
+        })
+
+        self.assertEqual(actual_rev, {
+            'parents': [b'e\n\xbe\xe9\xc0\x87y\xfeG\xf7\xcfG\x82h\xa8i\xe8\xfe\xe2\x13'],  # noqa
+            'id': b'\xedd\x92w\xab\xb2\x16,\xea*\x90O8\x0f\x96/\xfb\xd4\x16`',
+            'metadata': {
+                'extra_headers': [
+                    ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'],
+                    ['svn_revision', '4']
+                ]
+            }
+        })
+
+    @istest
+    def loader_to_scheduler_revision_keeps_input_untouched(self):
+        """The conversion should not decode the caller's metadata in place.
+
+        """
+        revision = {
+            'parents': [],
+            'id': b'\xedd\x92w\xab\xb2\x16,\xea*\x90O8\x0f\x96/\xfb\xd4\x16`',
+            'metadata': {
+                'extra_headers': [
+                    ['svn_revision', b'4']
+                ]
+            }
+        }
+
+        converters.loader_to_scheduler_revision(revision)
+
+        self.assertEqual(revision['metadata']['extra_headers'],
+                         [['svn_revision', b'4']])