diff --git a/MANIFEST.in b/MANIFEST.in index 08ebc95..e7c46fc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include Makefile include requirements.txt +include requirements-swh.txt include version.txt diff --git a/PKG-INFO b/PKG-INFO index 5df92a7..0034acc 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.svn -Version: 0.0.22 +Version: 0.0.23 Summary: Software Heritage Loader SVN Home-page: https://forge.softwareheritage.org/diffusion/DLDSVN Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index b14b7c8..9519f40 100644 --- a/debian/control +++ b/debian/control @@ -1,29 +1,35 @@ Source: swh-loader-svn Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, + python3-click, + python3-dateutil, python3-nose, + python3-retrying, python3-setuptools, + python3-subvertpy (>= 0.9.4~), python3-swh.core (>= 0.0.19~), - python3-swh.storage (>= 0.0.76~), - python3-swh.model (>= 0.0.11~), - python3-swh.scheduler (>= 0.0.7~), python3-swh.loader.core (>= 0.0.12~), - python3-subvertpy (>= 0.9.4~), - python3-dateutil, - python3-retrying, - python3-click, + python3-swh.model (>= 0.0.11~), + python3-swh.scheduler (>= 0.0.11~), + python3-swh.storage (>= 0.0.79~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDSVN/ Package: python3-swh.loader.svn Architecture: all -Depends: python3-swh.core (>= 0.0.19~), python3-swh.storage (>= 0.0.76~), python3-swh.model (>= 0.0.11~), - python3-swh.scheduler (>= 0.0.7~), python3-swh.loader.core (>= 0.0.12~), - subversion, pigz, ${misc:Depends}, ${python3:Depends} +Depends: pigz, + python3-swh.core (>= 0.0.19~), + python3-swh.loader.core (>= 0.0.12~), + python3-swh.model (>= 0.0.11~), + python3-swh.scheduler (>= 0.0.11~), + python3-swh.storage (>= 0.0.79~), + subversion, + ${misc:Depends}, + ${python3:Depends} Description: Software Heritage Loader Svn Module in charge of loading svn repositories into swh storage. diff --git a/requirements-swh.txt b/requirements-swh.txt new file mode 100644 index 0000000..fd95219 --- /dev/null +++ b/requirements-swh.txt @@ -0,0 +1,5 @@ +swh.core >= 0.0.19 +swh.storage >= 0.0.79 +swh.model >= 0.0.11 +swh.scheduler >= 0.0.11 +swh.loader.core >= 0.0.12 diff --git a/requirements.txt b/requirements.txt index a8a3fca..474c1d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,8 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner -swh.core >= 0.0.19 -swh.storage >= 0.0.76 -swh.model >= 0.0.11 -swh.scheduler >= 0.0.7 -swh.loader.core >= 0.0.12 click retrying python-dateutil subvertpy >= 0.9.4 diff --git a/setup.py b/setup.py index 1b83f32..ce28893 100644 --- a/setup.py +++ b/setup.py @@ -1,28 +1,28 @@ from setuptools import setup def parse_requirements(): requirements = [] - with open('requirements.txt') as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) - + for reqf in ('requirements.txt', 'requirements-swh.txt'): + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements setup( name='swh.loader.svn', description='Software Heritage Loader SVN', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDSVN', packages=['swh.loader.svn'], # packages's modules scripts=[], # scripts to package install_requires=parse_requirements(), setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, ) diff --git a/swh.loader.svn.egg-info/PKG-INFO b/swh.loader.svn.egg-info/PKG-INFO index 5df92a7..0034acc 100644 --- a/swh.loader.svn.egg-info/PKG-INFO +++ b/swh.loader.svn.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.svn -Version: 0.0.22 +Version: 0.0.23 Summary: Software Heritage Loader SVN Home-page: https://forge.softwareheritage.org/diffusion/DLDSVN Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.svn.egg-info/SOURCES.txt b/swh.loader.svn.egg-info/SOURCES.txt index 1bafddb..c576d79 100644 --- a/swh.loader.svn.egg-info/SOURCES.txt +++ b/swh.loader.svn.egg-info/SOURCES.txt @@ -1,44 +1,45 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README README-dev.org errors.org +requirements-swh.txt requirements.txt setup.py svn-lib-client-analysis.org version.txt bin/init-svn-repository.sh bin/swh-hashtree bin/swh-svn debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/comparison-git-svn-swh-svn.org docs/swh-loader-svn.txt install/install-pysvn.sh install/install-subvertpy.sh resources/svn.ini swh.loader.svn.egg-info/PKG-INFO swh.loader.svn.egg-info/SOURCES.txt swh.loader.svn.egg-info/dependency_links.txt swh.loader.svn.egg-info/requires.txt swh.loader.svn.egg-info/top_level.txt swh/loader/svn/__init__.py swh/loader/svn/converters.py swh/loader/svn/loader.py swh/loader/svn/producer.py swh/loader/svn/ra.py swh/loader/svn/svn.py swh/loader/svn/tasks.py swh/loader/svn/utils.py swh/loader/svn/tests/test_base.py swh/loader/svn/tests/test_converters.py swh/loader/svn/tests/test_loader.org swh/loader/svn/tests/test_loader.py swh/loader/svn/tests/test_utils.py \ No newline at end of file diff --git a/swh.loader.svn.egg-info/requires.txt b/swh.loader.svn.egg-info/requires.txt index d0f2a02..83fe043 100644 --- a/swh.loader.svn.egg-info/requires.txt +++ b/swh.loader.svn.egg-info/requires.txt @@ -1,10 +1,10 @@ click python-dateutil retrying subvertpy>=0.9.4 swh.core>=0.0.19 swh.loader.core>=0.0.12 swh.model>=0.0.11 -swh.scheduler>=0.0.7 -swh.storage>=0.0.76 +swh.scheduler>=0.0.11 +swh.storage>=0.0.79 vcversioner diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py index 72e2f35..d0cbe48 100644 --- a/swh/loader/svn/converters.py +++ b/swh/loader/svn/converters.py @@ -1,238 +1,242 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from email import utils from swh.core import hashutil from .utils import strdate_to_timestamp def svn_date_to_gitsvn_date(strdate): """Convert a string date to an swh one. Args: strdate: A string formatted for .utils.strdate_to_timestamp to do its jobs Returns: An swh date format with an integer timestamp. """ + ts = strdate_to_timestamp(strdate) return { - 'timestamp': int(strdate_to_timestamp(strdate)), + 'timestamp': { + 'seconds': ts['seconds'], + 'microseconds': 0, + }, 'offset': 0 } def svn_date_to_swh_date(strdate): """Convert a string date to an swh one. Args: strdate: A string formatted for .utils.strdate_to_timestamp to do its jobs Returns: An swh date format """ return { 'timestamp': strdate_to_timestamp(strdate), 'offset': 0 } def svn_author_to_swh_person(author): """Convert an svn author to an swh person. Default policy: No information is added. Args: author (string): the svn author (in bytes) Returns: a dictionary with keys: fullname: the author's associated fullname name: the author's associated name email: None (no email in svn) """ if not author: return {'fullname': b'', 'name': None, 'email': None} author = author.encode('utf-8') if b'<' in author and b'>' in author: name, email = utils.parseaddr(author.decode('utf-8')) return { 'fullname': author, 'name': name.encode('utf-8'), 'email': email.encode('utf-8') } return {'fullname': author, 'email': None, 'name': author} def svn_author_to_gitsvn_person(author, repo_uuid): """Convert an svn author to a person suitable for insertion. Default policy: If no email is found, the email is created using the author and the repo_uuid. Args: author (string): the svn author (in bytes) repo_uuid (bytes): the repository's uuid Returns: a dictionary with keys: fullname: the author's associated fullname name: the author's associated name email: None (no email in svn) """ if not author: author = '(no author)' author = author.encode('utf-8') if b'<' in author and b'>' in author: name, email = utils.parseaddr(author.decode('utf-8')) return { 'fullname': author, 'name': name.encode('utf-8'), 'email': email.encode('utf-8') } # we'll construct the author's fullname the same way git svn does # 'user ' email = b'@'.join([author, repo_uuid]) return { 'fullname': b''.join([author, b' ', b'<', email, b'>']), 'name': author, 'email': email, } def build_swh_revision(rev, commit, repo_uuid, dir_id, parents): """Given a svn revision, build a swh revision. This adds an ['metadata']['extra-headers'] entry with the repository's uuid and the svn revision. Args: - rev: the svn revision number - commit: the commit metadata - repo_uuid: The repository's uuid - dir_id: the tree's hash identifier - parents: the revision's parents identifier Returns: The swh revision dictionary. """ author = commit['author_name'] msg = commit['message'] date = commit['author_date'] metadata = { 'extra_headers': [ ['svn_repo_uuid', repo_uuid], ['svn_revision', str(rev).encode('utf-8')] ] } return { 'date': date, 'committer_date': date, 'type': 'svn', 'directory': dir_id, 'message': msg, 'author': author, 'committer': author, 'synthetic': True, 'metadata': metadata, 'parents': parents, } def build_gitsvn_swh_revision(rev, commit, dir_id, parents): """Given a svn revision, build a swh revision. Args: - rev: the svn revision number - commit: the commit metadata - dir_id: the tree's hash identifier - parents: the revision's parents identifier Returns: The swh revision dictionary. """ author = commit['author_name'] msg = commit['message'] date = commit['author_date'] return { 'date': date, 'committer_date': date, 'type': 'svn', 'directory': dir_id, 'message': msg, 'author': author, 'committer': author, 'synthetic': True, 'metadata': None, 'parents': parents, } def build_swh_occurrence(revision_id, origin_id, visit): """Build a swh occurrence from the revision id, origin id, and date. """ return {'branch': 'master', 'target': revision_id, 'target_type': 'revision', 'origin': origin_id, 'visit': visit} def loader_to_scheduler_revision(swh_revision): """To avoid serialization or scheduler storage problem, transform adequately the revision. FIXME: Should be more generically dealt with in swh-scheduler's side. The advantage to having it here is that we known what we store. """ if not swh_revision: return None metadata = swh_revision['metadata'] for entry in (e for e in metadata['extra_headers'] if isinstance(e[1], bytes)): entry[1] = entry[1].decode('utf-8') return { 'id': hashutil.hash_to_hex(swh_revision['id']), 'parents': [hashutil.hash_to_hex(parent) for parent in swh_revision['parents']], 'metadata': metadata } def scheduler_to_loader_revision(swh_revision): """If the known state (a revision) is already passed, it will be serializable ready but not loader ready. FIXME: Should be more generically dealt with in swh-scheduler's side. The advantage to having it here is that we known what we store. """ if not swh_revision: return None return { 'id': hashutil.hex_to_hash(swh_revision['id']), 'parents': [hashutil.hex_to_hash(parent) for parent in swh_revision['parents']], 'metadata': swh_revision['metadata'] } diff --git a/swh/loader/svn/producer.py b/swh/loader/svn/producer.py index 0c9ccea..aa15a8c 100644 --- a/swh/loader/svn/producer.py +++ b/swh/loader/svn/producer.py @@ -1,115 +1,106 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import sys - -def get_task(task_name): - """Retrieve task object in the application by its fully qualified name. - - """ - from swh.scheduler.celery_backend.config import app - for module in app.conf.CELERY_IMPORTS: - __import__(module) - - return app.tasks[task_name] +from swh.scheduler.utils import get_task def _produce_svn_to_load( svn_url, origin_url, destination_path=None, visit_date=None, synchroneous=False, task_name='swh.loader.svn.tasks.LoadSWHSvnRepositoryTsk'): """Produce svn urls on the message queue. Those urls can either be read from stdin or directly passed as argument. """ task = get_task(task_name) if not synchroneous and svn_url: task.delay(svn_url=svn_url, origin_url=origin_url, visit_date=visit_date, destination_path=destination_path) elif synchroneous and svn_url: # for debug purpose task(svn_url=svn_url, origin_url=origin_url, visit_date=visit_date, destination_path=destination_path) else: # input from stdin, so we ignore most of the function's input for line in sys.stdin: line = line.rstrip() data = line.split(' ') svn_url = data[0] if len(data) > 1: origin_url = data[1] else: origin_url = None if svn_url: print(svn_url, origin_url) task.delay(svn_url=svn_url, origin_url=origin_url, destination_path=destination_path) def _produce_archive_to_mount_and_load( archive_path, visit_date, task_name='swh.loader.svn.tasks.MountAndLoadSvnRepositoryTsk'): task = get_task(task_name) if archive_path: task.delay(archive_path) else: for line in sys.stdin: line = line.rstrip() data = line.split(' ') archive_path = data[0] if len(data) > 1: origin_url = data[1] else: origin_url = None if archive_path: print(archive_path, origin_url) task.delay(archive_path, origin_url, visit_date=visit_date) @click.group() def cli(): pass @cli.command('svn', help='Default svn urls producer') @click.option('--url', help="svn repository's mirror url.") @click.option('--origin-url', default=None, help='svn repository\'s original remote url ' '(if different than --svn-url).') @click.option('--destination-path', help="(optional) svn checkout destination.") @click.option('--visit-date', help="(optional) visit date to override") @click.option('--synchroneous', is_flag=True, help="To execute directly the svn loading.") def produce_svn_to_load(url, origin_url, destination_path, visit_date, synchroneous): _produce_svn_to_load(svn_url=url, origin_url=origin_url, destination_path=destination_path, synchroneous=synchroneous) @cli.command('svn-archive', help='Default svndump archive producer') @click.option('--visit-date', help="(optional) visit date to override") @click.option('--path', help="Archive's Path to load and mount") def produce_archive_to_mount_and_load(path, visit_date): _produce_archive_to_mount_and_load(path, visit_date) if __name__ == '__main__': cli() diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py index d2e28b5..6a90bcc 100644 --- a/swh/loader/svn/tests/test_converters.py +++ b/swh/loader/svn/tests/test_converters.py @@ -1,321 +1,368 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.loader.svn import converters class TestAuthorGitSvnConverters(unittest.TestCase): @istest def svn_author_to_gitsvn_person(self): """The author should have name, email and fullname filled. """ actual_person = converters.svn_author_to_gitsvn_person( 'tony ', repo_uuid=None) self.assertEquals(actual_person, { 'fullname': b'tony ', 'name': b'tony', 'email': b'ynot@dagobah', }) @istest def svn_author_to_gitsvn_person_no_email(self): """The author should see his/her email filled with author@. """ actual_person = converters.svn_author_to_gitsvn_person( 'tony', repo_uuid=b'some-uuid') self.assertEquals(actual_person, { 'fullname': b'tony ', 'name': b'tony', 'email': b'tony@some-uuid', }) @istest def svn_author_to_gitsvn_person_empty_person(self): """The empty person should see name, fullname and email filled. """ actual_person = converters.svn_author_to_gitsvn_person( '', repo_uuid=b'some-uuid') self.assertEqual(actual_person, { 'fullname': b'(no author) <(no author)@some-uuid>', 'name': b'(no author)', 'email': b'(no author)@some-uuid' }) class TestAuthorSWHConverters(unittest.TestCase): @istest def svn_author_to_swh_person(self): """The author should have name, email and fullname filled. """ actual_person = converters.svn_author_to_swh_person( 'tony ') self.assertEquals(actual_person, { 'fullname': b'tony ', 'name': b'tony', 'email': b'ynot@dagobah', }) @istest def svn_author_to_swh_person_no_email(self): """The author and fullname should be the same as the input (author). """ actual_person = converters.svn_author_to_swh_person('tony') self.assertEquals(actual_person, { 'fullname': b'tony', 'name': b'tony', 'email': None, }) @istest def svn_author_to_swh_person_empty_person(self): """Empty person has only its fullname filled with the empty byte-string. """ actual_person = converters.svn_author_to_swh_person('') self.assertEqual(actual_person, { 'fullname': b'', 'name': None, 'email': None, }) class TestSWHRevisionConverters(unittest.TestCase): @istest def build_swh_revision_default(self): """This should build the swh revision with the swh revision's extra headers about the repository. """ actual_swh_revision = converters.build_swh_revision( repo_uuid=b'uuid', dir_id='dir-id', commit={ 'author_name': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'message': b'commit message', 'author_date': { - 'timestamp': 1088108379, + 'timestamp': { + 'seconds': 1088108379, + 'microseconds': 0, + }, 'offset': 0 } }, rev=10, parents=['123']) - date = {'timestamp': 1088108379, 'offset': 0} + date = { + 'timestamp': { + 'seconds': 1088108379, + 'microseconds': 0, + }, + 'offset': 0, + } self.assertEquals(actual_swh_revision, { 'date': date, 'committer_date': date, 'type': 'svn', 'directory': 'dir-id', 'message': b'commit message', 'author': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'committer': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'synthetic': True, 'metadata': { 'extra_headers': [ ['svn_repo_uuid', b'uuid'], ['svn_revision', b'10'], ] }, 'parents': ['123'], }) class TestGitSvnRevisionConverters(unittest.TestCase): @istest def build_gitsvn_swh_revision_default(self): """This should build the swh revision without the swh revision's extra headers about the repository. """ actual_swh_revision = converters.build_gitsvn_swh_revision( dir_id='dir-id', commit={ 'author_name': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'message': b'commit message', 'author_date': { - 'timestamp': 1088108379, + 'timestamp': { + 'seconds': 1088108379, + 'microseconds': 0, + }, 'offset': 0 } }, rev=10, parents=['123']) - date = {'timestamp': 1088108379, 'offset': 0} + date = { + 'timestamp': { + 'seconds': 1088108379, + 'microseconds': 0, + }, + 'offset': 0, + } self.assertEquals(actual_swh_revision, { 'date': date, 'committer_date': date, 'type': 'svn', 'directory': 'dir-id', 'message': b'commit message', 'author': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'committer': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'synthetic': True, 'metadata': None, 'parents': ['123'], }) class TestSWHOccurrence(unittest.TestCase): @istest def build_swh_occurrence(self): actual_occ = converters.build_swh_occurrence('revision-id', 'origin-id', visit=10) self.assertEquals(actual_occ, { 'branch': 'master', 'target': 'revision-id', 'target_type': 'revision', 'origin': 'origin-id', 'visit': 10 }) class ConvertSWHDate(unittest.TestCase): @istest def svn_date_to_swh_date(self): """The timestamp should not be tampered with and include the decimals. """ self.assertEquals( - converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z'), - { - 'timestamp': 1306821879.5009, + converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z'), { + 'timestamp': { + 'seconds': 1306821879, + 'microseconds': 500900, + }, 'offset': 0 }) self.assertEquals( converters.svn_date_to_swh_date('2011-05-31T06:04:39.800722Z'), { - 'timestamp': 1306821879.800722, + 'timestamp': { + 'seconds': 1306821879, + 'microseconds': 800722, + }, 'offset': 0 }) @istest def svn_date_to_swh_date_epoch(self): """Empty date should be EPOCH (timestamp and offset at 0).""" # It should return 0, epoch - self.assertEquals({'timestamp': 0, 'offset': 0}, - converters.svn_date_to_swh_date('')) - self.assertEquals({'timestamp': 0, 'offset': 0}, - converters.svn_date_to_swh_date(None)) + self.assertEquals({ + 'timestamp': { + 'seconds': 0, + 'microseconds': 0, + }, + 'offset': 0, + }, converters.svn_date_to_swh_date('')) + self.assertEquals({ + 'timestamp': { + 'seconds': 0, + 'microseconds': 0, + }, 'offset': 0, + }, converters.svn_date_to_swh_date(None)) class ConvertGitSvnDate(unittest.TestCase): @istest def svn_date_to_gitsvn_date(self): """The timestamp should be truncated to be an integer.""" actual_ts = converters.svn_date_to_gitsvn_date( '2011-05-31T06:04:39.800722Z') - self.assertEquals(actual_ts, - {'timestamp': 1306821879, 'offset': 0}) + self.assertEquals(actual_ts, { + 'timestamp': { + 'seconds': 1306821879, + 'microseconds': 0, + }, + 'offset': 0, + }) @istest def svn_date_to_gitsvn_date_epoch(self): """Empty date should be EPOCH (timestamp and offset at 0).""" # It should return 0, epoch - self.assertEquals({'timestamp': 0, 'offset': 0}, - converters.svn_date_to_gitsvn_date('')) - self.assertEquals({'timestamp': 0, 'offset': 0}, - converters.svn_date_to_gitsvn_date(None)) + self.assertEquals({ + 'timestamp': { + 'seconds': 0, + 'microseconds': 0, + }, + 'offset': 0, + }, converters.svn_date_to_gitsvn_date('')) + self.assertEquals({ + 'timestamp': { + 'seconds': 0, + 'microseconds': 0, + }, + 'offset': 0, + }, converters.svn_date_to_gitsvn_date(None)) class ConvertSWHRevision(unittest.TestCase): @istest def loader_to_scheduler_revision(self): actual_rev = converters.loader_to_scheduler_revision({ 'parents': [b'e\n\xbe\xe9\xc0\x87y\xfeG\xf7\xcfG\x82h\xa8i\xe8\xfe\xe2\x13'], # noqa 'id': b'\xedd\x92w\xab\xb2\x16,\xea*\x90O8\x0f\x96/\xfb\xd4\x16`', 'metadata': { 'extra_headers': [ ['svn_repo_uuid', b'bc7d6c17-68a5-4917-9c54-c565d7424229'], ['svn_revision', b'4'] ] } }) self.assertEquals(actual_rev, { 'id': 'ed649277abb2162cea2a904f380f962ffbd41660', 'parents': ['650abee9c08779fe47f7cf478268a869e8fee213'], 'metadata': { 'extra_headers': [ ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'], ['svn_revision', '4'] ] } }) @istest def loader_to_scheduler_revision_none(self): self.assertIsNone(converters.loader_to_scheduler_revision(None)) @istest def scheduler_to_loader_revision(self): actual_rev = converters.scheduler_to_loader_revision({ 'id': 'ed649277abb2162cea2a904f380f962ffbd41660', 'parents': ['650abee9c08779fe47f7cf478268a869e8fee213'], 'metadata': { 'extra_headers': [ ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'], ['svn_revision', '4'] ] } }) self.assertEquals(actual_rev, { 'parents': [b'e\n\xbe\xe9\xc0\x87y\xfeG\xf7\xcfG\x82h\xa8i\xe8\xfe\xe2\x13'], # noqa 'id': b'\xedd\x92w\xab\xb2\x16,\xea*\x90O8\x0f\x96/\xfb\xd4\x16`', 'metadata': { 'extra_headers': [ ['svn_repo_uuid', 'bc7d6c17-68a5-4917-9c54-c565d7424229'], ['svn_revision', '4'] ] } }) @istest def scheduler_to_loader_revision_none(self): self.assertIsNone(converters.scheduler_to_loader_revision(None)) diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py index 32a918a..f102966 100644 --- a/swh/loader/svn/tests/test_utils.py +++ b/swh/loader/svn/tests/test_utils.py @@ -1,189 +1,193 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import unittest from nose.tools import istest from test_base import BaseTestTreeLoader from swh.loader.svn import utils from swh.model import git class TestUtils(unittest.TestCase): @istest def strdate_to_timestamp(self): """Formatted string date should be converted in timestamp.""" actual_ts = utils.strdate_to_timestamp('2011-05-31T06:04:39.800722Z') - self.assertEquals(actual_ts, 1306821879.800722) + self.assertEquals(actual_ts, {'seconds': 1306821879, + 'microseconds': 800722}) actual_ts = utils.strdate_to_timestamp('2011-05-31T06:03:39.123450Z') - self.assertEquals(actual_ts, 1306821819.12345) + self.assertEquals(actual_ts, {'seconds': 1306821819, + 'microseconds': 123450}) @istest def strdate_to_timestamp_empty_does_not_break(self): """Empty or None date should be timestamp 0.""" - self.assertEquals(0, utils.strdate_to_timestamp('')) - self.assertEquals(0, utils.strdate_to_timestamp(None)) + self.assertEquals({'seconds': 0, 'microseconds': 0}, + utils.strdate_to_timestamp('')) + self.assertEquals({'seconds': 0, 'microseconds': 0}, + utils.strdate_to_timestamp(None)) class TestHashesConvert(unittest.TestCase): def setUp(self): self.hashes = { b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox': { 'checksums': { 'name': b'pkg-fox', 'sha1_git': b'\xad\xdf2x\x1fBX\xdb\xe8Adt\xc9\xf5~\xcb6\x98^\xbf', # noqa 'path': b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox' }, 'children': { b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/fox-1.2', b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/fox-1.4' } }, b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/fox-1.4': { 'checksums': 'something', 'children': set() }, b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/fox-1.2': { 'checksums': 'something' }, b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/fox-1.3': { 'checksums': 'or something', 'children': { b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/some/path' } } } self.expected_output = { b'': { 'checksums': { 'name': b'pkg-fox', 'sha1_git': b'\xad\xdf2x\x1fBX\xdb\xe8Adt\xc9\xf5~\xcb6\x98^\xbf', # noqa 'path': b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox' }, 'children': { b'fox-1.2', b'fox-1.4' } }, b'fox-1.4': { 'checksums': 'something', 'children': set() }, b'fox-1.2': { 'checksums': 'something', }, b'fox-1.3': { 'checksums': 'or something', 'children': { b'some/path' } } } @istest def convert_hashes_with_relative_path(self): actual_output = utils.convert_hashes_with_relative_path( self.hashes, b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox') self.assertEquals(actual_output, self.expected_output) @istest def convert_hashes_with_relative_path_with_slash(self): actual_output = utils.convert_hashes_with_relative_path( self.hashes, b'/tmp/tmp.c39vkrp1.swh.loader/pkg-fox/') self.assertEquals(actual_output, self.expected_output) class HashtreeITTest(BaseTestTreeLoader): @istest def hashtree_not_existing_path(self): # path does not exist with self.assertRaises(ValueError): utils.hashtree('/not/exists', ignore_empty_folder=False) @istest def hashtree_not_a_dir(self): fpath = '/tmp/foobar' with open(fpath, 'wb') as f: f.write(b'foo') # path is not a folder with self.assertRaises(ValueError): utils.hashtree(fpath, ignore_empty_folder=True) os.unlink(fpath) @istest def hashtree_with_empty_folder(self): # not ignoring empty folder # no pattern to ignore # this is the base case root_hash = self.tmp_root_path.encode('utf-8') actual_hash = utils.hashtree(root_hash, ignore_empty_folder=False) expected_hashes = git.compute_hashes_from_directory( self.tmp_root_path.encode('utf-8')) expected_hash = expected_hashes[root_hash]['checksums']['sha1_git'] self.assertEquals(actual_hash['sha1_git'], expected_hash) @istest def hashtree_ignore_pattern_with_empty_folder(self): # not ignoring empty folder # 'empty-folder' pattern to ignore root_hash = self.tmp_root_path.encode('utf-8') actual_hash = utils.hashtree(root_hash, ignore_empty_folder=False, ignore=['empty-folder']) expected_hashes = git.compute_hashes_from_directory( self.tmp_root_path.encode('utf-8'), dir_ok_fn=lambda dp: b'empty-folder' not in dp) expected_hash = expected_hashes[root_hash]['checksums']['sha1_git'] self.assertEquals(actual_hash['sha1_git'], expected_hash) @istest def hashtree_ignore_pattern_no_empty_folder(self): # ignoring empty folder # '/barfoo/' pattern to ignore root_hash = self.tmp_root_path.encode('utf-8') actual_hash = utils.hashtree(root_hash, ignore_empty_folder=True, ignore=['/barfoo/']) def ignore_fn(dp): return b'/barfoo/' not in dp expected_hashes = git.compute_hashes_from_directory( self.tmp_root_path.encode('utf-8'), dir_ok_fn=ignore_fn, remove_empty_folder=True) expected_hash = expected_hashes[root_hash]['checksums']['sha1_git'] self.assertEquals(actual_hash['sha1_git'], expected_hash) @istest def hashtree_no_ignore_pattern_no_empty_folder(self): # ignoring empty folder root_hash = self.tmp_root_path.encode('utf-8') actual_hash = utils.hashtree(root_hash, ignore_empty_folder=True) expected_hashes = git.compute_hashes_from_directory( self.tmp_root_path.encode('utf-8'), remove_empty_folder=True) expected_hash = expected_hashes[root_hash]['checksums']['sha1_git'] self.assertEquals(actual_hash['sha1_git'], expected_hash) diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py index 354cead..e82db15 100644 --- a/swh/loader/svn/utils.py +++ b/swh/loader/svn/utils.py @@ -1,175 +1,178 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile import shutil from dateutil import parser from subprocess import PIPE, Popen, call from swh.model import git def strdate_to_timestamp(strdate): """Convert a string date to an int timestamp. Args: strdate: A string representing a date with format like 'YYYY-mm-DDTHH:MM:SS.800722Z' Returns: - A timestamp in float + A couple of integers: seconds, microseconds """ if strdate: dt = parser.parse(strdate) - ts_float = dt.timestamp() + ts = { + 'seconds': int(dt.timestamp()), + 'microseconds': dt.microsecond, + } else: # epoch - ts_float = 0 - return ts_float + ts = {'seconds': 0, 'microseconds': 0} + return ts def convert_hashes_with_relative_path(hashes, rootpath): """A function to ease the transformation of absolute path to relative ones. This is an implementation detail: - swh.loader.svn.ra compute hashes and store keys with relative paths - swh.model.git compute hashes and store keys with full paths """ if rootpath.endswith(b'/'): rootpath = rootpath[:-1] root_value = hashes.pop(rootpath) if not rootpath.endswith(b'/'): rootpath = rootpath + b'/' def _replace_slash(s, rootpath=rootpath): return s.replace(rootpath, b'') def _update_children(children): return set((_replace_slash(c) for c in children)) h = { b'': { 'checksums': root_value['checksums'], 'children': _update_children(root_value['children']) } } for path, v in hashes.items(): p = _replace_slash(path) if 'children' in v: v['children'] = _update_children(v['children']) h[p] = v return h def hashtree(path, ignore_empty_folder=False, ignore=None): """Given a path and options, compute the hash's upper tree. This is not for production use. It's merely a helper function used mainly in bin/swh-hashtree Args: - path: The path to hash - ignore_empty_folder: An option to ignore empty folder - ignore: An option to ignore patterns in directory names. Returns: The path's checksums respecting the options passed as parameters. """ if os.path.exists(path): if not os.path.isdir(path): raise ValueError('%s should be a directory!' % path) else: raise ValueError('%s should exist!' % path) if isinstance(path, str): path = path.encode('utf-8') if ignore: patterns = [] for exc in ignore: patterns.append(exc.encode('utf-8')) def dir_ok_fn_basic(dirpath, patterns=patterns): dname = os.path.basename(dirpath) for pattern_to_ignore in patterns: if pattern_to_ignore == dname: return False if (pattern_to_ignore + b'/') in dirpath: return False return True if ignore_empty_folder: def dir_ok_fn(dirpath, patterns=patterns): if not dir_ok_fn_basic(dirpath): return False return os.listdir(dirpath) != [] else: dir_ok_fn = dir_ok_fn_basic else: if ignore_empty_folder: def dir_ok_fn(dirpath): return os.listdir(dirpath) != [] else: dir_ok_fn = git.default_validation_dir objects = git.compute_hashes_from_directory( path, dir_ok_fn=dir_ok_fn) h = objects[path]['checksums'] return h def init_svn_repo_from_archive_dump(archive_path, root_temp_dir='/tmp'): """Given a path to an archive containing an svn dump. Initialize an svn repository with the content of said dump. Returns: A tuple: - temporary folder: containing the mounted repository - repo_path, path to the mounted repository inside the temporary folder Raises: ValueError in case of failure to run the command to uncompress and load the dump. """ project_name = os.path.basename(os.path.dirname(archive_path)) temp_dir = tempfile.mkdtemp(suffix='.swh.loader.svn', prefix='tmp.', dir=root_temp_dir) try: repo_path = os.path.join(temp_dir, project_name) # create the repository that will be loaded with the dump cmd = ['svnadmin', 'create', repo_path] r = call(cmd) if r != 0: raise ValueError( 'Failed to initialize empty svn repo for %s' % project_name) with Popen(['pigz', '-dc', archive_path], stdout=PIPE) as dump: cmd = ['svnadmin', 'load', '-q', repo_path] r = call(cmd, stdin=dump.stdout) if r != 0: raise ValueError( 'Failed to mount the svn dump for project %s' % project_name) return temp_dir, repo_path except Exception as e: shutil.rmtree(temp_dir) raise e diff --git a/version.txt b/version.txt index fccfd54..50caf36 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.22-0-gb62f9ee \ No newline at end of file +v0.0.23-0-g8b3a85c \ No newline at end of file