Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F11023726
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
View Options
diff --git a/resources/svn.ini b/resources/svn.ini
index 45a10dd..51a574a 100644
--- a/resources/svn.ini
+++ b/resources/svn.ini
@@ -1,26 +1,30 @@
[main]
storage_class = remote_storage
storage_args = http://localhost:5000/
send_contents = True
send_directories = True
send_revisions = True
send_releases = True
send_occurrences = True
# maximum number of contents to send for storage (if the size threshold is not reached first)
content_packet_size = 10000
# 100 MiB of content data (size threshold of data before sending for storage)
content_packet_block_size_bytes = 104857600
# limit for swh content storage for one blob (beyond that limit, the
# content's data is not sent for storage)
content_packet_size_bytes = 1073741824
# packet of directories to send for storage
directory_packet_size = 25000
# packet of revisions to send for storage
revision_packet_size = 10000
# packet of releases to send for storage
release_packet_size = 100000
# packet of occurrences to send for storage
occurrence_packet_size = 100000
+
+### for git-svn comparison
+
# Determine if we use the git's extra headers or not
# This should always be True for production
-revision_with_headers = True
+with_revision_headers = True
+# Determine if we consider the empty directory for sha1_git
diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py
index 8071aca..734e129 100644
--- a/swh/loader/svn/converters.py
+++ b/swh/loader/svn/converters.py
@@ -1,93 +1,93 @@
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from email import utils
def svn_author_to_person(author, repo_uuid):
    """Turn a raw svn author into a person dictionary ready for insertion.

    Args:
        author (bytes): the svn author as recorded on the commit
        repo_uuid (string): the repository's uuid; only used when the
            author carries no '<email>' part

    Returns: a dictionary with keys 'fullname', 'name' and 'email':
        - every value is None when author is empty or None
        - values are parsed out of author when it already looks like
          'name <email>' (fullname is author, untouched)
        - otherwise values are built the same way git svn does:
          'author <author@repo_uuid>'

    """
    person = {'fullname': None, 'name': None, 'email': None}
    if not author:
        return person

    has_address = b'<' in author and b'>' in author
    if has_address:
        parsed_name, parsed_email = utils.parseaddr(author.decode('utf-8'))
        person['fullname'] = author
        person['name'] = parsed_name.encode('utf-8')
        person['email'] = parsed_email.encode('utf-8')
        return person

    # no address in the author: synthesize one like git svn does,
    # i.e. 'user <user@repo-uuid>'
    synthetic_email = b'%s@%s' % (author, repo_uuid.encode('utf-8'))
    person['fullname'] = b'%s <%s>' % (author, synthetic_email)
    person['name'] = author
    person['email'] = synthetic_email
    return person
def build_swh_revision(repo_uuid, commit, rev, dir_id, parents,
                       with_revision_headers=True):
    """Given a svn revision, build a swh revision.

    Args:
        repo_uuid (string): the repository's uuid
        commit (dict): svn commit data; the keys 'author_name',
            'message' and 'author_date' are read
        rev: the svn revision number
        dir_id: identifier of the root directory for that revision
        parents (list): identifiers of the parent swh revisions
        with_revision_headers (bool): when True, the repository uuid and
            the svn revision are recorded as 'extra_headers' in the
            revision's metadata; when False, metadata is None

    Returns:
        Dictionary representing the swh revision.  Author and committer
        are the same person, date and committer_date the same date (the
        commit's author date truncated to an integer, with offset 0).

    """
    author = svn_author_to_person(commit['author_name'], repo_uuid)
    msg = commit['message']

    date = {
        'timestamp': int(commit['author_date']),
        'offset': 0,
    }

    if with_revision_headers:
        metadata = {
            'extra_headers': [
                ['svn_repo_uuid', repo_uuid],
                ['svn_revision', rev]
            ]
        }
    else:
        metadata = None

    return {
        'date': date,
        'committer_date': date,
        'type': 'svn',
        'directory': dir_id,
        'message': msg,
        'author': author,
        'committer': author,
        'synthetic': True,
        'metadata': metadata,
        'parents': parents,
    }
def build_swh_occurrence(revision_id, origin_id, date):
    """Build the swh occurrence targeting a revision.

    The occurrence is always on branch 'master' and always targets a
    revision; revision_id, origin_id and date are stored as given.

    """
    occurrence = dict(
        branch='master',
        target=revision_id,
        target_type='revision',
        origin=origin_id,
        date=date,
    )
    return occurrence
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
index 8e89bad..5bc332a 100644
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -1,216 +1,217 @@
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from swh.core import utils
from swh.model import git, hashutil
from swh.model.git import GitType
from swh.loader.vcs import loader
from swh.loader.svn import svn, converters
def objects_per_type(objects_per_path):
    """Regroup by type the objects of a per-path object dictionary.

    Args:
        objects_per_path: object dictionary as returned by
            `swh.model.git.walk_and_compute_sha1_from_directory`; each
            value is an iterable of objects exposing a 'type' key

    Returns:
        Dictionary with keys:
        - GitType.BLOB: list of blobs
        - GitType.TREE: list of directories

    """
    grouped = {kind: [] for kind in (GitType.BLOB, GitType.TREE)}

    for path in objects_per_path:
        for entry in objects_per_path[path]:
            grouped[entry['type']].append(entry)

    return grouped
class SvnLoader(loader.SWHLoader):
    """Svn loader to load one svn repository.

    Configuration keys read by this class: 'with_revision_headers',
    'with_empty_folder' and 'revision_packet_size'.

    """
    def __init__(self, config, origin_id):
        super().__init__(config,
                         origin_id,
                         logging_class='swh.loader.svn.SvnLoader')
        # Both flags are read as strings: only 'true' (case insensitive)
        # turns them on.
        self.with_revision_headers = (
            self.config['with_revision_headers'].lower() == 'true')
        self.with_empty_folder = (
            self.config['with_empty_folder'].lower() == 'true')

    def check_history_not_altered(self, svnrepo, revision_start, swh_rev):
        """Given a svn repository, check if the history was not tampered with.

        The swh revision is recomputed from the svn repository at
        revision_start and its identifier compared with swh_rev's.

        Args:
            svnrepo: the svn repository to read from
            revision_start: svn revision to recompute
            swh_rev (dict): known swh revision; 'id' and 'parents' are
                read

        Returns:
            True if the recomputed identifier matches swh_rev['id'],
            False otherwise.

        """
        revision_id = swh_rev['id']
        parents = swh_rev['parents']
        hash_data_per_revs = svnrepo.swh_hash_data_per_revision(
            revision_start, revision_start)

        rev, _, commit, objects_per_path = list(hash_data_per_revs)[0]

        dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
        # Build the revision with the same header setting as
        # process_svn_revisions, so that both identifiers are comparable.
        swh_revision = converters.build_swh_revision(
            svnrepo.uuid,
            commit,
            rev,
            dir_id,
            parents,
            with_revision_headers=self.with_revision_headers)
        swh_revision_id = git.compute_revision_sha1_git(swh_revision)

        return swh_revision_id == revision_id

    def process_svn_revisions(self, svnrepo, revision_start, revision_end,
                              revision_parents):
        """Process revisions from revision_start to revision_end and send to
        swh for storage.

        At each svn revision, checkout the repository, compute the
        tree hash and blobs and send for swh storage to store.
        Then computes and yields the swh revision.

        revision_parents is updated along the way: each computed
        revision becomes the parent of the next svn revision.

        Yields:
            swh revision

        """
        gen_revs = svnrepo.swh_hash_data_per_revision(revision_start,
                                                      revision_end)
        for rev, nextrev, commit, objects_per_path in gen_revs:
            # compute the fs tree's checksums
            dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
            # BEWARE: if with_revision_headers is False, svn repo update
            # won't work...
            swh_revision = converters.build_swh_revision(
                svnrepo.uuid,
                commit,
                rev,
                dir_id,
                revision_parents[rev],
                with_revision_headers=self.with_revision_headers)
            swh_revision['id'] = git.compute_revision_sha1_git(swh_revision)
            self.log.debug('rev: %s, swhrev: %s, dir: %s' % (
                rev,
                hashutil.hash_to_hex(swh_revision['id']),
                hashutil.hash_to_hex(dir_id)))

            if nextrev:
                revision_parents[nextrev] = [swh_revision['id']]

            objects = objects_per_type(objects_per_path)
            self.maybe_load_contents(objects[GitType.BLOB])
            self.maybe_load_directories(objects[GitType.TREE],
                                        objects_per_path)
            yield swh_revision

    def process_swh_revisions(self,
                              svnrepo,
                              revision_start,
                              revision_end,
                              revision_parents):
        """Process and store revisions to swh (sent by blocks of
        'revision_packet_size').

        Returns:
            The latest revision stored, or None when no revision was
            produced for the given range.

        """
        latest_rev = None
        for revisions in utils.grouper(
                self.process_svn_revisions(svnrepo,
                                           revision_start,
                                           revision_end,
                                           revision_parents),
                self.config['revision_packet_size']):
            revs = list(revisions)
            self.maybe_load_revisions(revs)
            if revs:
                latest_rev = revs[-1]

        return latest_rev

    def process_swh_occurrence(self, revision, origin):
        """Process and load the occurrence pointing to the latest revision.

        """
        occ = converters.build_swh_occurrence(revision['id'],
                                              origin['id'],
                                              datetime.datetime.utcnow())
        self.log.debug('occ: %s' % occ)
        self.maybe_load_occurrences([occ])

    def process(self, svn_url, origin, destination_path):
        """Load a svn repository in swh.

        Checkout the svn repository locally in destination_path.

        Args:
            - svn_url: svn repository url to import
            - origin: Dictionary origin
              - id: origin's id
              - url: url origin we fetched
              - type: type of the origin
            - destination_path: where to checkout the repository

        Returns:
            Dictionary with the following keys:
            - status: mandatory, the status result as a boolean
            - stderr: optional when status is True, mandatory otherwise

        """
        svnrepo = svn.SvnRepo(svn_url, origin['id'], self.storage,
                              destination_path)

        try:
            swh_rev = svnrepo.swh_previous_revision()

            if swh_rev:
                # resume from the svn revision recorded in the latest
                # known swh revision
                extra_headers = dict(swh_rev['metadata']['extra_headers'])
                revision_start = extra_headers['svn_revision']
                revision_parents = {
                    revision_start: swh_rev['parents']
                }
            else:
                revision_start = 1
                revision_parents = {
                    revision_start: []
                }

            svnrepo.fork(revision_start)
            self.log.debug('svn co %s@%s' % (svn_url, revision_start))

            if swh_rev and not self.check_history_not_altered(svnrepo,
                                                              revision_start,
                                                              swh_rev):
                msg = 'History of svn %s@%s history modified. Skipping...' % (
                    svn_url, revision_start)
                self.log.warning(msg)
                return {'status': False, 'stderr': msg}

            revision_end = svnrepo.head_revision()

            self.log.info('[revision_start-revision_end]: [%s-%s]' % (
                revision_start, revision_end))

            # value comparison on purpose: identity ('is not') on an int
            # only works by accident of the interpreter's int caching
            if revision_start == revision_end and revision_start != 1:
                self.log.info('%s@%s already injected.' % (svn_url,
                                                           revision_end))
                return {'status': True}

            self.log.info('Repo %s ready to be processed.' % svnrepo)

            # process and store revision to swh (sent by blocks of
            # 'revision_packet_size')
            latest_rev = self.process_swh_revisions(svnrepo,
                                                    revision_start,
                                                    revision_end,
                                                    revision_parents)
            # no occurrence can be built when nothing was processed
            if latest_rev is not None:
                self.process_swh_occurrence(latest_rev, origin)

            # flush eventual remaining data
            self.flush()
        finally:
            svnrepo.clean_fs()

        return {'status': True}
diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py
index c78e473..7d98fef 100644
--- a/swh/loader/svn/tests/test_converters.py
+++ b/swh/loader/svn/tests/test_converters.py
@@ -1,171 +1,171 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from nose.tools import istest
from swh.loader.svn import converters
class TestConverters(unittest.TestCase):
    """Unit tests for the functions of swh.loader.svn.converters."""

    @istest
    def svn_author_to_person(self):
        # author already holding an address: values are parsed from it
        actual_person = converters.svn_author_to_person(
            b'tony <ynot@dagobah>',
            repo_uuid=None)
        self.assertEqual(actual_person, {
            'fullname': b'tony <ynot@dagobah>',
            'name': b'tony',
            'email': b'ynot@dagobah',
        })

    @istest
    def svn_author_to_person_no_email(self):
        # should not happen - input is bytes but nothing prevents it
        actual_person = converters.svn_author_to_person(b'tony',
                                                        repo_uuid='some-uuid')
        self.assertEqual(actual_person, {
            'fullname': b'tony <tony@some-uuid>',
            'name': b'tony',
            'email': b'tony@some-uuid',
        })

    @istest
    def svn_author_to_person_None(self):
        # should not happen - nothing prevents it though
        actual_person = converters.svn_author_to_person(None,
                                                        repo_uuid=None)
        self.assertEqual(actual_person, {
            'fullname': None,
            'name': None,
            'email': None,
        })

    @istest
    def svn_author_to_person_empty_person(self):
        # should not happen - nothing prevents it though
        actual_person = converters.svn_author_to_person(b'',
                                                        repo_uuid=None)
        self.assertEqual(actual_person, {
            'fullname': None,
            'name': None,
            'email': None,
        })

    @istest
    def build_swh_revision_default(self):
        # default call: extra headers are recorded in the metadata
        actual_swh_revision = converters.build_swh_revision(
            repo_uuid='uuid',
            dir_id='dir-id',
            commit={'author_name': b'theo',
                    'message': b'commit message',
                    'author_date': 1095446497.574042},
            rev=10,
            parents=['123'])

        self.assertEqual(actual_swh_revision, {
            'date': {'timestamp': 1095446497, 'offset': 0},
            'committer_date': {'timestamp': 1095446497,
                               'offset': 0},
            'type': 'svn',
            'directory': 'dir-id',
            'message': b'commit message',
            'author': {
                'name': b'theo',
                'email': b'theo@uuid',
                'fullname': b'theo <theo@uuid>'
            },
            'committer': {
                'name': b'theo',
                'email': b'theo@uuid',
                'fullname': b'theo <theo@uuid>'
            },
            'synthetic': True,
            'metadata': {
                'extra_headers': [
                    ['svn_repo_uuid', 'uuid'],
                    ['svn_revision', 10],
                ]
            },
            'parents': ['123'],
        })

    @istest
    def build_swh_revision_no_extra_headers(self):
        # headers explicitly disabled: metadata must be None
        actual_swh_revision = converters.build_swh_revision(
            repo_uuid='uuid',
            dir_id='dir-id',
            commit={'author_name': b'theo',
                    'message': b'commit message',
                    'author_date': 1095446497.574042},
            rev=10,
            parents=['123'],
            with_revision_headers=False)

        self.assertEqual(actual_swh_revision, {
            'date': {'timestamp': 1095446497, 'offset': 0},
            'committer_date': {'timestamp': 1095446497,
                               'offset': 0},
            'type': 'svn',
            'directory': 'dir-id',
            'message': b'commit message',
            'author': {
                'name': b'theo',
                'email': b'theo@uuid',
                'fullname': b'theo <theo@uuid>'
            },
            'committer': {
                'name': b'theo',
                'email': b'theo@uuid',
                'fullname': b'theo <theo@uuid>'
            },
            'synthetic': True,
            'metadata': None,
            'parents': ['123'],
        })

    @istest
    def build_swh_revision_empty_data(self):
        # empty author and message: the person's values are all None
        actual_swh_revision = converters.build_swh_revision(
            repo_uuid='uuid',
            dir_id='dir-id',
            commit={'author_name': b'',
                    'message': b'',
                    'author_date': 1095446497.574042},
            rev=8,
            parents=[])

        self.assertEqual(actual_swh_revision, {
            'date': {'timestamp': 1095446497, 'offset': 0},
            'committer_date': {'timestamp': 1095446497,
                               'offset': 0},
            'type': 'svn',
            'directory': 'dir-id',
            'message': b'',
            'author': {'name': None, 'email': None, 'fullname': None},
            'committer': {'name': None, 'email': None, 'fullname': None},
            'synthetic': True,
            'metadata': {
                'extra_headers': [
                    ['svn_repo_uuid', 'uuid'],
                    ['svn_revision', 8],
                ]
            },
            'parents': [],
        })

    @istest
    def build_swh_occurrence(self):
        actual_occ = converters.build_swh_occurrence('revision-id',
                                                     'origin-id',
                                                     'some-date')

        self.assertEqual(actual_occ, {
            'branch': 'master',
            'target': 'revision-id',
            'target_type': 'revision',
            'origin': 'origin-id',
            'date': 'some-date'})
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Thu, Sep 18, 5:02 PM (1 d, 17 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3262159
Attached To
rDLDSVN Subversion (SVN) loader
Event Timeline
Log In to Comment