Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 472668b..11c9c0e 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,341 +1,361 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert pygit2 objects to dictionaries suitable for swh.storage"""
from pygit2 import GIT_OBJ_COMMIT
from swh.core import hashutil
from .utils import format_date
HASH_ALGORITHMS = ['sha1', 'sha256']
def blob_to_content(id, repo, log=None, max_content_size=None, origin_id=None):
    """Format a blob as a content suitable for swh.storage.

    Returns a dict with status 'visible' (data + hashes included) or, when
    the blob exceeds max_content_size, status 'absent' with the skip reason
    and the originating origin_id.
    """
    blob = repo[id]
    blob_size = blob.size

    content = {
        'sha1_git': id.raw,
        'length': blob_size,
        'status': 'absent',
    }

    too_large = max_content_size and blob_size > max_content_size
    if too_large:
        if log:
            log.info('Skipping content %s, too large (%s > %s)' %
                     (id.hex, blob_size, max_content_size), extra={
                         'swh_type': 'loader_git_content_skip',
                         'swh_repo': repo.path,
                         'swh_id': id.hex,
                         'swh_size': blob_size,
                     })
        content['reason'] = 'Content too large'
        content['origin'] = origin_id
        return content

    raw = blob.data
    content.update(hashutil.hashdata(raw, HASH_ALGORITHMS))
    content['data'] = raw
    content['status'] = 'visible'
    return content
def tree_to_directory(id, repo, log=None):
    """Format a tree as a directory suitable for swh.storage."""
    # Map pygit2 tree entry types to swh directory entry types.
    type_map = {
        'tree': 'dir',
        'blob': 'file',
        'commit': 'rev',
    }
    return {
        'id': id.raw,
        'entries': [{
            'type': type_map[entry.type],
            'perms': entry.filemode,
            # entry._name carries the raw (bytes) file name as stored in git
            'name': entry._name,
            'target': entry.id.raw,
        } for entry in repo[id]],
    }
def commit_to_revision(id, repo, log=None):
    """Format a commit as a revision suitable for swh.storage."""
    commit = repo[id]

    def _person(signature):
        # Keep raw bytes for name/email; decoding is left to consumers.
        return {
            'name': signature.raw_name,
            'email': signature.raw_email,
        }

    return {
        'id': id.raw,
        'date': format_date(commit.author),
        'committer_date': format_date(commit.committer),
        'type': 'git',
        'directory': commit.tree_id.raw,
        'message': commit.raw_message,
        'metadata': None,
        'author': _person(commit.author),
        'committer': _person(commit.committer),
        'synthetic': False,
        'parents': [parent.raw for parent in commit.parent_ids],
    }
def annotated_tag_to_release(id, repo, log=None):
    """Format an annotated tag as a release suitable for swh.storage.

    Returns None (and logs a warning) for tags that do not point directly
    at a commit.
    """
    tag = repo[id]
    pointee = repo[tag.target]

    # Only tags pointing at commits map to releases.
    if pointee.type != GIT_OBJ_COMMIT:
        if log:
            log.warn("Ignoring tag %s pointing at %s %s" % (
                tag.id.hex, pointee.__class__.__name__,
                pointee.id.hex), extra={
                    'swh_type': 'loader_git_tag_ignore',
                    'swh_repo': repo.path,
                    'swh_tag_id': tag.id.hex,
                    'swh_tag_dest': {
                        'type': pointee.__class__.__name__,
                        'id': pointee.id.hex,
                    },
                })
        return

    author = date = None
    if tag.tagger:
        author = {
            'name': tag.tagger.raw_name,
            'email': tag.tagger.raw_email,
        }
        date = format_date(tag.tagger)
    elif log:
        # Tags may legitimately lack a tagger; fall back to None values.
        log.warn("Tag %s has no author, using default values"
                 % id.hex, extra={
                     'swh_type': 'loader_git_tag_author_default',
                     'swh_repo': repo.path,
                     'swh_tag_id': tag.id.hex,
                 })

    return {
        'id': id.raw,
        'date': date,
        'target': tag.target.raw,
        'target_type': 'revision',
        'message': tag._message,
        'name': tag.name.raw,
        'author': author,
        'metadata': None,
        'synthetic': False,
    }
def ref_to_occurrence(ref):
    """Format a reference as an occurrence.

    Returns a copy of ``ref`` whose 'branch' value, if present and a str,
    is encoded to UTF-8 bytes.
    """
    occurrence = dict(ref)
    branch = occurrence.get('branch')
    if isinstance(branch, str):
        occurrence['branch'] = branch.encode('utf-8')
    return occurrence
def origin_url_to_origin(origin_url):
    """Format a pygit2.Repository as an origin suitable for swh.storage"""
    return dict(type='git', url=origin_url)
def dulwich_blob_to_content(blob, log=None, max_content_size=None,
                            origin_id=None):
    """Convert a dulwich blob to a Software Heritage content.

    Args:
        blob: a dulwich Blob object; any other object type returns None.
        log: optional logger used when a content is skipped.
        max_content_size: when set, blobs larger than this are returned with
            status 'absent' instead of being hashed and loaded.
        origin_id: origin identifier recorded on skipped contents.

    Returns:
        A dict suitable for swh.storage with status 'visible' (raw data and
        hashes included) or 'absent' (too large), or None for non-blobs.
    """
    if blob.type_name != b'blob':
        return

    size = blob.raw_length()

    ret = {
        'sha1_git': blob.sha().digest(),
        'length': size,
        'status': 'absent'
    }

    if max_content_size and size > max_content_size:
        if log:
            # Fixed: log the blob's own id. The previous code referenced the
            # `id` builtin (`id.hex` -> AttributeError) and called .encode()
            # on blob.id, which is already bytes in dulwich.
            log.info('Skipping content %s, too large (%s > %s)' %
                     (blob.id, size, max_content_size), extra={
                         'swh_type': 'loader_git_content_skip',
                         'swh_id': blob.id,
                         'swh_size': size,
                     })
        ret['reason'] = 'Content too large'
        ret['origin'] = origin_id
        return ret

    data = blob.as_raw_string()
    ret.update(hashutil.hashdata(data, HASH_ALGORITHMS))
    ret['data'] = data
    ret['status'] = 'visible'
    return ret
def dulwich_tree_to_directory(tree, log=None):
    """Format a dulwich tree as a directory suitable for swh.storage.

    Returns None for objects that are not trees.
    """
    if tree.type_name != b'tree':
        return

    # Git modes mapped to swh entry types; every other mode (regular file
    # 0o100644, executable 0o100755, symlink 0o120000, unknown) is 'file'.
    mode_to_type = {
        0o040000: 'dir',
        0o160000: 'rev',
    }

    return {
        'id': tree.sha().digest(),
        'entries': [{
            'type': mode_to_type.get(item.mode, 'file'),
            'perms': item.mode,
            'name': item.path,
            'target': hashutil.hex_to_hash(item.sha.decode('ascii')),
        } for item in tree.iteritems()],
    }
def parse_author(name_email):
"""Parse an author line"""
- if not name_email:
+ if name_email is None:
return None
- name, email = name_email.split(b' <', 1)
- email = email[:-1]
+ try:
+ open_bracket = name_email.index(b'<')
+ except ValueError:
+ name = email = None
+ else:
+ raw_name = name_email[:open_bracket]
+ raw_email = name_email[open_bracket+1:]
+
+ if not raw_name:
+ name = None
+ elif raw_name.endswith(b' '):
+ name = raw_name[:-1]
+ else:
+ name = raw_name
+
+ try:
+ close_bracket = raw_email.index(b'>')
+ except ValueError:
+ email = None
+ else:
+ email = raw_email[:close_bracket]
return {
'name': name,
'email': email,
+ 'fullname': name_email,
}
def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc):
    """Convert the dulwich timestamp information to a structure compatible
    with Software Heritage.

    ``timezone`` is an offset in seconds east of UTC; the stored offset is
    in minutes. ``negative_utc`` is only meaningful for a zero offset
    (i.e. the '-0000' spelling) and is None otherwise.
    """
    if timezone == 0:
        negative_utc = timezone_neg_utc
    else:
        negative_utc = None
    return {
        'timestamp': timestamp,
        'offset': timezone // 60,
        'negative_utc': negative_utc,
    }
def dulwich_commit_to_revision(commit, log=None):
    """Format a dulwich commit as a revision suitable for swh.storage.

    Returns None for objects that are not commits. Extra git headers
    (mergetags, arbitrary extra headers, gpg signatures) are preserved
    under metadata['extra_git_headers'].
    """
    if commit.type_name != b'commit':
        return

    # Collect non-standard git headers so nothing is lost on round-trip.
    git_metadata = []
    if commit.mergetag:
        for mergetag in commit.mergetag:
            git_metadata.append(['mergetag', mergetag.as_raw_string()])
    if commit.extra:
        for key, value in commit.extra:
            git_metadata.append([key, value])
    if commit.gpgsig:
        git_metadata.append(['gpgsig', commit.gpgsig])

    metadata = {'extra_git_headers': git_metadata} if git_metadata else None

    return {
        'id': commit.sha().digest(),
        'author': parse_author(commit.author),
        'date': dulwich_tsinfo_to_timestamp(
            commit.author_time,
            commit.author_timezone,
            commit._author_timezone_neg_utc,
        ),
        'committer': parse_author(commit.committer),
        'committer_date': dulwich_tsinfo_to_timestamp(
            commit.commit_time,
            commit.commit_timezone,
            commit._commit_timezone_neg_utc,
        ),
        'type': 'git',
        'directory': bytes.fromhex(commit.tree.decode()),
        'message': commit.message,
        'metadata': metadata,
        'synthetic': False,
        'parents': [bytes.fromhex(parent.decode())
                    for parent in commit.parents],
    }
# Map dulwich object type names (bytes, as in ShaFile.type_name) to
# Software Heritage object type names.
DULWICH_TYPES = {
    b'blob': 'content',
    b'tree': 'directory',
    b'commit': 'revision',
    b'tag': 'release',
}
def dulwich_tag_to_release(tag, log=None):
    """Format a dulwich tag as a release suitable for swh.storage.

    Returns None for objects that are not tags. Tags without a tagger get
    author=None and date=None.
    """
    if tag.type_name != b'tag':
        return

    target_type, target = tag.object

    if tag.tagger:
        author = parse_author(tag.tagger)
        date = dulwich_tsinfo_to_timestamp(
            tag.tag_time,
            tag.tag_timezone,
            tag._tag_timezone_neg_utc,
        )
    else:
        author = date = None

    return {
        'id': tag.sha().digest(),
        'name': tag.name,
        'target': bytes.fromhex(target.decode()),
        'target_type': DULWICH_TYPES[target_type.type_name],
        'message': tag._message,
        'metadata': None,
        'synthetic': False,
        'author': author,
        'date': date,
    }
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index ffb0cc1..c634333 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -1,159 +1,199 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import shutil
import subprocess
import tempfile
import unittest
import datetime
from nose.tools import istest
import pygit2
import swh.loader.git.converters as converters
from swh.core.hashutil import hex_to_hash
class TestConverters(unittest.TestCase):
    """Integration tests for the pygit2-based converters, run against a
    small git repository rebuilt from a fast-export fixture."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Rebuild the example repository in a throw-away bare repo.
        cls.repo_path = tempfile.mkdtemp()
        cls.repo = pygit2.init_repository(cls.repo_path, bare=True)
        # Fixture shipped in the sibling swh-storage-testdata checkout.
        fast_export = os.path.join(os.path.dirname(__file__),
                                   '../../../../..',
                                   'swh-storage-testdata',
                                   'git-repos',
                                   'example-submodule.fast-export.xz')
        # Pipeline: xzcat <fixture> | git fast-import, replaying the export
        # into the bare repository.
        xz = subprocess.Popen(
            ['xzcat'],
            stdin=open(fast_export, 'rb'),
            stdout=subprocess.PIPE,
        )
        git = subprocess.Popen(
            ['git', 'fast-import', '--quiet'],
            stdin=xz.stdout,
            cwd=cls.repo_path,
        )
        # flush stdout of xz
        xz.stdout.close()
        git.communicate()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        shutil.rmtree(cls.repo_path)
        # NOTE(review): debug leftover — prints a path that was just
        # removed; consider deleting this line.
        print(cls.repo_path)

    def setUp(self):
        super().setUp()
        # Known blob from the fixture repository and its expected
        # conversion result when fully visible...
        self.blob_id = pygit2.Oid(
            hex='28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0')
        self.blob = {
            'sha1_git': self.blob_id.raw,
            'sha1': hex_to_hash('4850a3420a2262ff061cb296fb915430fa92301c'),
            'sha256': hex_to_hash('fee7c8a485a10321ad94b64135073cb5'
                                  '5f22cb9f57fa2417d2adfb09d310adef'),
            'data': (b'[submodule "example-dependency"]\n'
                     b'\tpath = example-dependency\n'
                     b'\turl = https://github.com/githubtraining/'
                     b'example-dependency.git\n'),
            'length': 124,
            'status': 'visible',
        }
        # ...and when hidden because it exceeds max_content_size.
        self.blob_hidden = {
            'sha1_git': self.blob_id.raw,
            'length': 124,
            'status': 'absent',
            'reason': 'Content too large',
            'origin': None,
        }

    @istest
    def blob_to_content(self):
        content = converters.blob_to_content(self.blob_id, self.repo)
        self.assertEqual(self.blob, content)

    @istest
    def blob_to_content_absent(self):
        # A limit one byte below the blob's length must hide the content.
        max_length = self.blob['length'] - 1
        content = converters.blob_to_content(self.blob_id, self.repo,
                                             max_content_size=max_length)
        self.assertEqual(self.blob_hidden, content)

    @istest
    def commit_to_revision(self):
        sha1 = '9768d0b576dbaaecd80abedad6dfd0d72f1476da'
        commit = self.repo.revparse_single(sha1)

        # when
        actual_revision = converters.commit_to_revision(commit.id, self.repo)

        # Fixture commit was authored at UTC+2.
        offset = datetime.timedelta(minutes=120)
        tzoffset = datetime.timezone(offset)
        expected_revision = {
            'id': hex_to_hash('9768d0b576dbaaecd80abedad6dfd0d72f1476da'),
            'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca',
            'type': 'git',
            'committer': {
                'name': b'Stefano Zacchiroli',
                'email': b'zack@upsilon.cc',
            },
            'author': {
                'name': b'Stefano Zacchiroli',
                'email': b'zack@upsilon.cc',
            },
            'committer_date': datetime.datetime(2015, 9, 24, 10, 36, 5,
                                                tzinfo=tzoffset),
            'message': b'add submodule dependency\n',
            'metadata': None,
            'date': datetime.datetime(2015, 9, 24, 10, 36, 5,
                                      tzinfo=tzoffset),
            'parents': [
                b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r'
            ],
            'synthetic': False,
        }

        # then
        self.assertEquals(actual_revision, expected_revision)
        self.assertEquals(offset, expected_revision['date'].utcoffset())
        self.assertEquals(offset,
                          expected_revision['committer_date'].utcoffset())

    @istest
    def ref_to_occurrence_1(self):
        # str branch names must be encoded to UTF-8 bytes
        # when
        actual_occ = converters.ref_to_occurrence({
            'id': 'some-id',
            'branch': 'some/branch'
        })
        # then
        self.assertEquals(actual_occ, {
            'id': 'some-id',
            'branch': b'some/branch'
        })

    @istest
    def ref_to_occurrence_2(self):
        # bytes branch names pass through unchanged
        # when
        actual_occ = converters.ref_to_occurrence({
            'id': 'some-id',
            'branch': b'some/branch'
        })
        # then
        self.assertEquals(actual_occ, {
            'id': 'some-id',
            'branch': b'some/branch'
        })

    @istest
    def author_line_to_author(self):
        # Raw author lines mapped to their expected parse_author output,
        # covering well-formed, bracket-less, unterminated and empty cases.
        tests = {
            b'a <b@c.com>': {
                'name': b'a',
                'email': b'b@c.com',
                'fullname': b'a <b@c.com>',
            },
            b'<foo@bar.com>': {
                'name': None,
                'email': b'foo@bar.com',
                'fullname': b'<foo@bar.com>',
            },
            b'malformed <email': {
                'name': b'malformed',
                'email': None,
                'fullname': b'malformed <email'
            },
            b'trailing <sp@c.e> ': {
                'name': b'trailing',
                'email': b'sp@c.e',
                'fullname': b'trailing <sp@c.e> ',
            },
            b'no<sp@c.e>': {
                'name': b'no',
                'email': b'sp@c.e',
                'fullname': b'no<sp@c.e>',
            },
            b' <>': {
                'name': b'',
                'email': b'',
                'fullname': b' <>',
            },
        }

        for author in sorted(tests):
            parsed_author = tests[author]
            self.assertEquals(parsed_author,
                              converters.parse_author(author))

File Metadata

Mime Type
text/x-diff
Expires
Tue, Aug 19, 12:59 AM (3 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3365733

Event Timeline