diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index fc28332..b912a3c 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,231 +1,242 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 """Convert dulwich objects to dictionaries suitable for swh.storage"""
 
 from swh.model import hashutil
 
 
 HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
 
 
 def origin_url_to_origin(origin_url):
     """Format a pygit2.Repository as an origin suitable for swh.storage"""
     return {
         'type': 'git',
         'url': origin_url,
     }
 
 
 def dulwich_blob_to_content_id(blob):
     """Convert a dulwich blob to a Software Heritage content id"""
 
     if blob.type_name != b'blob':
         return
 
     size = blob.raw_length()
 
     ret = {
         'sha1_git': blob.sha().digest(),
         'length': size,
     }
 
     data = blob.as_raw_string()
     ret.update(hashutil.hash_data(data, HASH_ALGORITHMS))
 
     return ret
 
 
 def dulwich_blob_to_content(blob, log=None, max_content_size=None,
                             origin_id=None):
     """Convert a dulwich blob to a Software Heritage content"""
 
     if blob.type_name != b'blob':
         return
 
     ret = dulwich_blob_to_content_id(blob)
 
     size = ret['length']
 
     if max_content_size:
         if size > max_content_size:
             id = hashutil.hash_to_hex(ret['sha1_git'])
             if log:
                 log.info('Skipping content %s, too large (%s > %s)' %
                          (id, size, max_content_size), extra={
                              'swh_type': 'loader_git_content_skip',
                              'swh_id': id,
                              'swh_size': size,
                          })
             ret['status'] = 'absent'
             ret['reason'] = 'Content too large'
             ret['origin'] = origin_id
             return ret
 
     data = blob.as_raw_string()
     ret['data'] = data
     ret['status'] = 'visible'
 
     return ret
 
 
 def dulwich_tree_to_directory(tree, log=None):
     """Format a tree as a directory"""
     if tree.type_name != b'tree':
         return
 
     ret = {
         'id': tree.sha().digest(),
     }
     entries = []
     ret['entries'] = entries
 
     entry_mode_map = {
         0o040000: 'dir',
         0o160000: 'rev',
         0o100644: 'file',
         0o100755: 'file',
         0o120000: 'file',
     }
 
     for entry in tree.iteritems():
         entries.append({
             'type': entry_mode_map.get(entry.mode, 'file'),
             'perms': entry.mode,
             'name': entry.path,
             'target': hashutil.hash_to_bytes(entry.sha.decode('ascii')),
         })
 
     return ret
 
 
 def parse_author(name_email):
     """Parse an author line"""
 
     if name_email is None:
         return None
 
     try:
         open_bracket = name_email.index(b'<')
     except ValueError:
         name = email = None
     else:
         raw_name = name_email[:open_bracket]
         raw_email = name_email[open_bracket+1:]
 
         if not raw_name:
             name = None
         elif raw_name.endswith(b' '):
             name = raw_name[:-1]
         else:
             name = raw_name
 
         try:
             close_bracket = raw_email.index(b'>')
         except ValueError:
             email = None
         else:
             email = raw_email[:close_bracket]
 
     return {
         'name': name,
         'email': email,
         'fullname': name_email,
     }
 
 
 def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc):
     """Convert the dulwich timestamp information to a structure compatible with
     Software Heritage"""
     return {
         'timestamp': timestamp,
         'offset': timezone // 60,
         'negative_utc': timezone_neg_utc if timezone == 0 else None,
     }
 
 
 def dulwich_commit_to_revision(commit, log=None):
     if commit.type_name != b'commit':
         return
 
     ret = {
         'id': commit.sha().digest(),
         'author': parse_author(commit.author),
         'date': dulwich_tsinfo_to_timestamp(
             commit.author_time,
             commit.author_timezone,
             commit._author_timezone_neg_utc,
         ),
         'committer': parse_author(commit.committer),
         'committer_date': dulwich_tsinfo_to_timestamp(
             commit.commit_time,
             commit.commit_timezone,
             commit._commit_timezone_neg_utc,
         ),
         'type': 'git',
         'directory': bytes.fromhex(commit.tree.decode()),
         'message': commit.message,
         'metadata': None,
         'synthetic': False,
         'parents': [bytes.fromhex(p.decode()) for p in commit.parents],
     }
 
     git_metadata = []
     if commit.encoding is not None:
         git_metadata.append(['encoding', commit.encoding])
 
     if commit.mergetag:
         for mergetag in commit.mergetag:
             raw_string = mergetag.as_raw_string()
             assert raw_string.endswith(b'\n')
             git_metadata.append(['mergetag', raw_string[:-1]])
 
     if commit.extra:
         git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra)
 
     if commit.gpgsig:
         git_metadata.append(['gpgsig', commit.gpgsig])
 
     if git_metadata:
         ret['metadata'] = {
             'extra_headers': git_metadata,
         }
 
     return ret
 
 
 DULWICH_TYPES = {
     b'blob': 'content',
     b'tree': 'directory',
     b'commit': 'revision',
     b'tag': 'release',
 }
 
 
 def dulwich_tag_to_release(tag, log=None):
+    """Convert a dulwich tag to a Software Heritage release
+
+    Returns None when `tag` is not a tag object. Author and date are both
+    None when the tag has no tagger; the date alone is None when the tag
+    has a tagger but its `tag_time` is None.
+    """
     if tag.type_name != b'tag':
         return
 
     target_type, target = tag.object
     ret = {
         'id': tag.sha().digest(),
         'name': tag.name,
         'target': bytes.fromhex(target.decode()),
         'target_type': DULWICH_TYPES[target_type.type_name],
         'message': tag._message,
         'metadata': None,
         'synthetic': False,
     }
     if tag.tagger:
         ret['author'] = parse_author(tag.tagger)
-        ret['date'] = dulwich_tsinfo_to_timestamp(
-            tag.tag_time,
-            tag.tag_timezone,
-            tag._tag_timezone_neg_utc,
-        )
+        # A tagger line may come without a date (T815). Compare against None
+        # so that a legitimate epoch timestamp (0) is not mistaken for one.
+        if tag.tag_time is None:
+            ret['date'] = None
+        else:
+            ret['date'] = dulwich_tsinfo_to_timestamp(
+                tag.tag_time,
+                tag.tag_timezone,
+                tag._tag_timezone_neg_utc,
+            )
     else:
         ret['author'] = ret['date'] = None
 
     return ret
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 622c10f..bab4aae 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -1,178 +1,365 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
+import hashlib
 import os
 import shutil
 import subprocess
 import tempfile
 import unittest
 
 from nose.tools import istest
 from nose.plugins.attrib import attr
 
 import dulwich.repo
 
 import swh.loader.git.converters as converters
 from swh.model.hashutil import bytehex_to_hash, hash_to_bytes
 
 
 @attr('fs')
 class TestConverters(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
         cls.repo_path = tempfile.mkdtemp()
         cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path)
 
         fast_export = os.path.join(os.path.dirname(__file__),
                                    '../../../../..',
                                    'swh-storage-testdata',
                                    'git-repos',
                                    'example-submodule.fast-export.xz')
 
         xz = subprocess.Popen(
             ['xzcat'],
             stdin=open(fast_export, 'rb'),
             stdout=subprocess.PIPE,
         )
 
         git = subprocess.Popen(
             ['git', 'fast-import', '--quiet'],
             stdin=xz.stdout,
             cwd=cls.repo_path,
         )
 
         # flush stdout of xz
         xz.stdout.close()
         git.communicate()
 
     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
         shutil.rmtree(cls.repo_path)
 
     def setUp(self):
         super().setUp()
 
         self.blob_id = b'28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0'
         self.blob = {
             'sha1_git': bytehex_to_hash(self.blob_id),
             'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'),
             'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5'
                                     '5f22cb9f57fa2417d2adfb09d310adef'),
             'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574'
                                         '1fa05ce4cd5e3c7ea7c44d4c2d10b'),
             'data': (b'[submodule "example-dependency"]\n'
                      b'\tpath = example-dependency\n'
                      b'\turl = https://github.com/githubtraining/'
                      b'example-dependency.git\n'),
             'length': 124,
             'status': 'visible',
         }
 
         self.blob_hidden = {
             'sha1_git': bytehex_to_hash(self.blob_id),
             'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'),
             'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5'
                                     '5f22cb9f57fa2417d2adfb09d310adef'),
             'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574'
                                         '1fa05ce4cd5e3c7ea7c44d4c2d10b'),
             'length': 124,
             'status': 'absent',
             'reason': 'Content too large',
             'origin': None,
         }
 
     @istest
     def blob_to_content(self):
         content = converters.dulwich_blob_to_content(self.repo[self.blob_id])
         self.assertEqual(self.blob, content)
 
     @istest
     def blob_to_content_absent(self):
         max_length = self.blob['length'] - 1
         content = converters.dulwich_blob_to_content(
             self.repo[self.blob_id], max_content_size=max_length)
         self.assertEqual(self.blob_hidden, content)
 
     @istest
     def commit_to_revision(self):
         sha1 = b'9768d0b576dbaaecd80abedad6dfd0d72f1476da'
 
         revision = converters.dulwich_commit_to_revision(self.repo[sha1])
 
         expected_revision = {
             'id': hash_to_bytes('9768d0b576dbaaecd80abedad6dfd0d72f1476da'),
             'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca',
             'type': 'git',
             'committer': {
                 'name': b'Stefano Zacchiroli',
                 'fullname': b'Stefano Zacchiroli <zack@upsilon.cc>',
                 'email': b'zack@upsilon.cc',
             },
             'author': {
                 'name': b'Stefano Zacchiroli',
                 'fullname': b'Stefano Zacchiroli <zack@upsilon.cc>',
                 'email': b'zack@upsilon.cc',
             },
             'committer_date': {
                 'negative_utc': None,
                 'timestamp': 1443083765,
                 'offset': 120,
             },
             'message': b'add submodule dependency\n',
             'metadata': None,
             'date': {
                 'negative_utc': None,
                 'timestamp': 1443083765,
                 'offset': 120,
             },
             'parents': [
                 b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r'
             ],
             'synthetic': False,
         }
 
         self.assertEquals(revision, expected_revision)
 
     @istest
     def author_line_to_author(self):
         tests = {
             b'a <b@c.com>': {
                 'name': b'a',
                 'email': b'b@c.com',
                 'fullname': b'a <b@c.com>',
             },
             b'<foo@bar.com>': {
                 'name': None,
                 'email': b'foo@bar.com',
                 'fullname': b'<foo@bar.com>',
             },
             b'malformed <email': {
                 'name': b'malformed',
                 'email': None,
                 'fullname': b'malformed <email',
             },
             b'trailing <sp@c.e> ': {
                 'name': b'trailing',
                 'email': b'sp@c.e',
                 'fullname': b'trailing <sp@c.e> ',
             },
             b'no<sp@c.e>': {
                 'name': b'no',
                 'email': b'sp@c.e',
                 'fullname': b'no<sp@c.e>',
             },
             b' <>': {
                 'name': b'',
                 'email': b'',
                 'fullname': b' <>',
             },
         }
 
         for author in sorted(tests):
             parsed_author = tests[author]
             self.assertEquals(parsed_author,
                               converters.parse_author(author))
+
+    @istest
+    def dulwich_tag_to_release_no_author_no_date(self):
+        """A tag without a tagger yields neither author nor date"""
+        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
+        message = b'some release message'
+        tag = SWHTag(name='blah',
+                     type_name=b'tag',
+                     target=target,
+                     target_type=b'commit',
+                     message=message,
+                     tagger=None,
+                     tag_time=None, tag_timezone=None)
+
+        # when
+        actual_release = converters.dulwich_tag_to_release(tag)
+
+        # then
+        expected_release = {
+            'author': None,
+            'date': None,
+            'id': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
+            'message': message,
+            'metadata': None,
+            'name': 'blah',
+            'synthetic': False,
+            'target': hash_to_bytes(target.decode()),
+            'target_type': 'revision'
+        }
+
+        self.assertEqual(actual_release, expected_release)
+
+    @istest
+    def dulwich_tag_to_release_author_and_date(self):
+        """A tag with a tagger and a date yields both author and date"""
+        tagger = b'hey dude <hello@mail.org>'
+        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
+        message = b'some release message'
+
+        # Timezone-aware on purpose: a naive datetime would tie the expected
+        # timestamp to the local timezone of the machine running the test.
+        date = datetime.datetime(
+            2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp()
+
+        tag = SWHTag(name='blah',
+                     type_name=b'tag',
+                     target=target,
+                     target_type=b'commit',
+                     message=message,
+                     tagger=tagger,
+                     tag_time=date,
+                     tag_timezone=0)
+
+        # when
+        actual_release = converters.dulwich_tag_to_release(tag)
+
+        # then
+        expected_release = {
+            'author': {
+                'email': b'hello@mail.org',
+                'fullname': b'hey dude <hello@mail.org>',
+                'name': b'hey dude'
+            },
+            'date': {
+                'negative_utc': False,
+                'offset': 0,
+                'timestamp': 1196812800.0
+            },
+            'id': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
+            'message': message,
+            'metadata': None,
+            'name': 'blah',
+            'synthetic': False,
+            'target': hash_to_bytes(target.decode()),
+            'target_type': 'revision'
+        }
+
+        self.assertEqual(actual_release, expected_release)
+
+    @istest
+    def dulwich_tag_to_release_author_no_date(self):
+        """A tag with a tagger but no date yields a None date (bug T815)"""
+        tagger = b'hey dude <hello@mail.org>'
+        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
+        message = b'some release message'
+        tag = SWHTag(name='blah',
+                     type_name=b'tag',
+                     target=target,
+                     target_type=b'commit',
+                     message=message,
+                     tagger=tagger,
+                     tag_time=None, tag_timezone=None)
+
+        # when
+        actual_release = converters.dulwich_tag_to_release(tag)
+
+        # then
+        expected_release = {
+            'author': {
+                'email': b'hello@mail.org',
+                'fullname': b'hey dude <hello@mail.org>',
+                'name': b'hey dude'
+            },
+            'date': None,
+            'id': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
+            'message': message,
+            'metadata': None,
+            'name': 'blah',
+            'synthetic': False,
+            'target': hash_to_bytes(target.decode()),
+            'target_type': 'revision'
+        }
+
+        self.assertEqual(actual_release, expected_release)
+
+    @istest
+    def dulwich_tag_to_release_author_epoch_date(self):
+        """A tag dated at the epoch keeps its date instead of losing it"""
+        tagger = b'hey dude <hello@mail.org>'
+        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
+        message = b'some release message'
+        tag = SWHTag(name='blah',
+                     type_name=b'tag',
+                     target=target,
+                     target_type=b'commit',
+                     message=message,
+                     tagger=tagger,
+                     tag_time=0, tag_timezone=0)
+
+        # when
+        actual_release = converters.dulwich_tag_to_release(tag)
+
+        # then
+        expected_release = {
+            'author': {
+                'email': b'hello@mail.org',
+                'fullname': b'hey dude <hello@mail.org>',
+                'name': b'hey dude'
+            },
+            'date': {
+                'negative_utc': False,
+                'offset': 0,
+                'timestamp': 0
+            },
+            'id': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
+            'message': message,
+            'metadata': None,
+            'name': 'blah',
+            'synthetic': False,
+            'target': hash_to_bytes(target.decode()),
+            'target_type': 'revision'
+        }
+
+        self.assertEqual(actual_release, expected_release)
+
+
+class SWHTargetType:
+    """Dulwich lookalike TargetType class
+
+    Only exposes `type_name`, the one attribute the converter reads.
+    """
+    def __init__(self, type_name):
+        self.type_name = type_name
+
+
+class SWHTag:
+    """Dulwich lookalike tag class
+
+    Exposes the attributes and the `sha` method that
+    `converters.dulwich_tag_to_release` reads.
+    """
+    def __init__(self, name, type_name, target, target_type, tagger, tag_time,
+                 tag_timezone, message):
+        self.name = name
+        self.type_name = type_name
+        self.object = SWHTargetType(target_type), target
+        self.tagger = tagger
+        self._message = message
+        self.tag_time = tag_time
+        self.tag_timezone = tag_timezone
+        self._tag_timezone_neg_utc = False
+
+    def sha(self):
+        """Return an empty sha1, so every fake tag has the same known id"""
+        return hashlib.sha1()