diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 6a4f08b..cbd1ed2 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,162 +1,173 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 """Convert pygit2 objects to dictionaries suitable for swh.storage"""
 
+import sys
+
 from pygit2 import GIT_OBJ_COMMIT
 
 from swh.core import hashutil
 
 from .utils import format_date
 
 HASH_ALGORITHMS = ['sha1', 'sha256']
 
 
+# Encoding used to convert str names (tag and branch names) to bytes
+sys_encoding = sys.getfilesystemencoding()
+
+
 def blob_to_content(id, repo, log=None, max_content_size=None, origin_id=None):
     """Format a blob as a content"""
     blob = repo[id]
     size = blob.size
     ret = {
         'sha1_git': id.raw,
         'length': blob.size,
         'status': 'absent'
     }
 
     if max_content_size:
         if size > max_content_size:
             if log:
                 log.info('Skipping content %s, too large (%s > %s)' %
                          (id.hex, size, max_content_size), extra={
                              'swh_type': 'loader_git_content_skip',
                              'swh_repo': repo.path,
                              'swh_id': id.hex,
                              'swh_size': size,
                          })
             ret['reason'] = 'Content too large'
             ret['origin'] = origin_id
             return ret
 
     data = blob.data
     hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
     ret.update(hashes)
     ret['data'] = data
     ret['status'] = 'visible'
 
     return ret
 
 
 def tree_to_directory(id, repo, log=None):
     """Format a tree as a directory"""
     ret = {
         'id': id.raw,
     }
     entries = []
     ret['entries'] = entries
 
     entry_type_map = {
         'tree': 'dir',
         'blob': 'file',
         'commit': 'rev',
     }
 
     for entry in repo[id]:
         entries.append({
             'type': entry_type_map[entry.type],
             'perms': entry.filemode,
             'name': entry._name,
             'target': entry.id.raw,
         })
 
     return ret
 
 
 def commit_to_revision(id, repo, log=None):
     """Format a commit as a revision"""
     commit = repo[id]
 
     author = commit.author
     committer = commit.committer
     return {
         'id': id.raw,
         'date': format_date(author),
         'committer_date': format_date(committer),
         'type': 'git',
         'directory': commit.tree_id.raw,
         'message': commit.raw_message,
         'metadata': None,
         'author': {
             'name': author.raw_name,
             'email': author.raw_email,
         },
         'committer': {
             'name': committer.raw_name,
             'email': committer.raw_email,
         },
         'synthetic': False,
         'parents': [p.raw for p in commit.parent_ids],
     }
 
 
 def annotated_tag_to_release(id, repo, log=None):
     """Format an annotated tag as a release"""
     tag = repo[id]
     tag_pointer = repo[tag.target]
 
     if tag_pointer.type != GIT_OBJ_COMMIT:
         if log:
             log.warn("Ignoring tag %s pointing at %s %s" % (
                 tag.id.hex, tag_pointer.__class__.__name__,
                 tag_pointer.id.hex), extra={
                     'swh_type': 'loader_git_tag_ignore',
                     'swh_repo': repo.path,
                     'swh_tag_id': tag.id.hex,
                     'swh_tag_dest': {
                         'type': tag_pointer.__class__.__name__,
                         'id': tag_pointer.id.hex,
                     },
                 })
         return
 
     if not tag.tagger:
         if log:
             log.warn("Tag %s has no author, using default values"
                      % id.hex, extra={
                          'swh_type': 'loader_git_tag_author_default',
                          'swh_repo': repo.path,
                          'swh_tag_id': tag.id.hex,
                      })
         author = None
         date = None
     else:
         author = {
             'name': tag.tagger.raw_name,
             'email': tag.tagger.raw_email,
         }
         date = format_date(tag.tagger)
 
     return {
         'id': id.raw,
         'date': date,
         'target': tag.target.raw,
         'target_type': 'revision',
         'message': tag._message,
-        'name': tag.name,
+        'name': tag.name.encode(sys_encoding),
         'author': author,
         'metadata': None,
         'synthetic': False,
     }
 
 
 def ref_to_occurrence(ref):
     """Format a reference as an occurrence"""
-    return ref
+    occ = ref.copy()
+    branch = occ.get('branch')
+    # Only str branch names are encoded; bytes (or a missing key) stay as is
+    if isinstance(branch, str):
+        occ['branch'] = branch.encode(sys_encoding)
+    return occ
 
 
 def origin_url_to_origin(origin_url):
     """Format a pygit2.Repository as an origin suitable for swh.storage"""
     return {
         'type': 'git',
         'url': origin_url,
     }
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 852f959..3fe8c2b 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -1,132 +1,159 @@
 # Copyright (C) 2015  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import shutil
 import subprocess
 import tempfile
 import unittest
 import datetime
 
 from nose.tools import istest
 import pygit2
 
 import swh.loader.git.converters as converters
 from swh.core.hashutil import hex_to_hash
 
 
 class TestConverters(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
         cls.repo_path = tempfile.mkdtemp()
         cls.repo = pygit2.init_repository(cls.repo_path, bare=True)
 
         fast_export = os.path.join(os.path.dirname(__file__),
                                    '../../../../..',
                                    'swh-storage-testdata',
                                    'git-repos',
                                    'example-submodule.fast-export.xz')
 
         xz = subprocess.Popen(
             ['xzcat'],
             stdin=open(fast_export, 'rb'),
             stdout=subprocess.PIPE,
         )
 
         git = subprocess.Popen(
             ['git', 'fast-import', '--quiet'],
             stdin=xz.stdout,
             cwd=cls.repo_path,
         )
 
         # flush stdout of xz
         xz.stdout.close()
         git.communicate()
 
     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
         shutil.rmtree(cls.repo_path)
         print(cls.repo_path)
 
     def setUp(self):
         super().setUp()
 
         self.blob_id = pygit2.Oid(
             hex='28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0')
         self.blob = {
             'sha1_git': self.blob_id.raw,
             'sha1': hex_to_hash('4850a3420a2262ff061cb296fb915430fa92301c'),
             'sha256': hex_to_hash('fee7c8a485a10321ad94b64135073cb5'
                                   '5f22cb9f57fa2417d2adfb09d310adef'),
             'data': (b'[submodule "example-dependency"]\n'
                      b'\tpath = example-dependency\n'
                      b'\turl = https://github.com/githubtraining/'
                      b'example-dependency.git\n'),
             'length': 124,
             'status': 'visible',
         }
 
         self.blob_hidden = {
             'sha1_git': self.blob_id.raw,
             'length': 124,
             'status': 'absent',
             'reason': 'Content too large',
             'origin': None,
         }
 
     @istest
     def blob_to_content(self):
         content = converters.blob_to_content(self.blob_id, self.repo)
         self.assertEqual(self.blob, content)
 
     @istest
     def blob_to_content_absent(self):
         max_length = self.blob['length'] - 1
         content = converters.blob_to_content(self.blob_id, self.repo,
                                              max_content_size=max_length)
         self.assertEqual(self.blob_hidden, content)
 
     @istest
     def commit_to_revision(self):
         sha1 = '9768d0b576dbaaecd80abedad6dfd0d72f1476da'
         commit = self.repo.revparse_single(sha1)
 
         # when
         actual_revision = converters.commit_to_revision(commit.id, self.repo)
 
         offset = datetime.timedelta(minutes=120)
         tzoffset = datetime.timezone(offset)
 
         expected_revision = {
             'id': hex_to_hash('9768d0b576dbaaecd80abedad6dfd0d72f1476da'),
             'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca',
             'type': 'git',
             'committer': {
                 'name': b'Stefano Zacchiroli',
                 'email': b'zack@upsilon.cc',
             },
             'author': {
                 'name': b'Stefano Zacchiroli',
                 'email': b'zack@upsilon.cc',
             },
             'committer_date': datetime.datetime(2015, 9, 24, 10, 36, 5,
                                                 tzinfo=tzoffset),
             'message': b'add submodule dependency\n',
             'metadata': None,
             'date': datetime.datetime(2015, 9, 24, 10, 36, 5,
                                       tzinfo=tzoffset),
             'parents': [
                 b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r'
             ],
             'synthetic': False,
         }
 
         # then
         self.assertEquals(actual_revision, expected_revision)
         self.assertEquals(offset, expected_revision['date'].utcoffset())
         self.assertEquals(offset,
                           expected_revision['committer_date'].utcoffset())
+
+    @istest
+    def ref_to_occurrence_1(self):
+        # when
+        actual_occ = converters.ref_to_occurrence({
+            'id': 'some-id',
+            'branch': 'some/branch'
+        })
+        # then
+        self.assertEqual(actual_occ, {
+            'id': 'some-id',
+            'branch': b'some/branch'
+        })
+
+    @istest
+    def ref_to_occurrence_2(self):
+        # when
+        actual_occ = converters.ref_to_occurrence({
+            'id': 'some-id',
+            'branch': b'some/branch'
+        })
+
+        # then
+        self.assertEqual(actual_occ, {
+            'id': 'some-id',
+            'branch': b'some/branch'
+        })