# Copyright (C) 2015-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOTE(review): this is swh/loader/git/converters.py as it stands once the
# patch is applied.  The same patch raises the requirement in
# requirements-swh.txt from "swh.model >= 0.0.27" to "swh.model >= 0.0.60",
# presumably because parse_author() now relies on Person.from_fullname --
# confirm before relaxing that pin.

"""Convert dulwich objects to swh.model objects suitable for swh.storage"""

from typing import Any, Dict, Optional

from swh.model.hashutil import (
    DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash
)
from swh.model.model import (
    BaseContent, Content, Directory, DirectoryEntry,
    ObjectType, Person, Release, Revision, RevisionType,
    SkippedContent, TargetType, Timestamp, TimestampWithTimezone,
)


# sha1_git is taken from dulwich directly (see dulwich_blob_to_content_id),
# so it is excluded from the set of hashes computed here.
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {'sha1_git'}

# Tree entry mode -> directory entry type.  Modes missing from this table
# are treated as 'file' by dulwich_tree_to_directory.
_ENTRY_MODE_MAP = {
    0o040000: 'dir',
    0o160000: 'rev',
    0o100644: 'file',
    0o100755: 'file',
    0o120000: 'file',
}


def dulwich_blob_to_content_id(blob) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id

    Args:
        blob: dulwich object whose ``type_name`` is ``b'blob'``

    Returns:
        A dict holding one digest per algorithm of ``HASH_ALGORITHMS``,
        plus the ``sha1_git`` reported by dulwich and the blob ``length``.

    Raises:
        ValueError: if ``blob`` is not a blob.

    """
    if blob.type_name != b'blob':
        raise ValueError('Argument is not a blob.')

    size = blob.raw_length()
    data = blob.as_raw_string()
    hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest()
    # Reuse the object id dulwich already has instead of hashing again.
    hashes['sha1_git'] = blob.sha().digest()
    hashes['length'] = size
    return hashes


def dulwich_blob_to_content(blob, max_content_size=None) -> BaseContent:
    """Convert a dulwich blob to a Software Heritage content

    Args:
        blob: dulwich object whose ``type_name`` is ``b'blob'``
        max_content_size: when not None, blobs whose length is greater than
            or equal to this value are returned without their data

    Returns:
        A ``SkippedContent`` (status ``absent``) for blobs at or above the
        size limit, a ``Content`` (status ``visible``) carrying the raw
        data otherwise.

    Raises:
        ValueError: if ``blob`` is not a blob.

    """
    if blob.type_name != b'blob':
        raise ValueError('Argument is not a blob.')

    hashes = dulwich_blob_to_content_id(blob)

    too_large = (
        max_content_size is not None
        and hashes['length'] >= max_content_size
    )
    if too_large:
        return SkippedContent(
            status='absent',
            reason='Content too large',
            **hashes,
        )

    return Content(
        data=blob.as_raw_string(),
        status='visible',
        **hashes,
    )


def dulwich_tree_to_directory(tree, log=None) -> Directory:
    """Format a tree as a directory

    Args:
        tree: dulwich object whose ``type_name`` is ``b'tree'``
        log: unused by this function

    Returns:
        A ``Directory`` with one ``DirectoryEntry`` per tree item.

    Raises:
        ValueError: if ``tree`` is not a tree.

    """
    if tree.type_name != b'tree':
        raise ValueError('Argument is not a tree.')

    entries = [
        DirectoryEntry(
            type=_ENTRY_MODE_MAP.get(entry.mode, 'file'),
            perms=entry.mode,
            name=entry.path,
            target=hash_to_bytes(entry.sha.decode('ascii')),
        )
        for entry in tree.iteritems()
    ]

    return Directory(
        id=tree.sha().digest(),
        entries=entries,
    )


def parse_author(name_email: bytes) -> Person:
    """Parse an author line

    Parsing is delegated entirely to ``Person.from_fullname``; whatever it
    raises on invalid input propagates to the caller.

    """
    return Person.from_fullname(name_email)


def dulwich_tsinfo_to_timestamp(
        timestamp, timezone, timezone_neg_utc) -> TimestampWithTimezone:
    """Convert the dulwich timestamp information to a structure compatible
    with Software Heritage

    Args:
        timestamp: value stored as the ``seconds`` of the result
        timezone: offset as reported by dulwich; it is divided by 60 to get
            the resulting ``offset`` (presumably seconds -> minutes --
            confirm against dulwich)
        timezone_neg_utc: dulwich's negative-UTC flag; only carried over
            when ``timezone`` is 0, otherwise ``negative_utc`` is None

    """
    return TimestampWithTimezone(
        timestamp=Timestamp(
            seconds=timestamp,
            microseconds=0,
        ),
        offset=timezone // 60,
        negative_utc=timezone_neg_utc if timezone == 0 else None,
    )


def dulwich_commit_to_revision(commit, log=None) -> Revision:
    """Convert a dulwich commit to a Software Heritage revision

    The commit encoding, merge tags, extra headers and GPG signature, when
    present, are collected (in that order) under the ``extra_headers`` key
    of the revision metadata; the metadata is None when there are none.

    Args:
        commit: dulwich object whose ``type_name`` is ``b'commit'``
        log: unused by this function

    Raises:
        ValueError: if ``commit`` is not a commit, or if one of its merge
            tags does not end with a newline.

    """
    if commit.type_name != b'commit':
        raise ValueError('Argument is not a commit.')

    git_metadata = []
    if commit.encoding is not None:
        git_metadata.append(['encoding', commit.encoding])
    if commit.mergetag:
        for mergetag in commit.mergetag:
            raw_string = mergetag.as_raw_string()
            # Explicit check rather than ``assert``: assertions vanish
            # under ``python -O`` and the slice below would then silently
            # drop a byte of payload.
            if not raw_string.endswith(b'\n'):
                raise ValueError('Mergetag does not end with a newline.')
            git_metadata.append(['mergetag', raw_string[:-1]])

    if commit.extra:
        git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra)

    if commit.gpgsig:
        git_metadata.append(['gpgsig', commit.gpgsig])

    metadata: Optional[Dict[str, Any]] = None
    if git_metadata:
        metadata = {
            'extra_headers': git_metadata,
        }

    return Revision(
        id=commit.sha().digest(),
        author=parse_author(commit.author),
        date=dulwich_tsinfo_to_timestamp(
            commit.author_time,
            commit.author_timezone,
            commit._author_timezone_neg_utc,
        ),
        committer=parse_author(commit.committer),
        committer_date=dulwich_tsinfo_to_timestamp(
            commit.commit_time,
            commit.commit_timezone,
            commit._commit_timezone_neg_utc,
        ),
        type=RevisionType.GIT,
        directory=bytes.fromhex(commit.tree.decode()),
        message=commit.message,
        metadata=metadata,
        synthetic=False,
        parents=[bytes.fromhex(p.decode()) for p in commit.parents],
    )


DULWICH_TARGET_TYPES = {
    b'blob': TargetType.CONTENT,
    b'tree': TargetType.DIRECTORY,
    b'commit': TargetType.REVISION,
    b'tag': TargetType.RELEASE,
}


DULWICH_OBJECT_TYPES = {
    b'blob': ObjectType.CONTENT,
    b'tree': ObjectType.DIRECTORY,
    b'commit': ObjectType.REVISION,
    b'tag': ObjectType.RELEASE,
}


def dulwich_tag_to_release(tag, log=None) -> Release:
    """Convert a dulwich tag to a Software Heritage release

    The release has no author and no date when the tag has no tagger, and
    no date when the tag has a tagger but a falsy ``tag_time``.

    Args:
        tag: dulwich object whose ``type_name`` is ``b'tag'``
        log: unused by this function

    Raises:
        ValueError: if ``tag`` is not a tag.
        KeyError: if the type of the tagged object is not a key of
            ``DULWICH_OBJECT_TYPES``.

    """
    if tag.type_name != b'tag':
        raise ValueError('Argument is not a tag.')

    target_type, target = tag.object

    author: Optional[Person] = None
    date = None
    if tag.tagger:
        author = parse_author(tag.tagger)
        if tag.tag_time:
            date = dulwich_tsinfo_to_timestamp(
                tag.tag_time,
                tag.tag_timezone,
                tag._tag_timezone_neg_utc,
            )

    return Release(
        id=tag.sha().digest(),
        author=author,
        date=date,
        name=tag.name,
        target=bytes.fromhex(target.decode()),
        target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
        message=tag._message,
        metadata=None,
        synthetic=False,
    )
# Copyright (C) 2015-2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOTE(review): this is swh/loader/git/tests/test_converters.py as it stands
# once the patch is applied.  In the reviewed copy of the patch every
# ``<email>`` part of the author strings had been stripped; they are
# restored below from the accompanying ``name``/``email`` fields.  The
# b'malformed <email' case of test_author_line_to_author was lost entirely
# and is reconstructed -- confirm it against the upstream test suite.

import datetime
import hashlib
import os
import shutil
import subprocess
import tempfile
import unittest

import dulwich.repo
import pytest

from swh.model.hashutil import bytehex_to_hash, hash_to_bytes
from swh.model.model import (
    Content, Person, Release, Revision, RevisionType, ObjectType,
    Timestamp, TimestampWithTimezone,
)

import swh.loader.git.converters as converters

TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')


class SWHObjectType:
    """Dulwich lookalike ObjectType class

    """
    def __init__(self, type_name):
        self.type_name = type_name


class SWHTag:
    """Dulwich lookalike tag class

    """
    def __init__(self, name, type_name, target, target_type, tagger,
                 tag_time, tag_timezone, message):
        self.name = name
        self.type_name = type_name
        # Same (type, id) pair shape that dulwich_tag_to_release unpacks.
        self.object = SWHObjectType(target_type), target
        self.tagger = tagger
        self._message = message
        self.tag_time = tag_time
        self.tag_timezone = tag_timezone
        self._tag_timezone_neg_utc = False

    def sha(self):
        # Hash of no data at all: every fake tag gets the same id, which is
        # the ``id`` the release tests below expect.
        return hashlib.sha1()


@pytest.mark.fs
class TestConverters(unittest.TestCase):
    """Check the dulwich -> swh.model converters against a repository
    imported from the ``example-submodule`` fast-export fixture, and
    against hand-made dulwich lookalikes."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.repo_path = tempfile.mkdtemp()
        cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path)

        fast_export = os.path.join(
            TEST_DATA, 'git-repos', 'example-submodule.fast-export.xz')

        # The context manager closes our copy of the fixture file once both
        # processes are done, instead of leaking the handle.
        with open(fast_export, 'rb') as fast_export_file:
            xz = subprocess.Popen(
                ['xzcat'],
                stdin=fast_export_file,
                stdout=subprocess.PIPE,
            )

            git = subprocess.Popen(
                ['git', 'fast-import', '--quiet'],
                stdin=xz.stdout,
                cwd=cls.repo_path,
            )

            # flush stdout of xz
            xz.stdout.close()
            git.communicate()
            # Reap xzcat so that no zombie process outlives the test class.
            xz.wait()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        shutil.rmtree(cls.repo_path)

    def test_blob_to_content(self):
        content_id = b'28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0'
        content = converters.dulwich_blob_to_content(self.repo[content_id])
        expected_content = Content(
            sha1_git=bytehex_to_hash(content_id),
            sha1=hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'),
            sha256=hash_to_bytes('fee7c8a485a10321ad94b64135073cb5'
                                 '5f22cb9f57fa2417d2adfb09d310adef'),
            blake2s256=hash_to_bytes('5d71873f42a137f6d89286e43677721e574'
                                     '1fa05ce4cd5e3c7ea7c44d4c2d10b'),
            data=(b'[submodule "example-dependency"]\n'
                  b'\tpath = example-dependency\n'
                  b'\turl = https://github.com/githubtraining/'
                  b'example-dependency.git\n'),
            length=124,
            status='visible',
        )
        self.assertEqual(content, expected_content)

    def test_convertion_wrong_input(self):
        class Something:
            type_name = b'something-not-the-right-type'

        m = {
            'blob': converters.dulwich_blob_to_content,
            'blob2': converters.dulwich_blob_to_content_id,
            'tree': converters.dulwich_tree_to_directory,
            # Was mapped to dulwich_tree_to_directory, which left the
            # commit converter untested on wrong input.
            'commit': converters.dulwich_commit_to_revision,
            'tag': converters.dulwich_tag_to_release,
        }

        for _callable in m.values():
            with self.assertRaises(ValueError):
                _callable(Something())

    def test_commit_to_revision(self):
        sha1 = b'9768d0b576dbaaecd80abedad6dfd0d72f1476da'

        revision = converters.dulwich_commit_to_revision(self.repo[sha1])

        expected_revision = Revision(
            id=hash_to_bytes('9768d0b576dbaaecd80abedad6dfd0d72f1476da'),
            directory=b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca',
            type=RevisionType.GIT,
            committer=Person(
                name=b'Stefano Zacchiroli',
                fullname=b'Stefano Zacchiroli <zack@upsilon.cc>',
                email=b'zack@upsilon.cc',
            ),
            author=Person(
                name=b'Stefano Zacchiroli',
                fullname=b'Stefano Zacchiroli <zack@upsilon.cc>',
                email=b'zack@upsilon.cc',
            ),
            committer_date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1443083765,
                    microseconds=0,
                ),
                negative_utc=None,
                offset=120,
            ),
            message=b'add submodule dependency\n',
            metadata=None,
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1443083765,
                    microseconds=0,
                ),
                negative_utc=None,
                offset=120,
            ),
            parents=[
                b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r'
            ],
            synthetic=False,
        )

        self.assertEqual(revision, expected_revision)

    def test_author_line_to_author(self):
        # edge case out of the way
        with self.assertRaises(TypeError):
            converters.parse_author(None)

        tests = {
            b'a <b@c.com>': Person(
                name=b'a',
                email=b'b@c.com',
                fullname=b'a <b@c.com>',
            ),
            b'<foo@bar.com>': Person(
                name=None,
                email=b'foo@bar.com',
                fullname=b'<foo@bar.com>',
            ),
            # NOTE(review): reconstructed entry, see the note at the top of
            # the file -- confirm the expected values.
            b'malformed <email': Person(
                name=b'malformed',
                email=b'email',
                fullname=b'malformed <email'
            ),
            b'trailing <sp@c.e> ': Person(
                name=b'trailing',
                email=b'sp@c.e',
                fullname=b'trailing <sp@c.e> ',
            ),
            b'no<sp@c.e>': Person(
                name=b'no',
                email=b'sp@c.e',
                fullname=b'no<sp@c.e>',
            ),
            b' <>': Person(
                name=None,
                email=None,
                fullname=b' <>',
            ),
            b'something': Person(
                name=b'something',
                email=None,
                fullname=b'something'
            )
        }

        for author in sorted(tests):
            parsed_author = tests[author]
            self.assertEqual(parsed_author,
                             converters.parse_author(author))

    def test_dulwich_tag_to_release_no_author_no_date(self):
        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
        message = b'some release message'
        tag = SWHTag(name='blah',
                     type_name=b'tag',
                     target=target,
                     target_type=b'commit',
                     message=message,
                     tagger=None,
                     tag_time=None, tag_timezone=None)

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=None,
            date=None,
            id=b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
            message=message,
            metadata=None,
            name='blah',
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        self.assertEqual(actual_release, expected_release)

    def test_dulwich_tag_to_release_author_and_date(self):
        tagger = b'hey dude <hello@mail.org>'
        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
        message = b'some release message'

        date = datetime.datetime(
            2007, 12, 5, tzinfo=datetime.timezone.utc
        ).timestamp()

        tag = SWHTag(name='blah',
                     type_name=b'tag',
                     target=target,
                     target_type=b'commit',
                     message=message,
                     tagger=tagger,
                     tag_time=date,
                     tag_timezone=0)

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=Person(
                email=b'hello@mail.org',
                fullname=b'hey dude <hello@mail.org>',
                name=b'hey dude'
            ),
            date=TimestampWithTimezone(
                negative_utc=False,
                offset=0,
                timestamp=Timestamp(
                    seconds=1196812800,
                    microseconds=0,
                )
            ),
            id=b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
            message=message,
            metadata=None,
            name='blah',
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        self.assertEqual(actual_release, expected_release)

    def test_dulwich_tag_to_release_author_no_date(self):
        # to reproduce bug T815 (fixed)
        tagger = b'hey dude <hello@mail.org>'
        target = b'641fb6e08ddb2e4fd096dcf18e80b894bf'
        message = b'some release message'
        tag = SWHTag(name='blah',
                     type_name=b'tag',
                     target=target,
                     target_type=b'commit',
                     message=message,
                     tagger=tagger,
                     tag_time=None, tag_timezone=None)

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=Person(
                email=b'hello@mail.org',
                fullname=b'hey dude <hello@mail.org>',
                name=b'hey dude'
            ),
            date=None,
            id=b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t',
            message=message,
            metadata=None,
            name='blah',
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        self.assertEqual(actual_release, expected_release)