diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 swh.loader.core >= 0.18.0 -swh.model >= 0.4.0 +swh.model >= 2.9.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ dulwich >= 0.18.7 retrying click +typing-extensions diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -15,6 +15,7 @@ Content, Directory, DirectoryEntry, + HashableObject, ObjectType, Person, Release, @@ -26,7 +27,18 @@ TimestampWithTimezone, ) -HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} + +class HashMismatch(Exception): + pass + + +def check_id(obj: HashableObject) -> None: + real_id = obj.compute_hash() + if obj.id != real_id: + raise HashMismatch( + f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, " + f"got {real_id.hex()}" + ) def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: @@ -37,8 +49,12 @@ size = blob.raw_length() data = blob.as_raw_string() - hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest() - hashes["sha1_git"] = blob.sha().digest() + hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest() + if hashes["sha1_git"] != blob.sha().digest(): + raise HashMismatch( + f"Expected Content hash to be {blob.sha().digest().hex()}, " + f"got {hashes['sha1_git'].hex()}" + ) hashes["length"] = size return hashes @@ -84,7 +100,9 @@ ) ) - return Directory(id=tree.sha().digest(), entries=tuple(entries),) + dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),) + check_id(dir_) + return dir_ def parse_author(name_email: bytes) -> Person: @@ -124,7 +142,7 @@ if commit.gpgsig: extra_headers.append((b"gpgsig", commit.gpgsig)) - return Revision( + rev = Revision( id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( @@ 
-142,6 +160,8 @@ synthetic=False, parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), ) + check_id(rev) + return rev DULWICH_TARGET_TYPES = { @@ -181,7 +201,7 @@ if tag.signature: message += tag.signature - return Release( + rel = Release( id=tag.sha().digest(), author=author, date=date, @@ -192,3 +212,5 @@ metadata=None, synthetic=False, ) + check_id(rel) + return rel diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import os import shutil import subprocess @@ -12,7 +13,7 @@ import pytest import swh.loader.git.converters as converters -from swh.model.hashutil import bytehex_to_hash, hash_to_bytes +from swh.model.hashutil import bytehex_to_hash, hash_to_bytehex, hash_to_bytes from swh.model.model import ( Content, ObjectType, @@ -89,6 +90,7 @@ def __init__( self, + sha, name, type_name, target, @@ -99,6 +101,7 @@ message, signature, ): + self._sha = sha self.name = name self.type_name = type_name self.object = SWHObjectType(target_type), target @@ -110,9 +113,11 @@ self._tag_timezone_neg_utc = False def sha(self): - from hashlib import sha1 + class hasher: + def digest(): + return self._sha - return sha1() + return hasher @pytest.mark.fs @@ -161,13 +166,34 @@ ) assert content == expected_content + def test_corrupt_blob(self, mocker): + # id of a blob looked up in self.repo + sha1 = hash_to_bytes("28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0") + + blob = copy.deepcopy(self.repo[hash_to_bytehex(sha1)]) + + class hasher: + def digest(): + return sha1 + + blob._sha = hasher + + converters.dulwich_blob_to_content(blob) + converters.dulwich_blob_to_content_id(blob) + + sha1 = hash_to_bytes("1234" * 10) + + with pytest.raises(converters.HashMismatch): + converters.dulwich_blob_to_content(blob) + with 
pytest.raises(converters.HashMismatch): + converters.dulwich_blob_to_content_id(blob) + def test_convertion_wrong_input(self): class Something: type_name = b"something-not-the-right-type" m = { "blob": converters.dulwich_blob_to_content, - "blob2": converters.dulwich_blob_to_content_id, "tree": converters.dulwich_tree_to_directory, "commit": converters.dulwich_tree_to_directory, "tag": converters.dulwich_tag_to_release, @@ -177,6 +203,17 @@ with pytest.raises(ValueError): _callable(Something()) + def test_corrupt_tree(self): + # id of a tree looked up in self.repo + sha1 = b"f0695c2e2fa7ce9d574023c3413761a473e500ca" + tree = copy.deepcopy(self.repo[sha1]) + converters.dulwich_tree_to_directory(tree) + + del tree._entries[next(iter(tree._entries))] + + with pytest.raises(converters.HashMismatch): + converters.dulwich_tree_to_directory(tree) + def test_commit_to_revision(self): sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da" @@ -251,6 +288,23 @@ assert revision == expected_revision + @pytest.mark.parametrize( + "attribute", ["_message", "_encoding", "_author", "_gpgsig"] + ) + def test_corrupt_commit(self, attribute): + # has a signature + sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" + commit = copy.deepcopy(self.repo[sha1]) + converters.dulwich_commit_to_revision(commit) + setattr(commit, attribute, b"abcde") + with pytest.raises(converters.HashMismatch): + converters.dulwich_commit_to_revision(commit) + + if attribute == "_gpgsig": + setattr(commit, attribute, None) + with pytest.raises(converters.HashMismatch): + converters.dulwich_commit_to_revision(commit) + def test_commit_to_revision_with_extra_headers_mergetag(self): sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703" @@ -319,9 +373,11 @@ assert parsed_author == converters.parse_author(author) def test_dulwich_tag_to_release_no_author_no_date(self): - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" 
message = b"some release message" tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, @@ -340,7 +396,7 @@ expected_release = Release( author=None, date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", @@ -352,8 +408,9 @@ assert actual_release == expected_release def test_dulwich_tag_to_release_author_and_date(self): + sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef") tagger = b"hey dude " - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" import datetime @@ -361,6 +418,7 @@ date = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp() tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, @@ -387,7 +445,7 @@ offset=0, timestamp=Timestamp(seconds=1196812800, microseconds=0,), ), - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", @@ -400,10 +458,12 @@ def test_dulwich_tag_to_release_author_no_date(self): # to reproduce bug T815 (fixed) + sha = hash_to_bytes("41076e970975122dc6b2a878aa9797960bc4781d") tagger = b"hey dude " - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, @@ -426,7 +486,7 @@ name=b"hey dude", ), date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", @@ -438,9 +498,11 @@ assert actual_release == expected_release def test_dulwich_tag_to_release_signature(self): - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" + sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", 
target=target, @@ -459,7 +521,7 @@ expected_release = Release( author=None, date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message + GPGSIG, metadata=None, name=b"blah", @@ -469,3 +531,32 @@ ) assert actual_release == expected_release + + @pytest.mark.parametrize("attribute", ["name", "message", "signature"]) + def test_corrupt_tag(self, attribute): + # has a signature + sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + message = b"some release message" + tag = SWHTag( + sha=sha, + name=b"blah", + type_name=b"tag", + target=target, + target_type=b"commit", + message=message, + signature=GPGSIG, + tagger=None, + tag_time=None, + tag_timezone=None, + ) + converters.dulwich_tag_to_release(tag) + + setattr(tag, attribute, b"abcde") + with pytest.raises(converters.HashMismatch): + converters.dulwich_tag_to_release(tag) + + if attribute == "signature": + setattr(tag, attribute, None) + with pytest.raises(converters.HashMismatch): + converters.dulwich_tag_to_release(tag)