diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 swh.loader.core >= 0.18.0 -swh.model >= 2.9.0 +swh.model >= 4.3.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,15 +1,24 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" +import logging +import re from typing import Any, Dict, Optional, cast -from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree +import attr +from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message -from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes +from swh.model.hashutil import ( + DEFAULT_ALGORITHMS, + MultiHash, + git_object_header, + hash_to_bytes, + hash_to_hex, +) from swh.model.model import ( BaseContent, Content, @@ -34,6 +43,10 @@ """Mode/perms of tree entries that point to a tree. They are normally equal to this mask, but may have more bits set to 1.""" +AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P\S+) (?P\S+)$") + +logger = logging.getLogger(__name__) + class HashMismatch(Exception): pass @@ -107,6 +120,20 @@ ) dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),) + + if dir_.compute_hash() != dir_.id: + expected_id = dir_.id + actual_id = dir_.compute_hash() + logger.warning( + "Expected directory to have id %s, but got %s. Recording raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = tree.as_raw_string() + dir_ = attr.evolve( + dir_, raw_manifest=git_object_header("tree", len(raw_string)) + raw_string + ) + check_id(dir_) return dir_ @@ -117,15 +144,19 @@ def dulwich_tsinfo_to_timestamp( - timestamp, timezone, timezone_neg_utc + timestamp, timezone: int, timezone_neg_utc: bool, timezone_bytes: Optional[bytes], ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with - Software Heritage""" - return TimestampWithTimezone( - timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), - offset=timezone // 60, - negative_utc=timezone_neg_utc if timezone == 0 else False, - ) + Software Heritage.""" + ts = Timestamp(seconds=int(timestamp), microseconds=0,) + if timezone_bytes is None: + # Failed to parse from the raw manifest, fallback to what Dulwich managed to + # parse. + return TimestampWithTimezone.from_numeric_offset( + timestamp=ts, offset=timezone // 60, negative_utc=timezone_neg_utc, + ) + else: + return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes) def dulwich_commit_to_revision(obj: ShaFile) -> Revision: @@ -133,6 +164,18 @@ raise ValueError("Argument is not a commit.") commit = cast(Commit, obj) + author_timezone = None + committer_timezone = None + for (field, value) in _parse_message(commit._chunked_text): + if field == b"author": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + author_timezone = m.group("timezone") + elif field == b"committer": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + committer_timezone = m.group("timezone") + extra_headers = [] if commit.encoding is not None: extra_headers.append((b"encoding", commit.encoding)) @@ -152,11 +195,17 @@ id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( - commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, + commit.author_time, + commit.author_timezone, + commit._author_timezone_neg_utc, + author_timezone, ), committer=parse_author(commit.committer), committer_date=dulwich_tsinfo_to_timestamp( - commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, + commit.commit_time, + commit.commit_timezone, + commit._commit_timezone_neg_utc, + committer_timezone, ), type=RevisionType.GIT, directory=bytes.fromhex(commit.tree.decode()), @@ -166,6 +215,20 @@ synthetic=False, parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), ) + + if rev.compute_hash() != rev.id: + expected_id = rev.id + actual_id = rev.compute_hash() + logger.warning( + "Expected revision to have id %s, but got %s. Recording raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = commit.as_raw_string() + rev = attr.evolve( + rev, raw_manifest=git_object_header("commit", len(raw_string)) + raw_string + ) + check_id(rev) return rev @@ -191,6 +254,14 @@ raise ValueError("Argument is not a tag.") tag = cast(Tag, obj) + tagger_timezone = None + # FIXME: _parse_message is a private function from Dulwich. + for (field, value) in _parse_message(tag.as_raw_chunks()): + if field == b"tagger": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + tagger_timezone = m.group("timezone") + target_type, target = tag.object if tag.tagger: author: Optional[Person] = parse_author(tag.tagger) @@ -198,7 +269,10 @@ date = None else: date = dulwich_tsinfo_to_timestamp( - tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, + tag.tag_time, + tag.tag_timezone, + tag._tag_timezone_neg_utc, + tagger_timezone, ) else: author = date = None @@ -218,5 +292,19 @@ metadata=None, synthetic=False, ) + + if rel.compute_hash() != rel.id: + expected_id = rel.id + actual_id = rel.compute_hash() + logger.warning( + "Expected release to have id %s, but got %s. Recording raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = tag.as_raw_string() + rel = attr.evolve( + rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string + ) + check_id(rel) return rel diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -171,16 +171,54 @@ _callable(Something()) def test_corrupt_tree(self): - # has a signature - sha1 = b"f0695c2e2fa7ce9d574023c3413761a473e500ca" - tree = copy.deepcopy(self.repo[sha1]) + sha1 = b"a9b41fc6347d778f16c4380b598d8083e9b4c1fb" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + tree = dulwich.objects.Tree() + tree.add(b"file1", 0o644, target) + assert tree.sha().hexdigest() == sha1.decode() converters.dulwich_tree_to_directory(tree) - del tree._entries[next(iter(tree._entries))] + original_sha = tree.sha() + + tree.add(b"file2", 0o644, target) + tree.sha() # reset tree._needs_serialization + tree._sha = original_sha # force the wrong hash + assert tree.sha().hexdigest() == sha1.decode() with pytest.raises(converters.HashMismatch): converters.dulwich_tree_to_directory(tree) + def test_weird_tree(self): + """Tests a tree with entries the wrong order""" + + raw_manifest = ( + b"0644 file2\x00" + b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce" + b"0644 file1\x00" + b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce" + ) + + tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_manifest) + + assert converters.dulwich_tree_to_directory(tree) == Directory( + entries=( + # in alphabetical order, as it should be + DirectoryEntry( + name=b"file1", + type="file", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + perms=0o644, + ), + DirectoryEntry( + name=b"file2", + type="file", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + perms=0o644, + ), + ), + raw_manifest=b"tree 62\x00" + raw_manifest, + ) + def test_tree_perms(self): entries = [ (b"blob_100644", 0o100644, "file"), @@ -228,16 +266,14 @@ ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), message=b"add submodule dependency\n", metadata=None, extra_headers=(), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",), synthetic=False, @@ -265,16 +301,14 @@ ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594137902, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), message=b"Am\xe9lioration du fichier READM\xa4\n", metadata=None, extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594136900, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),), synthetic=False, @@ -282,20 +316,67 @@ assert revision == expected_revision - @pytest.mark.parametrize( - "attribute", ["_message", "_encoding", "_author", "_gpgsig"] - ) + def test_commit_without_manifest(self): + """Tests a Release can still be produced when the manifest is not understood + by the custom parser in dulwich_commit_to_revision.""" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + message = b"some commit message" + author = Person( + fullname=b"Foo ", name=b"Foo", email=b"foo@example.org" + ) + commit = dulwich.objects.Commit() + commit.tree = target + commit.message = message + commit.author = commit.committer = b"Foo " + commit.author_time = commit.commit_time = 1641980946 + commit.author_timezone = commit.commit_timezone = 3600 + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + author=author, + committer=author, + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1641980946, microseconds=0), + offset_bytes=b"+0100", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1641980946, microseconds=0), + offset_bytes=b"+0100", + ), + type=RevisionType.GIT, + directory=hash_to_bytes(target.decode()), + synthetic=False, + metadata=None, + parents=(), + ) + + @pytest.mark.parametrize("attribute", ["message", "encoding", "author", "gpgsig"]) def test_corrupt_commit(self, attribute): - # has a signature - sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" - commit = copy.deepcopy(self.repo[sha1]) + sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + message = b"some commit message" + commit = dulwich.objects.Commit() + commit.tree = target + commit.message = message + commit.gpgsig = GPGSIG + commit.author = commit.committer = b"Foo " + commit.author_time = commit.commit_time = 1641980946 + commit.author_timezone = commit.commit_timezone = 3600 converters.dulwich_commit_to_revision(commit) + assert commit.sha().digest() == sha + + original_sha = commit.sha() + setattr(commit, attribute, b"abcde") + commit.sha() # reset tag._needs_serialization + commit._sha = original_sha # force the wrong hash + with pytest.raises(converters.HashMismatch): converters.dulwich_commit_to_revision(commit) if attribute == "_gpgsig": setattr(commit, attribute, None) + commit.sha() # reset tag._needs_serialization + commit._sha = original_sha # force the wrong hash with pytest.raises(converters.HashMismatch): converters.dulwich_commit_to_revision(commit) @@ -319,16 +400,14 @@ ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594138183, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n", metadata=None, extra_headers=((b"encoding", b"ISO-8859-15"), (b"mergetag", MERGETAG)), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594138183, microseconds=0,), - negative_utc=False, - offset=120, + offset_bytes=b"+0200", ), parents=( bytes.fromhex("322f5bc915e50fc25e85226b5a182bded0e98e4b"), @@ -339,6 +418,74 @@ assert revision == expected_revision + def test_weird_commit(self): + """Checks raw_manifest is set when the commit cannot fit the data model""" + + # Well-formed manifest + raw_manifest = ( + b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"author Foo 1640191028 +0200\n" + b"committer Foo 1640191028 +0200\n\n" + b"some commit message" + ) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset_bytes=b"+0200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=None, + ) + + # Mess with the offset + raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset_bytes=b"+200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=None, + ) + + # Mess with the rest of the manifest + raw_manifest2 = raw_manifest.replace( + b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", + b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", + ) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset_bytes=b"+0200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=b"commit 161\x00" + raw_manifest2, + ) + def test_author_line_to_author(self): # edge case out of the way with pytest.raises(TypeError): @@ -429,9 +576,8 @@ name=b"hey dude", ), date=TimestampWithTimezone( - negative_utc=False, - offset=0, timestamp=Timestamp(seconds=1196812800, microseconds=0,), + offset_bytes=b"+0000", ), id=sha, message=message, @@ -512,9 +658,7 @@ name=b"hey dude", ), date=TimestampWithTimezone( - negative_utc=False, - offset=0, - timestamp=Timestamp(seconds=0, microseconds=0,), + timestamp=Timestamp(seconds=0, microseconds=0,), offset_bytes=b"+0000" ), id=sha, message=message, @@ -561,7 +705,6 @@ @pytest.mark.parametrize("attribute", ["name", "message", "signature"]) def test_corrupt_tag(self, attribute): - # has a signature sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" @@ -590,3 +733,81 @@ tag._sha = original_sha # force the wrong hash with pytest.raises(converters.HashMismatch): converters.dulwich_tag_to_release(tag) + + def test_weird_tag(self): + """Checks raw_manifest is set when the tag cannot fit the data model""" + + # Well-formed manifest + raw_manifest = ( + b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"type commit\n" + b"tag blah\n" + b"tagger Foo 1640191027 +0200\n\n" + b"some release message" + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset_bytes=b"+0200", + ), + raw_manifest=None, + ) + + # Mess with the offset (negative UTC) + raw_manifest2 = raw_manifest.replace(b"+0200", b"-0000") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset_bytes=b"-0000", + ), + ) + + # Mess with the offset (other) + raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset_bytes=b"+200", + ), + ) + + # Mess with the rest of the manifest + raw_manifest2 = raw_manifest.replace( + b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", + b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset_bytes=b"+0200", + ), + raw_manifest=b"tag 136\x00" + raw_manifest2, + )