diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 swh.loader.core >= 0.18.0 -swh.model >= 2.9.0 +swh.model >= 4.2.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,15 +1,24 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" +import logging +import re from typing import Any, Dict, Optional, cast -from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree +import attr +from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message -from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes +from swh.model.hashutil import ( + DEFAULT_ALGORITHMS, + MultiHash, + git_object_header, + hash_to_bytes, + hash_to_hex, +) from swh.model.model import ( BaseContent, Content, @@ -34,6 +43,10 @@ """Mode/perms of tree entries that point to a tree. They are normally equal to this mask, but may have more bits set to 1.""" +AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P\S+) (?P\S+)$") + +logger = logging.getLogger(__name__) + class HashMismatch(Exception): pass @@ -117,14 +130,18 @@ def dulwich_tsinfo_to_timestamp( - timestamp, timezone, timezone_neg_utc + timestamp, timezone, timezone_neg_utc, timezone_bytes: Optional[bytes], ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" + kwargs = {} + if timezone_bytes is not None: + kwargs["offset_bytes"] = timezone_bytes return TimestampWithTimezone( timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), offset=timezone // 60, negative_utc=timezone_neg_utc if timezone == 0 else False, + **kwargs, ) @@ -133,6 +150,18 @@ raise ValueError("Argument is not a commit.") commit = cast(Commit, obj) + author_timezone = None + committer_timezone = None + for (field, value) in _parse_message(commit._chunked_text): + if field == b"author": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + author_timezone = m.group("timezone") + elif field == b"committer": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + committer_timezone = m.group("timezone") + extra_headers = [] if commit.encoding is not None: extra_headers.append((b"encoding", commit.encoding)) @@ -152,11 +181,17 @@ id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( - commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, + commit.author_time, + commit.author_timezone, + commit._author_timezone_neg_utc, + author_timezone, ), committer=parse_author(commit.committer), committer_date=dulwich_tsinfo_to_timestamp( - commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, + commit.commit_time, + commit.commit_timezone, + commit._commit_timezone_neg_utc, + committer_timezone, ), type=RevisionType.GIT, directory=bytes.fromhex(commit.tree.decode()), @@ -166,6 +201,20 @@ synthetic=False, parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), ) + + if rev.compute_hash() != rev.id: + expected_id = rev.id + actual_id = rev.compute_hash() + logger.warning( + "Expected revision to have id %s, but got %s. Recording raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = commit.as_raw_string() + rev = attr.evolve( + rev, raw_manifest=git_object_header("commit", len(raw_string)) + raw_string + ) + check_id(rev) return rev @@ -191,6 +240,14 @@ raise ValueError("Argument is not a tag.") tag = cast(Tag, obj) + tagger_timezone = None + # FIXME: _parse_message is a private function from Dulwich. + for (field, value) in _parse_message(tag.as_raw_chunks()): + if field == b"tagger": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + tagger_timezone = m.group("timezone") + target_type, target = tag.object if tag.tagger: author: Optional[Person] = parse_author(tag.tagger) @@ -198,7 +255,10 @@ date = None else: date = dulwich_tsinfo_to_timestamp( - tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, + tag.tag_time, + tag.tag_timezone, + tag._tag_timezone_neg_utc, + tagger_timezone, ) else: author = date = None @@ -218,5 +278,19 @@ metadata=None, synthetic=False, ) + + if rel.compute_hash() != rel.id: + expected_id = rel.id + actual_id = rel.compute_hash() + logger.warning( + "Expected release to have id %s, but got %s. Recording raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = tag.as_raw_string() + rel = attr.evolve( + rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string + ) + check_id(rel) return rel diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -282,20 +282,34 @@ assert revision == expected_revision - @pytest.mark.parametrize( - "attribute", ["_message", "_encoding", "_author", "_gpgsig"] - ) + @pytest.mark.parametrize("attribute", ["message", "encoding", "author", "gpgsig"]) def test_corrupt_commit(self, attribute): - # has a signature - sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" - commit = copy.deepcopy(self.repo[sha1]) + sha = hash_to_bytes("c40d5a78d0d499296c101fd6e9fe161e2a9af43b") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + message = b"some commit message" + commit = dulwich.objects.Commit() + commit.tree = target + commit.message = message + commit.gpgsig = GPGSIG + commit.author = commit.committer = b"Foo " + commit.author_time = commit.commit_time = 1641980946 + commit.author_timezone = commit.commit_timezone = 60 converters.dulwich_commit_to_revision(commit) + assert commit.sha().digest() == sha + + original_sha = commit.sha() + setattr(commit, attribute, b"abcde") + commit.sha() # reset tag._needs_serialization + commit._sha = original_sha # force the wrong hash + with pytest.raises(converters.HashMismatch): converters.dulwich_commit_to_revision(commit) if attribute == "_gpgsig": setattr(commit, attribute, None) + commit.sha() # reset tag._needs_serialization + commit._sha = original_sha # force the wrong hash with pytest.raises(converters.HashMismatch): converters.dulwich_commit_to_revision(commit) @@ -339,6 +353,80 @@ assert revision == expected_revision + def test_weird_commit(self): + """Checks raw_manifest is set when the commit cannot fit the data model""" + + # Well-formed manifest + raw_manifest = ( + b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"author Foo 1640191028 +0200\n" + b"committer Foo 1640191028 +0200\n\n" + b"some commit message" + ) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+0200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=None, + ) + + # Mess with the offset + raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=None, + ) + + # Mess with the rest of the manifest + raw_manifest2 = raw_manifest.replace( + b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", + b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", + ) + commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+0200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname(b"Foo ",), + committer=Person.from_fullname(b"Foo ",), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=b"commit 161\x00" + raw_manifest2, + ) + def test_author_line_to_author(self): # edge case out of the way with pytest.raises(TypeError): @@ -561,7 +649,6 @@ @pytest.mark.parametrize("attribute", ["name", "message", "signature"]) def test_corrupt_tag(self, attribute): - # has a signature sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" @@ -590,3 +677,70 @@ tag._sha = original_sha # force the wrong hash with pytest.raises(converters.HashMismatch): converters.dulwich_tag_to_release(tag) + + def test_weird_tag(self): + """Checks raw_manifest is set when the tag cannot fit the data model""" + + # Well-formed manifest + raw_manifest = ( + b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"type commit\n" + b"tag blah\n" + b"tagger Foo 1640191027 +0200\n\n" + b"some release message" + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+0200", + ), + raw_manifest=None, + ) + + # Mess with the offset + raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+200", + ), + ) + + # Mess with the rest of the manifest + raw_manifest2 = raw_manifest.replace( + b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", + b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + ), + raw_manifest=b"tag 136\x00" + raw_manifest2, + )