diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,15 +1,24 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" +import logging +import re from typing import Any, Dict, Optional, cast -from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree +import attr +from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message -from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes +from swh.model.hashutil import ( + DEFAULT_ALGORITHMS, + MultiHash, + git_object_header, + hash_to_bytes, + hash_to_hex, +) from swh.model.model import ( BaseContent, Content, @@ -34,6 +43,10 @@ """Mode/perms of tree entries that point to a tree. They are normally equal to this mask, but may have more bits set to 1.""" +AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P\S+) (?P\S+)$") + +logger = logging.getLogger(__name__) + class HashMismatch(Exception): pass @@ -117,14 +130,18 @@ def dulwich_tsinfo_to_timestamp( - timestamp, timezone, timezone_neg_utc + timestamp, timezone, timezone_neg_utc, timezone_bytes: Optional[bytes], ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" + kwargs = {} + if timezone_bytes is not None: + kwargs["offset_bytes"] = timezone_bytes return TimestampWithTimezone( timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), offset=timezone // 60, negative_utc=timezone_neg_utc if timezone == 0 else False, + **kwargs, ) @@ -133,6 +150,18 @@ raise ValueError("Argument is not a commit.") commit = cast(Commit, obj) + author_timezone = None + committer_timezone = None + for (field, value) in _parse_message(commit._chunked_text): + if field == b"author": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + author_timezone = m.group("timezone") + elif field == b"committer": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + committer_timezone = m.group("timezone") + extra_headers = [] if commit.encoding is not None: extra_headers.append((b"encoding", commit.encoding)) @@ -152,11 +181,17 @@ id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( - commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, + commit.author_time, + commit.author_timezone, + commit._author_timezone_neg_utc, + author_timezone, ), committer=parse_author(commit.committer), committer_date=dulwich_tsinfo_to_timestamp( - commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, + commit.commit_time, + commit.commit_timezone, + commit._commit_timezone_neg_utc, + committer_timezone, ), type=RevisionType.GIT, directory=bytes.fromhex(commit.tree.decode()), @@ -191,6 +226,14 @@ raise ValueError("Argument is not a tag.") tag = cast(Tag, obj) + tagger_timezone = None + # FIXME: _parse_message is a private function from Dulwich. + for (field, value) in _parse_message(tag.as_raw_chunks()): + if field == b"tagger": + m = AUTHORSHIP_LINE_RE.match(value) + if m: + tagger_timezone = m.group("timezone") + target_type, target = tag.object if tag.tagger: author: Optional[Person] = parse_author(tag.tagger) @@ -198,7 +241,10 @@ date = None else: date = dulwich_tsinfo_to_timestamp( - tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, + tag.tag_time, + tag.tag_timezone, + tag._tag_timezone_neg_utc, + tagger_timezone, ) else: author = date = None @@ -218,5 +264,19 @@ metadata=None, synthetic=False, ) + + if rel.compute_hash() != rel.id: + expected_id = rel.id + actual_id = rel.compute_hash() + logger.warning( + "Expected release to have id %s, but got %s. Adding raw_manifest.", + hash_to_hex(expected_id), + hash_to_hex(actual_id), + ) + raw_string = tag.as_raw_string() + rel = attr.evolve( + rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string + ) + check_id(rel) return rel diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -561,7 +561,6 @@ @pytest.mark.parametrize("attribute", ["name", "message", "signature"]) def test_corrupt_tag(self, attribute): - # has a signature sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" @@ -590,3 +589,70 @@ tag._sha = original_sha # force the wrong hash with pytest.raises(converters.HashMismatch): converters.dulwich_tag_to_release(tag) + + def test_weird_tag(self): + """Checks raw_manifest is set when the tag cannot fit the data model""" + + # Well-formed manifest + raw_manifest = ( + b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"type commit\n" + b"tag blah\n" + b"tagger Foo 1640191027 +0200\n\n" + b"some release message" + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+0200", + ), + raw_manifest=None, + ) + + # Mess with the offset + raw_manifest2 = raw_manifest.replace(b"+0200", b"+200") + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + offset_bytes=b"+200", + ), + ) + + # Mess with the rest of the manifest + raw_manifest2 = raw_manifest.replace( + b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", + b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", + ) + tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2) + assert converters.dulwich_tag_to_release(tag) == Release( + name=b"blah", + message=b"some release message", + target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + target_type=ObjectType.REVISION, + synthetic=False, + author=Person.from_fullname(b"Foo ",), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191027, microseconds=0), + offset=120, + negative_utc=False, + ), + raw_manifest=b"tag 136\x00" + raw_manifest2, + )