Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/converters.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Convert dulwich objects to dictionaries suitable for swh.storage""" | """Convert dulwich objects to dictionaries suitable for swh.storage""" | ||||
import logging | |||||
import re | |||||
from typing import Any, Dict, Optional, cast | from typing import Any, Dict, Optional, cast | ||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | import attr | ||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message | |||||
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | from swh.model.hashutil import ( | ||||
DEFAULT_ALGORITHMS, | |||||
MultiHash, | |||||
git_object_header, | |||||
hash_to_bytes, | |||||
hash_to_hex, | |||||
) | |||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
Directory, | Directory, | ||||
DirectoryEntry, | DirectoryEntry, | ||||
HashableObject, | HashableObject, | ||||
ObjectType, | ObjectType, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
SkippedContent, | SkippedContent, | ||||
TargetType, | TargetType, | ||||
Timestamp, | Timestamp, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
# Mode/perms of tree entries that point to a commit (submodule link).
# They are normally equal to this mask, but may have more bits set to 1.
COMMIT_MODE_MASK = 0o160000

# Mode/perms of tree entries that point to a tree (sub-directory).
# They are normally equal to this mask, but may have more bits set to 1.
TREE_MODE_MASK = 0o040000

# Matches the tail of a git authorship line ("Name <email> <ts> <tz>"),
# capturing the raw timestamp and timezone tokens as written in the object.
AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P<timestamp>\S+) (?P<timezone>\S+)$")

logger = logging.getLogger(__name__)
class HashMismatch(Exception):
    """Raised when a converted object's recorded git id does not match the
    hash recomputed from its content."""
def check_id(obj: HashableObject) -> None: | def check_id(obj: HashableObject) -> None: | ||||
real_id = obj.compute_hash() | real_id = obj.compute_hash() | ||||
if obj.id != real_id: | if obj.id != real_id: | ||||
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | |||||
def parse_author(name_email: bytes) -> Person:
    """Build a :class:`Person` from a raw git authorship byte string
    (``b"Name <email>"``), delegating the splitting to the model."""
    person = Person.from_fullname(name_email)
    return person
def dulwich_tsinfo_to_timestamp(
    timestamp: int,
    timezone: int,
    timezone_neg_utc: bool,
    timezone_bytes: Optional[bytes] = None,
) -> TimestampWithTimezone:
    """Convert the dulwich timestamp information to a structure compatible with
    Software Heritage.

    Args:
        timestamp: seconds since the epoch, as parsed by dulwich
        timezone: UTC offset in seconds, as parsed by dulwich
        timezone_neg_utc: dulwich's flag for the ``-0000`` offset spelling
        timezone_bytes: the raw timezone bytes as written in the git object,
            when available; preserved verbatim as ``offset_bytes`` so the
            object round-trips byte-for-byte. Defaults to None for backward
            compatibility with callers that do not extract the raw bytes.

    Returns:
        the corresponding TimestampWithTimezone
    """
    kwargs = {}
    if timezone_bytes is not None:
        kwargs["offset_bytes"] = timezone_bytes
    return TimestampWithTimezone(
        timestamp=Timestamp(seconds=int(timestamp), microseconds=0,),
        # offset/negative_utc are legacy fields, kept until storage drops them
        offset=timezone // 60,
        negative_utc=timezone_neg_utc if timezone == 0 else False,
        **kwargs,
    )
def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
    """Convert a dulwich commit object to a swh :class:`Revision`.

    If the converted object does not hash back to the original git id (e.g.
    because of non-canonical formatting in the raw commit), the original git
    manifest is recorded on the Revision (``raw_manifest``) so no information
    is lost.

    Args:
        obj: a dulwich ``Commit``, passed as a generic ``ShaFile``

    Returns:
        the corresponding Revision

    Raises:
        ValueError: if ``obj`` is not a commit
    """
    if obj.type_name != b"commit":
        raise ValueError("Argument is not a commit.")
    commit = cast(Commit, obj)

    # Extract the raw timezone bytes of the author/committer lines, so the
    # exact offset spelling (e.g. "+0000" vs "-0000" vs weird values) is
    # preserved through the conversion.
    author_timezone = None
    committer_timezone = None
    # FIXME: _parse_message is a private function from Dulwich.
    for (field, value) in _parse_message(commit._chunked_text):
        if field == b"author":
            m = AUTHORSHIP_LINE_RE.match(value)
            if m:
                author_timezone = m.group("timezone")
        elif field == b"committer":
            m = AUTHORSHIP_LINE_RE.match(value)
            if m:
                committer_timezone = m.group("timezone")

    extra_headers = []
    if commit.encoding is not None:
        extra_headers.append((b"encoding", commit.encoding))
    if commit.mergetag:
        for mergetag in commit.mergetag:
            raw_string = mergetag.as_raw_string()
            assert raw_string.endswith(b"\n")
            extra_headers.append((b"mergetag", raw_string[:-1]))
    if commit.extra:
        extra_headers.extend((k, v) for k, v in commit.extra)
    if commit.gpgsig:
        extra_headers.append((b"gpgsig", commit.gpgsig))
    rev = Revision(
        id=commit.sha().digest(),
        author=parse_author(commit.author),
        date=dulwich_tsinfo_to_timestamp(
            commit.author_time,
            commit.author_timezone,
            commit._author_timezone_neg_utc,
            author_timezone,
        ),
        committer=parse_author(commit.committer),
        committer_date=dulwich_tsinfo_to_timestamp(
            commit.commit_time,
            commit.commit_timezone,
            commit._commit_timezone_neg_utc,
            committer_timezone,
        ),
        type=RevisionType.GIT,
        directory=bytes.fromhex(commit.tree.decode()),
        message=commit.message,
        metadata=None,
        extra_headers=tuple(extra_headers),
        synthetic=False,
        parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents),
    )
    # Hash once and reuse; the original recomputed compute_hash() a second
    # time inside the mismatch branch.
    actual_id = rev.compute_hash()
    if actual_id != rev.id:
        logger.warning(
            "Expected revision to have id %s, but got %s. Recording raw_manifest.",
            hash_to_hex(rev.id),
            hash_to_hex(actual_id),
        )
        raw_string = commit.as_raw_string()
        rev = attr.evolve(
            rev, raw_manifest=git_object_header("commit", len(raw_string)) + raw_string
        )
    check_id(rev)
    return rev
DULWICH_TARGET_TYPES = { | DULWICH_TARGET_TYPES = { | ||||
b"blob": TargetType.CONTENT, | b"blob": TargetType.CONTENT, | ||||
b"tree": TargetType.DIRECTORY, | b"tree": TargetType.DIRECTORY, | ||||
b"commit": TargetType.REVISION, | b"commit": TargetType.REVISION, | ||||
Show All 9 Lines | |||||
} | } | ||||
def dulwich_tag_to_release(obj: ShaFile) -> Release:
    """Convert a dulwich tag object to a swh :class:`Release`.

    If the converted object does not hash back to the original git id (e.g.
    because of non-canonical formatting in the raw tag), the original git
    manifest is recorded on the Release (``raw_manifest``) so no information
    is lost.

    Args:
        obj: a dulwich ``Tag``, passed as a generic ``ShaFile``

    Returns:
        the corresponding Release

    Raises:
        ValueError: if ``obj`` is not a tag
    """
    if obj.type_name != b"tag":
        raise ValueError("Argument is not a tag.")
    tag = cast(Tag, obj)

    # Extract the raw timezone bytes of the tagger line, so the exact offset
    # spelling is preserved through the conversion.
    tagger_timezone = None
    # FIXME: _parse_message is a private function from Dulwich.
    for (field, value) in _parse_message(tag.as_raw_chunks()):
        if field == b"tagger":
            m = AUTHORSHIP_LINE_RE.match(value)
            if m:
                tagger_timezone = m.group("timezone")

    target_type, target = tag.object
    if tag.tagger:
        author: Optional[Person] = parse_author(tag.tagger)
        if tag.tag_time is None:
            date = None
        else:
            date = dulwich_tsinfo_to_timestamp(
                tag.tag_time,
                tag.tag_timezone,
                tag._tag_timezone_neg_utc,
                tagger_timezone,
            )
    else:
        author = date = None
    message = tag.message
    if tag.signature:
        message += tag.signature
    rel = Release(
        id=tag.sha().digest(),
        author=author,
        date=date,
        name=tag.name,
        target=bytes.fromhex(target.decode()),
        target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
        message=message,
        metadata=None,
        synthetic=False,
    )
    # Hash once and reuse; the original recomputed compute_hash() a second
    # time inside the mismatch branch.
    actual_id = rel.compute_hash()
    if actual_id != rel.id:
        logger.warning(
            "Expected release to have id %s, but got %s. Recording raw_manifest.",
            hash_to_hex(rel.id),
            hash_to_hex(actual_id),
        )
        raw_string = tag.as_raw_string()
        rel = attr.evolve(
            rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string
        )
    check_id(rel)
    return rel
So, we'll be dropping these arguments once we drop them off of storage, correct?