Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/converters.py
Show All 9 Lines | |||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | ||||
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
Directory, | Directory, | ||||
DirectoryEntry, | DirectoryEntry, | ||||
HashableObject, | |||||
ObjectType, | ObjectType, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
SkippedContent, | SkippedContent, | ||||
TargetType, | TargetType, | ||||
Timestamp, | Timestamp, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} | |||||
class HashMismatch(Exception):
    """Raised when an object's recomputed intrinsic hash differs from the
    id it carries."""
douardda: why not make check_id return the object?
so you can write:
```
return check_id(Directory… | |||||
Done Inline Actionsbecause it's weird and forces line wrapping when constructing Revision vlorentz: because it's weird and forces line wrapping when constructing Revision | |||||
def check_id(obj: HashableObject) -> None:
    """Recompute ``obj``'s intrinsic hash and compare it with the id the
    object carries.

    Raises:
        HashMismatch: if the recomputed hash differs from ``obj.id``.
    """
    expected = obj.id
    actual = obj.compute_hash()
    if expected == actual:
        return
    raise HashMismatch(
        f"Expected {type(obj).__name__} hash to be {expected.hex()}, "
        f"got {actual.hex()}"
    )
Not Done Inline ActionsI have a hard time reading this ^ chunk (line 37 to 45). Does that mean something like, all functions decorated with check_ids must respect the signature (self, obj: ShaFile, log=None) -> _THashable when called upon? (That plus what the decorator actually does, checking for mismatched checksums). Now that I said it, it actually makes sense to me... ardumont: I have a hard time reading this ^ chunk (line 37 to 45).
Does that mean something like, all… | |||||
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id.

    Returns a dict with one entry per hash algorithm in
    ``DEFAULT_ALGORITHMS`` plus a ``length`` entry.

    Raises:
        ValueError: if ``obj`` is not a blob.
        HashMismatch: if the sha1_git recomputed from the blob's raw data
            does not match the hash dulwich reports for it.
    """
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    length = blob.raw_length()
    raw_data = blob.as_raw_string()
    digests = MultiHash.from_data(raw_data, DEFAULT_ALGORITHMS).digest()
    # dulwich already knows the object's sha1_git; recompute and cross-check
    # instead of trusting it blindly.
    reported = blob.sha().digest()
    if digests["sha1_git"] != reported:
        raise HashMismatch(
            f"Expected Content hash to be {reported.hex()}, "
            f"got {digests['sha1_git'].hex()}"
        )
    digests["length"] = length
    return digests
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | ||||
"""Convert a dulwich blob to a Software Heritage content | """Convert a dulwich blob to a Software Heritage content | ||||
""" | """ | ||||
Show All 29 Lines | for entry in tree.iteritems(): | ||||
DirectoryEntry( | DirectoryEntry( | ||||
type=entry_mode_map.get(entry.mode, "file"), | type=entry_mode_map.get(entry.mode, "file"), | ||||
perms=entry.mode, | perms=entry.mode, | ||||
name=entry.path, | name=entry.path, | ||||
target=hash_to_bytes(entry.sha.decode("ascii")), | target=hash_to_bytes(entry.sha.decode("ascii")), | ||||
) | ) | ||||
) | ) | ||||
return Directory(id=tree.sha().digest(), entries=tuple(entries),) | dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),) | ||||
check_id(dir_) | |||||
return dir_ | |||||
def parse_author(name_email: bytes) -> Person:
    """Parse a raw ``name <email>`` author line into a Person."""
    person = Person.from_fullname(name_email)
    return person
def dulwich_tsinfo_to_timestamp( | def dulwich_tsinfo_to_timestamp( | ||||
Show All 23 Lines | if commit.mergetag: | ||||
extra_headers.append((b"mergetag", raw_string[:-1])) | extra_headers.append((b"mergetag", raw_string[:-1])) | ||||
if commit.extra: | if commit.extra: | ||||
extra_headers.extend((k, v) for k, v in commit.extra) | extra_headers.extend((k, v) for k, v in commit.extra) | ||||
if commit.gpgsig: | if commit.gpgsig: | ||||
extra_headers.append((b"gpgsig", commit.gpgsig)) | extra_headers.append((b"gpgsig", commit.gpgsig)) | ||||
return Revision( | rev = Revision( | ||||
id=commit.sha().digest(), | id=commit.sha().digest(), | ||||
author=parse_author(commit.author), | author=parse_author(commit.author), | ||||
date=dulwich_tsinfo_to_timestamp( | date=dulwich_tsinfo_to_timestamp( | ||||
commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, | commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, | ||||
), | ), | ||||
committer=parse_author(commit.committer), | committer=parse_author(commit.committer), | ||||
committer_date=dulwich_tsinfo_to_timestamp( | committer_date=dulwich_tsinfo_to_timestamp( | ||||
commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, | commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, | ||||
), | ), | ||||
type=RevisionType.GIT, | type=RevisionType.GIT, | ||||
directory=bytes.fromhex(commit.tree.decode()), | directory=bytes.fromhex(commit.tree.decode()), | ||||
message=commit.message, | message=commit.message, | ||||
metadata=None, | metadata=None, | ||||
extra_headers=tuple(extra_headers), | extra_headers=tuple(extra_headers), | ||||
synthetic=False, | synthetic=False, | ||||
parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), | parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), | ||||
) | ) | ||||
check_id(rev) | |||||
return rev | |||||
# Map from a dulwich object type name (as a byte string) to the
# Software Heritage TargetType used when referencing such an object.
DULWICH_TARGET_TYPES = {
    b"blob": TargetType.CONTENT,
    b"tree": TargetType.DIRECTORY,
    b"commit": TargetType.REVISION,
    b"tag": TargetType.RELEASE,
}
Show All 23 Lines | if tag.tagger: | ||||
) | ) | ||||
else: | else: | ||||
author = date = None | author = date = None | ||||
message = tag.message | message = tag.message | ||||
if tag.signature: | if tag.signature: | ||||
message += tag.signature | message += tag.signature | ||||
return Release( | rel = Release( | ||||
id=tag.sha().digest(), | id=tag.sha().digest(), | ||||
author=author, | author=author, | ||||
date=date, | date=date, | ||||
name=tag.name, | name=tag.name, | ||||
target=bytes.fromhex(target.decode()), | target=bytes.fromhex(target.decode()), | ||||
target_type=DULWICH_OBJECT_TYPES[target_type.type_name], | target_type=DULWICH_OBJECT_TYPES[target_type.type_name], | ||||
message=message, | message=message, | ||||
metadata=None, | metadata=None, | ||||
synthetic=False, | synthetic=False, | ||||
) | ) | ||||
check_id(rel) | |||||
return rel |
why not make check_id return the object?
so you can write: