Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/converters.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Convert dulwich objects to dictionaries suitable for swh.storage""" | """Convert dulwich objects to dictionaries suitable for swh.storage""" | ||||
from typing import Any, Dict, Optional, cast | import functools | ||||
from typing import Any, Dict, Generic, Optional, TypeVar, cast | |||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | ||||
from typing_extensions import Protocol | |||||
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
Directory, | Directory, | ||||
DirectoryEntry, | DirectoryEntry, | ||||
HashableObject, | |||||
ObjectType, | ObjectType, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
SkippedContent, | SkippedContent, | ||||
TargetType, | TargetType, | ||||
Timestamp, | Timestamp, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} | |||||
class HashMismatch(Exception): | |||||
pass | |||||
douardda: Why not make check_id return the object, so that you can write:
```
return check_id(Directory…
```
Done Inline Actions
vlorentz: Because it's weird and forces line wrapping when constructing Revision.
_THashable = TypeVar("_THashable", bound=HashableObject, covariant=True) | |||||
class _ConverterProtocol(Protocol, Generic[_THashable]): | |||||
def __call__(self, obj: ShaFile, log=None) -> _THashable: | |||||
... | |||||
Not Done Inline Actions
ardumont: I have a hard time reading this ^ chunk (lines 37 to 45). Does that mean something like: all functions decorated with check_ids must respect the signature `(self, obj: ShaFile, log=None) -> _THashable` when called? (That, plus what the decorator actually does, i.e. checking for mismatched checksums.) Now that I have said it, it actually makes sense to me...
def check_ids(f: _ConverterProtocol[_THashable]) -> _ConverterProtocol[_THashable]: | |||||
"""Decorator for functions returning a BaseModel object. | |||||
Recomputes these object's id, and errors if they don't match.""" | |||||
@functools.wraps(f) | |||||
def newf(*args, **kwargs) -> _THashable: | |||||
obj: _THashable = f(*args, **kwargs) | |||||
real_id = obj.compute_hash() | |||||
if obj.id != real_id: | |||||
raise HashMismatch( | |||||
f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, " | |||||
f"got {real_id.hex()}" | |||||
) | |||||
return obj | |||||
return newf | |||||
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: | def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: | ||||
"""Convert a dulwich blob to a Software Heritage content id""" | """Convert a dulwich blob to a Software Heritage content id""" | ||||
if obj.type_name != b"blob": | if obj.type_name != b"blob": | ||||
raise ValueError("Argument is not a blob.") | raise ValueError("Argument is not a blob.") | ||||
blob = cast(Blob, obj) | blob = cast(Blob, obj) | ||||
size = blob.raw_length() | size = blob.raw_length() | ||||
data = blob.as_raw_string() | data = blob.as_raw_string() | ||||
hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest() | hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest() | ||||
hashes["sha1_git"] = blob.sha().digest() | if hashes["sha1_git"] != blob.sha().digest(): | ||||
raise HashMismatch( | |||||
f"Expected Content hash to be {blob.sha().digest().hex()}, " | |||||
f"got {hashes['sha1_git'].hex()}" | |||||
) | |||||
hashes["length"] = size | hashes["length"] = size | ||||
return hashes | return hashes | ||||
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | ||||
"""Convert a dulwich blob to a Software Heritage content | """Convert a dulwich blob to a Software Heritage content | ||||
""" | """ | ||||
if obj.type_name != b"blob": | if obj.type_name != b"blob": | ||||
raise ValueError("Argument is not a blob.") | raise ValueError("Argument is not a blob.") | ||||
blob = cast(Blob, obj) | blob = cast(Blob, obj) | ||||
hashes = dulwich_blob_to_content_id(blob) | hashes = dulwich_blob_to_content_id(blob) | ||||
if max_content_size is not None and hashes["length"] >= max_content_size: | if max_content_size is not None and hashes["length"] >= max_content_size: | ||||
return SkippedContent(status="absent", reason="Content too large", **hashes,) | return SkippedContent(status="absent", reason="Content too large", **hashes,) | ||||
else: | else: | ||||
return Content(data=blob.as_raw_string(), status="visible", **hashes,) | return Content(data=blob.as_raw_string(), status="visible", **hashes,) | ||||
@check_ids | |||||
def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory: | def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory: | ||||
"""Format a tree as a directory""" | """Format a tree as a directory""" | ||||
if obj.type_name != b"tree": | if obj.type_name != b"tree": | ||||
raise ValueError("Argument is not a tree.") | raise ValueError("Argument is not a tree.") | ||||
tree = cast(Tree, obj) | tree = cast(Tree, obj) | ||||
entries = [] | entries = [] | ||||
Show All 30 Lines | ) -> TimestampWithTimezone: | ||||
Software Heritage""" | Software Heritage""" | ||||
return TimestampWithTimezone( | return TimestampWithTimezone( | ||||
timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), | timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), | ||||
offset=timezone // 60, | offset=timezone // 60, | ||||
negative_utc=timezone_neg_utc if timezone == 0 else False, | negative_utc=timezone_neg_utc if timezone == 0 else False, | ||||
) | ) | ||||
@check_ids | |||||
def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision: | def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision: | ||||
if obj.type_name != b"commit": | if obj.type_name != b"commit": | ||||
raise ValueError("Argument is not a commit.") | raise ValueError("Argument is not a commit.") | ||||
commit = cast(Commit, obj) | commit = cast(Commit, obj) | ||||
extra_headers = [] | extra_headers = [] | ||||
if commit.encoding is not None: | if commit.encoding is not None: | ||||
extra_headers.append((b"encoding", commit.encoding)) | extra_headers.append((b"encoding", commit.encoding)) | ||||
Show All 40 Lines | |||||
DULWICH_OBJECT_TYPES = { | DULWICH_OBJECT_TYPES = { | ||||
b"blob": ObjectType.CONTENT, | b"blob": ObjectType.CONTENT, | ||||
b"tree": ObjectType.DIRECTORY, | b"tree": ObjectType.DIRECTORY, | ||||
b"commit": ObjectType.REVISION, | b"commit": ObjectType.REVISION, | ||||
b"tag": ObjectType.RELEASE, | b"tag": ObjectType.RELEASE, | ||||
} | } | ||||
@check_ids | |||||
def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release: | def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release: | ||||
if obj.type_name != b"tag": | if obj.type_name != b"tag": | ||||
raise ValueError("Argument is not a tag.") | raise ValueError("Argument is not a tag.") | ||||
tag = cast(Tag, obj) | tag = cast(Tag, obj) | ||||
target_type, target = tag.object | target_type, target = tag.object | ||||
if tag.tagger: | if tag.tagger: | ||||
author: Optional[Person] = parse_author(tag.tagger) | author: Optional[Person] = parse_author(tag.tagger) | ||||
Show All 24 Lines |
Why not make check_id return the object, so that you can write `return check_id(Directory…`?