Differential D6281 Diff 22749 swh/loader/git/converters.py

Changeset View

Standalone View

swh/loader/git/converters.py

# Copyright (C) 2015-2020 The Software Heritage developers		# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

"""Convert dulwich objects to dictionaries suitable for swh.storage"""		"""Convert dulwich objects to dictionaries suitable for swh.storage"""

from typing import Any, Dict, Optional, cast		import functools
		from typing import Any, Dict, Generic, Optional, TypeVar, cast

from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree		from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree
		from typing_extensions import Protocol

from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes		from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes
from swh.model.model import (		from swh.model.model import (
BaseContent,		BaseContent,
Content,		Content,
Directory,		Directory,
DirectoryEntry,		DirectoryEntry,
		HashableObject,
ObjectType,		ObjectType,
Person,		Person,
Release,		Release,
Revision,		Revision,
RevisionType,		RevisionType,
SkippedContent,		SkippedContent,
TargetType,		TargetType,
Timestamp,		Timestamp,
TimestampWithTimezone,		TimestampWithTimezone,
)		)

HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"}
		class HashMismatch(Exception):
		pass


		douardda (inline comment, not done): Why not make check_id return the object? Then you could write: return check_id(Directory(id=tree.sha().digest(), entries=tuple(entries),))
		vlorentz (author, inline comment, done): Because it's weird and forces line wrapping when constructing Revision.
		_THashable = TypeVar("_THashable", bound=HashableObject, covariant=True)


		class _ConverterProtocol(Protocol, Generic[_THashable]):
		def __call__(self, obj: ShaFile, log=None) -> _THashable:
		...


		ardumont (inline comment, not done): I have a hard time reading this chunk (lines 37 to 45). Does it mean that all functions decorated with `check_ids` must respect the signature `(self, obj: ShaFile, log=None) -> _THashable` when called? (That, plus what the decorator actually does: checking for mismatched checksums.) Now that I have said it, it actually makes sense to me...
		def check_ids(f: _ConverterProtocol[_THashable]) -> _ConverterProtocol[_THashable]:
		"""Decorator for functions returning a BaseModel object.
		Recomputes these object's id, and errors if they don't match."""

		@functools.wraps(f)
		def newf(args, *kwargs) -> _THashable:
		obj: _THashable = f(args, *kwargs)
		real_id = obj.compute_hash()
		if obj.id != real_id:
		raise HashMismatch(
		f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, "
		f"got {real_id.hex()}"
		)
		return obj

		return newf


def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:		def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
"""Convert a dulwich blob to a Software Heritage content id"""		"""Convert a dulwich blob to a Software Heritage content id"""
if obj.type_name != b"blob":		if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")		raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)		blob = cast(Blob, obj)

size = blob.raw_length()		size = blob.raw_length()
data = blob.as_raw_string()		data = blob.as_raw_string()
hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest()		hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
hashes["sha1_git"] = blob.sha().digest()		if hashes["sha1_git"] != blob.sha().digest():
		raise HashMismatch(
		f"Expected Content hash to be {blob.sha().digest().hex()}, "
		f"got {hashes['sha1_git'].hex()}"
		)
hashes["length"] = size		hashes["length"] = size
return hashes		return hashes


def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:		def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
"""Convert a dulwich blob to a Software Heritage content		"""Convert a dulwich blob to a Software Heritage content

"""		"""
if obj.type_name != b"blob":		if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")		raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)		blob = cast(Blob, obj)

hashes = dulwich_blob_to_content_id(blob)		hashes = dulwich_blob_to_content_id(blob)
if max_content_size is not None and hashes["length"] >= max_content_size:		if max_content_size is not None and hashes["length"] >= max_content_size:
return SkippedContent(status="absent", reason="Content too large", **hashes,)		return SkippedContent(status="absent", reason="Content too large", **hashes,)
else:		else:
return Content(data=blob.as_raw_string(), status="visible", **hashes,)		return Content(data=blob.as_raw_string(), status="visible", **hashes,)


		@check_ids
def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory:		def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory:
"""Format a tree as a directory"""		"""Format a tree as a directory"""
if obj.type_name != b"tree":		if obj.type_name != b"tree":
raise ValueError("Argument is not a tree.")		raise ValueError("Argument is not a tree.")
tree = cast(Tree, obj)		tree = cast(Tree, obj)

entries = []		entries = []

Show All 30 Lines	) -> TimestampWithTimezone:
Software Heritage"""		Software Heritage"""
return TimestampWithTimezone(		return TimestampWithTimezone(
timestamp=Timestamp(seconds=int(timestamp), microseconds=0,),		timestamp=Timestamp(seconds=int(timestamp), microseconds=0,),
offset=timezone // 60,		offset=timezone // 60,
negative_utc=timezone_neg_utc if timezone == 0 else False,		negative_utc=timezone_neg_utc if timezone == 0 else False,
)		)


		@check_ids
def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision:		def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision:
if obj.type_name != b"commit":		if obj.type_name != b"commit":
raise ValueError("Argument is not a commit.")		raise ValueError("Argument is not a commit.")
commit = cast(Commit, obj)		commit = cast(Commit, obj)

extra_headers = []		extra_headers = []
if commit.encoding is not None:		if commit.encoding is not None:
extra_headers.append((b"encoding", commit.encoding))		extra_headers.append((b"encoding", commit.encoding))
Show All 40 Lines
DULWICH_OBJECT_TYPES = {		DULWICH_OBJECT_TYPES = {
b"blob": ObjectType.CONTENT,		b"blob": ObjectType.CONTENT,
b"tree": ObjectType.DIRECTORY,		b"tree": ObjectType.DIRECTORY,
b"commit": ObjectType.REVISION,		b"commit": ObjectType.REVISION,
b"tag": ObjectType.RELEASE,		b"tag": ObjectType.RELEASE,
}		}


		@check_ids
def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release:		def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release:
if obj.type_name != b"tag":		if obj.type_name != b"tag":
raise ValueError("Argument is not a tag.")		raise ValueError("Argument is not a tag.")
tag = cast(Tag, obj)		tag = cast(Tag, obj)

target_type, target = tag.object		target_type, target = tag.object
if tag.tagger:		if tag.tagger:
author: Optional[Person] = parse_author(tag.tagger)		author: Optional[Person] = parse_author(tag.tagger)
Show All 24 Lines