Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/converters.py
Show All 9 Lines | |||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | ||||
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
Directory, | Directory, | ||||
DirectoryEntry, | DirectoryEntry, | ||||
HashableObject, | |||||
ObjectType, | ObjectType, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
SkippedContent, | SkippedContent, | ||||
TargetType, | TargetType, | ||||
Timestamp, | Timestamp, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} | |||||
class HashMismatch(Exception):
    """Raised when an object's recomputed intrinsic hash differs from the
    id it carries."""
douardda: why not make check_id return the object?
so you can write:
```
return check_id(Directory… | |||||
Done Inline Actionsbecause it's weird and forces line wrapping when constructing Revision vlorentz: because it's weird and forces line wrapping when constructing Revision | |||||
def check_id(obj: HashableObject) -> None:
    """Recompute ``obj``'s intrinsic hash and compare it with the id the
    object carries.

    Raises:
        HashMismatch: if the recomputed hash differs from ``obj.id``.
    """
    expected = obj.id
    actual = obj.compute_hash()
    if expected == actual:
        return
    raise HashMismatch(
        f"Expected {type(obj).__name__} hash to be {expected.hex()}, "
        f"got {actual.hex()}"
    )
Not Done Inline ActionsI have a hard time reading this ^ chunk (line 37 to 45). Does that mean something like, all functions decorated with check_ids must respect the signature (self, obj: ShaFile, log=None) -> _THashable when called upon? (That plus what the decorator actually does, checking for mismatched checksums). Now that I said it, it actually makes sense to me... ardumont: I have a hard time reading this ^ chunk (line 37 to 45).
Does that mean something like, all… | |||||
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id.

    Returns a dict with one entry per hash algorithm in
    ``DEFAULT_ALGORITHMS`` plus a ``length`` entry.

    Raises:
        ValueError: if ``obj`` is not a blob.
        HashMismatch: if the sha1_git recomputed from the blob's raw data
            does not match the hash dulwich reports for it.
    """
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    length = blob.raw_length()
    raw_data = blob.as_raw_string()
    digests = MultiHash.from_data(raw_data, DEFAULT_ALGORITHMS).digest()
    # dulwich already knows the object's sha1_git; recompute and cross-check
    # instead of trusting it blindly.
    reported = blob.sha().digest()
    if digests["sha1_git"] != reported:
        raise HashMismatch(
            f"Expected Content hash to be {reported.hex()}, "
            f"got {digests['sha1_git'].hex()}"
        )
    digests["length"] = length
    return digests
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: | ||||
"""Convert a dulwich blob to a Software Heritage content | """Convert a dulwich blob to a Software Heritage content | ||||
""" | """ | ||||
Show All 29 Lines | for entry in tree.iteritems(): | ||||
DirectoryEntry( | DirectoryEntry( | ||||
type=entry_mode_map.get(entry.mode, "file"), | type=entry_mode_map.get(entry.mode, "file"), | ||||
perms=entry.mode, | perms=entry.mode, | ||||
name=entry.path, | name=entry.path, | ||||
target=hash_to_bytes(entry.sha.decode("ascii")), | target=hash_to_bytes(entry.sha.decode("ascii")), | ||||
) | ) | ||||
) | ) | ||||
return Directory(id=tree.sha().digest(), entries=tuple(entries),) | dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),) | ||||
check_id(dir_) | |||||
return dir_ | |||||
def parse_author(name_email: bytes) -> Person:
    """Parse a raw ``name <email>`` author line into a Person."""
    person = Person.from_fullname(name_email)
    return person
def dulwich_tsinfo_to_timestamp( | def dulwich_tsinfo_to_timestamp( | ||||
Show All 23 Lines | if commit.mergetag: | ||||
extra_headers.append((b"mergetag", raw_string[:-1])) | extra_headers.append((b"mergetag", raw_string[:-1])) | ||||
if commit.extra: | if commit.extra: | ||||
extra_headers.extend((k, v) for k, v in commit.extra) | extra_headers.extend((k, v) for k, v in commit.extra) | ||||
if commit.gpgsig: | if commit.gpgsig: | ||||
extra_headers.append((b"gpgsig", commit.gpgsig)) | extra_headers.append((b"gpgsig", commit.gpgsig)) | ||||
return Revision( | rev = Revision( | ||||
id=commit.sha().digest(), | id=commit.sha().digest(), | ||||
author=parse_author(commit.author), | author=parse_author(commit.author), | ||||
date=dulwich_tsinfo_to_timestamp( | date=dulwich_tsinfo_to_timestamp( | ||||
commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, | commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, | ||||
), | ), | ||||
committer=parse_author(commit.committer), | committer=parse_author(commit.committer), | ||||
committer_date=dulwich_tsinfo_to_timestamp( | committer_date=dulwich_tsinfo_to_timestamp( | ||||
commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, | commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, | ||||
), | ), | ||||
type=RevisionType.GIT, | type=RevisionType.GIT, | ||||
directory=bytes.fromhex(commit.tree.decode()), | directory=bytes.fromhex(commit.tree.decode()), | ||||
message=commit.message, | message=commit.message, | ||||
metadata=None, | metadata=None, | ||||
extra_headers=tuple(extra_headers), | extra_headers=tuple(extra_headers), | ||||
synthetic=False, | synthetic=False, | ||||
parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), | parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), | ||||
) | ) | ||||
check_id(rev) | |||||
return rev | |||||
# Map from a dulwich object type name (as a byte string) to the
# Software Heritage TargetType used when referencing such an object.
DULWICH_TARGET_TYPES = {
    b"blob": TargetType.CONTENT,
    b"tree": TargetType.DIRECTORY,
    b"commit": TargetType.REVISION,
    b"tag": TargetType.RELEASE,
}
Show All 23 Lines | if tag.tagger: | ||||
) | ) | ||||
else: | else: | ||||
author = date = None | author = date = None | ||||
message = tag.message | message = tag.message | ||||
if tag.signature: | if tag.signature: | ||||
message += tag.signature | message += tag.signature | ||||
return Release( | rel = Release( | ||||
id=tag.sha().digest(), | id=tag.sha().digest(), | ||||
author=author, | author=author, | ||||
date=date, | date=date, | ||||
name=tag.name, | name=tag.name, | ||||
target=bytes.fromhex(target.decode()), | target=bytes.fromhex(target.decode()), | ||||
target_type=DULWICH_OBJECT_TYPES[target_type.type_name], | target_type=DULWICH_OBJECT_TYPES[target_type.type_name], | ||||
message=message, | message=message, | ||||
metadata=None, | metadata=None, | ||||
synthetic=False, | synthetic=False, | ||||
) | ) | ||||
check_id(rel) | |||||
return rel |
why not make check_id return the object?
so you can write: