Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/converters.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Convert dulwich objects to dictionaries suitable for swh.storage""" | """Convert dulwich objects to dictionaries suitable for swh.storage""" | ||||
from typing import Any, Dict, Optional | from typing import Any, Dict, Optional, cast | ||||
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree | |||||
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
BaseContent, | BaseContent, | ||||
Content, | Content, | ||||
Directory, | Directory, | ||||
DirectoryEntry, | DirectoryEntry, | ||||
ObjectType, | ObjectType, | ||||
Person, | Person, | ||||
Release, | Release, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
SkippedContent, | SkippedContent, | ||||
TargetType, | TargetType, | ||||
Timestamp, | Timestamp, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
# Algorithms computed with MultiHash. "sha1_git" is left out because the
# converters below fill it in from the dulwich object's own sha() digest.
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"}
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
    """Convert a dulwich blob to a Software Heritage content id.

    Args:
        obj: dulwich object whose ``type_name`` must be ``b"blob"``.

    Returns:
        A mapping holding one digest per algorithm in ``HASH_ALGORITHMS``,
        plus ``sha1_git`` (digest of the blob's own sha) and ``length``
        (the blob's raw length).

    Raises:
        ValueError: if ``obj`` is not a blob.
    """
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    raw_size = blob.raw_length()
    content_id = MultiHash.from_data(blob.as_raw_string(), HASH_ALGORITHMS).digest()
    # sha1_git is not part of HASH_ALGORITHMS: take it from the blob itself.
    content_id["sha1_git"] = blob.sha().digest()
    content_id["length"] = raw_size
    return content_id
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
    """Convert a dulwich blob to a Software Heritage content.

    Args:
        obj: dulwich object whose ``type_name`` must be ``b"blob"``.
        max_content_size: optional size threshold; when set, a blob whose
            length is greater than or equal to it is not stored.

    Returns:
        A ``SkippedContent`` (status ``absent``) when the blob reaches the
        size threshold, otherwise a ``Content`` (status ``visible``)
        carrying the blob's raw data.

    Raises:
        ValueError: if ``obj`` is not a blob.
    """
    if obj.type_name != b"blob":
        raise ValueError("Argument is not a blob.")
    blob = cast(Blob, obj)

    hashes = dulwich_blob_to_content_id(blob)
    too_large = max_content_size is not None and hashes["length"] >= max_content_size
    if too_large:
        return SkippedContent(status="absent", reason="Content too large", **hashes)
    return Content(data=blob.as_raw_string(), status="visible", **hashes)
def dulwich_tree_to_directory(tree, log=None) -> Directory: | def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory: | ||||
"""Format a tree as a directory""" | """Format a tree as a directory""" | ||||
if tree.type_name != b"tree": | if obj.type_name != b"tree": | ||||
raise ValueError("Argument is not a tree.") | raise ValueError("Argument is not a tree.") | ||||
tree = cast(Tree, obj) | |||||
entries = [] | entries = [] | ||||
entry_mode_map = { | entry_mode_map = { | ||||
0o040000: "dir", | 0o040000: "dir", | ||||
0o160000: "rev", | 0o160000: "rev", | ||||
0o100644: "file", | 0o100644: "file", | ||||
0o100755: "file", | 0o100755: "file", | ||||
Show All 25 Lines | ) -> TimestampWithTimezone: | ||||
Software Heritage""" | Software Heritage""" | ||||
return TimestampWithTimezone( | return TimestampWithTimezone( | ||||
timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), | timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), | ||||
offset=timezone // 60, | offset=timezone // 60, | ||||
negative_utc=timezone_neg_utc if timezone == 0 else False, | negative_utc=timezone_neg_utc if timezone == 0 else False, | ||||
) | ) | ||||
def dulwich_commit_to_revision(commit, log=None) -> Revision: | def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision: | ||||
if commit.type_name != b"commit": | if obj.type_name != b"commit": | ||||
raise ValueError("Argument is not a commit.") | raise ValueError("Argument is not a commit.") | ||||
commit = cast(Commit, obj) | |||||
extra_headers = [] | extra_headers = [] | ||||
if commit.encoding is not None: | if commit.encoding is not None: | ||||
extra_headers.append((b"encoding", commit.encoding)) | extra_headers.append((b"encoding", commit.encoding)) | ||||
if commit.mergetag: | if commit.mergetag: | ||||
for mergetag in commit.mergetag: | for mergetag in commit.mergetag: | ||||
raw_string = mergetag.as_raw_string() | raw_string = mergetag.as_raw_string() | ||||
assert raw_string.endswith(b"\n") | assert raw_string.endswith(b"\n") | ||||
Show All 36 Lines | |||||
# Maps a git object type name (bytes) to the corresponding ObjectType member.
DULWICH_OBJECT_TYPES = {
    b"blob": ObjectType.CONTENT,
    b"tree": ObjectType.DIRECTORY,
    b"commit": ObjectType.REVISION,
    b"tag": ObjectType.RELEASE,
}
def dulwich_tag_to_release(tag, log=None) -> Release: | def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release: | ||||
if tag.type_name != b"tag": | if obj.type_name != b"tag": | ||||
raise ValueError("Argument is not a tag.") | raise ValueError("Argument is not a tag.") | ||||
tag = cast(Tag, obj) | |||||
target_type, target = tag.object | target_type, target = tag.object | ||||
if tag.tagger: | if tag.tagger: | ||||
author: Optional[Person] = parse_author(tag.tagger) | author: Optional[Person] = parse_author(tag.tagger) | ||||
if not tag.tag_time: | if not tag.tag_time: | ||||
date = None | date = None | ||||
else: | else: | ||||
date = dulwich_tsinfo_to_timestamp( | date = dulwich_tsinfo_to_timestamp( | ||||
Show All 20 Lines |