F7437740 · rDLDG Git loader · text/x-diff · 18 KB
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index cb176ca..ea9ccf3 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,329 +1,330 @@
# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert dulwich objects to dictionaries suitable for swh.storage"""
import logging
import re
from typing import Any, Dict, Optional, cast
import attr
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message
from swh.model.hashutil import (
DEFAULT_ALGORITHMS,
MultiHash,
git_object_header,
hash_to_bytes,
hash_to_hex,
)
from swh.model.model import (
BaseContent,
Content,
Directory,
DirectoryEntry,
HashableObject,
ObjectType,
Person,
Release,
Revision,
RevisionType,
SkippedContent,
TargetType,
Timestamp,
TimestampWithTimezone,
)
COMMIT_MODE_MASK = 0o160000
"""Mode/perms of tree entries that point to a commit.
They are normally equal to this mask, but may have more bits set to 1."""
TREE_MODE_MASK = 0o040000
"""Mode/perms of tree entries that point to a tree.
They are normally equal to this mask, but may have more bits set to 1."""
AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P<timestamp>\S+) (?P<timezone>\S+)$")
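# Example: for the (hypothetical) authorship line
#   b"Jane Doe <jane@example.org> 1640995200 +0200"
# this regex captures timestamp b"1640995200" and timezone b"+0200", i.e. the
# raw bytes that dulwich's own parsing would otherwise normalize away.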
logger = logging.getLogger(__name__)
class HashMismatch(Exception):
pass
def check_id(obj: HashableObject) -> None:
real_id = obj.compute_hash()
if obj.id != real_id:
raise HashMismatch(
f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, "
f"got {real_id.hex()}"
)
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
"""Convert a dulwich blob to a Software Heritage content id"""
if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
size = blob.raw_length()
data = blob.as_raw_string()
hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
if hashes["sha1_git"] != blob.sha().digest():
raise HashMismatch(
f"Expected Content hash to be {blob.sha().digest().hex()}, "
f"got {hashes['sha1_git'].hex()}"
)
hashes["length"] = size
return hashes
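# Usage sketch (hypothetical data; Blob.from_string is regular dulwich API):
#   blob = Blob.from_string(b"hello world\n")
#   ids = dulwich_blob_to_content_id(blob)
# `ids` then maps each of DEFAULT_ALGORITHMS ("sha1", "sha1_git", "sha256",
# "blake2s256") to its digest, plus the blob size under "length".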
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
"""Convert a dulwich blob to a Software Heritage content"""
if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
hashes = dulwich_blob_to_content_id(blob)
if max_content_size is not None and hashes["length"] >= max_content_size:
return SkippedContent(
status="absent",
reason="Content too large",
**hashes,
)
else:
return Content(
data=blob.as_raw_string(),
status="visible",
**hashes,
)
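# For instance, with the same hypothetical 12-byte blob,
# dulwich_blob_to_content(blob, max_content_size=8) returns a SkippedContent
# (status "absent", hashes and length only), while the default call returns a
# visible Content that carries the data itself.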
def dulwich_tree_to_directory(obj: ShaFile) -> Directory:
"""Format a tree as a directory"""
if obj.type_name != b"tree":
raise ValueError("Argument is not a tree.")
tree = cast(Tree, obj)
entries = []
for entry in tree.iteritems():
if entry.mode & COMMIT_MODE_MASK == COMMIT_MODE_MASK:
type_ = "rev"
elif entry.mode & TREE_MODE_MASK == TREE_MODE_MASK:
type_ = "dir"
else:
type_ = "file"
entries.append(
DirectoryEntry(
type=type_,
perms=entry.mode,
name=entry.path.replace(
b"/", b"_"
), # '/' is very rare, and invalid in SWH.
target=hash_to_bytes(entry.sha.decode("ascii")),
)
)
dir_ = Directory(
id=tree.sha().digest(),
entries=tuple(entries),
)
if dir_.compute_hash() != dir_.id:
expected_id = dir_.id
actual_id = dir_.compute_hash()
logger.warning(
"Expected directory to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = tree.as_raw_string()
dir_ = attr.evolve(
dir_, raw_manifest=git_object_header("tree", len(raw_string)) + raw_string
)
check_id(dir_)
return dir_
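# Mode classification sketch: a gitlink entry for a submodule has mode
# 0o160000 (matching COMMIT_MODE_MASK, hence type "rev"), a subdirectory has
# mode 0o040000 (TREE_MODE_MASK, type "dir"), and regular or executable files
# (0o100644 / 0o100755) fall through to type "file".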
def parse_author(name_email: bytes) -> Person:
"""Parse an author line"""
return Person.from_fullname(name_email)
def dulwich_tsinfo_to_timestamp(
timestamp,
timezone: int,
timezone_neg_utc: bool,
timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
"""Convert the dulwich timestamp information to a structure compatible with
Software Heritage."""
ts = Timestamp(
seconds=int(timestamp),
microseconds=0,
)
if timezone_bytes is None:
# Failed to parse from the raw manifest, fallback to what Dulwich managed to
# parse.
return TimestampWithTimezone.from_numeric_offset(
timestamp=ts,
offset=timezone // 60,
negative_utc=timezone_neg_utc,
)
else:
return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes)
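# Example: tsinfo (1640995200, 7200, False, b"+0200") keeps the raw b"+0200"
# as offset_bytes; with timezone_bytes=None the numeric fallback is used
# instead and 7200 seconds east of UTC become a 120-minute offset.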
def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
if obj.type_name != b"commit":
raise ValueError("Argument is not a commit.")
commit = cast(Commit, obj)
author_timezone = None
committer_timezone = None
+ assert commit._chunked_text is not None # to keep mypy happy
for (field, value) in _parse_message(commit._chunked_text):
if field == b"author":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
author_timezone = m.group("timezone")
elif field == b"committer":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
committer_timezone = m.group("timezone")
extra_headers = []
if commit.encoding is not None:
extra_headers.append((b"encoding", commit.encoding))
if commit.mergetag:
for mergetag in commit.mergetag:
raw_string = mergetag.as_raw_string()
assert raw_string.endswith(b"\n")
extra_headers.append((b"mergetag", raw_string[:-1]))
if commit.extra:
extra_headers.extend((k, v) for k, v in commit.extra)
if commit.gpgsig:
extra_headers.append((b"gpgsig", commit.gpgsig))
rev = Revision(
id=commit.sha().digest(),
author=parse_author(commit.author),
date=dulwich_tsinfo_to_timestamp(
commit.author_time,
commit.author_timezone,
commit._author_timezone_neg_utc,
author_timezone,
),
committer=parse_author(commit.committer),
committer_date=dulwich_tsinfo_to_timestamp(
commit.commit_time,
commit.commit_timezone,
commit._commit_timezone_neg_utc,
committer_timezone,
),
type=RevisionType.GIT,
directory=bytes.fromhex(commit.tree.decode()),
message=commit.message,
metadata=None,
extra_headers=tuple(extra_headers),
synthetic=False,
parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents),
)
if rev.compute_hash() != rev.id:
expected_id = rev.id
actual_id = rev.compute_hash()
logger.warning(
"Expected revision to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = commit.as_raw_string()
rev = attr.evolve(
rev, raw_manifest=git_object_header("commit", len(raw_string)) + raw_string
)
check_id(rev)
return rev
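# Headers with no dedicated Revision field are carried in extra_headers: a
# commit declaring `encoding ISO-8859-1` contributes (b"encoding",
# b"ISO-8859-1"), and mergetag/gpgsig payloads are kept verbatim, which is
# what lets check_id() recompute the exact original hash.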
DULWICH_TARGET_TYPES = {
b"blob": TargetType.CONTENT,
b"tree": TargetType.DIRECTORY,
b"commit": TargetType.REVISION,
b"tag": TargetType.RELEASE,
}
DULWICH_OBJECT_TYPES = {
b"blob": ObjectType.CONTENT,
b"tree": ObjectType.DIRECTORY,
b"commit": ObjectType.REVISION,
b"tag": ObjectType.RELEASE,
}
def dulwich_tag_to_release(obj: ShaFile) -> Release:
if obj.type_name != b"tag":
raise ValueError("Argument is not a tag.")
tag = cast(Tag, obj)
tagger_timezone = None
# FIXME: _parse_message is a private function from Dulwich.
for (field, value) in _parse_message(tag.as_raw_chunks()):
if field == b"tagger":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
tagger_timezone = m.group("timezone")
target_type, target = tag.object
if tag.tagger:
author: Optional[Person] = parse_author(tag.tagger)
if tag.tag_time is None:
date = None
else:
date = dulwich_tsinfo_to_timestamp(
tag.tag_time,
tag.tag_timezone,
tag._tag_timezone_neg_utc,
tagger_timezone,
)
else:
author = date = None
message = tag.message
if tag.signature:
message += tag.signature
rel = Release(
id=tag.sha().digest(),
author=author,
date=date,
name=tag.name,
target=bytes.fromhex(target.decode()),
target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
message=message,
metadata=None,
synthetic=False,
)
if rel.compute_hash() != rel.id:
expected_id = rel.id
actual_id = rel.compute_hash()
logger.warning(
"Expected release to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = tag.as_raw_string()
rel = attr.evolve(
rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string
)
check_id(rel)
return rel
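# Usage sketch tying the converters together (hypothetical local clone;
# dulwich.repo.Repo and item lookup are regular dulwich API):
#   from dulwich.repo import Repo
#   repo = Repo("/tmp/some-clone")
#   commit = repo[repo.head()]
#   rev = dulwich_commit_to_revision(commit)
#   root = dulwich_tree_to_directory(repo[commit.tree])
# Annotated tags go through dulwich_tag_to_release, with the target type
# resolved via DULWICH_OBJECT_TYPES.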
diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py
index c34c19b..35826e9 100644
--- a/swh/loader/git/dumb.py
+++ b/swh/loader/git/dumb.py
@@ -1,204 +1,204 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
from collections import defaultdict
import logging
import stat
import struct
from tempfile import SpooledTemporaryFile
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast
import urllib.parse
from dulwich.errors import NotGitRepository
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
import requests
from swh.loader.git.utils import HexBytes
if TYPE_CHECKING:
from .loader import RepoRepresentation
logger = logging.getLogger(__name__)
HEADERS = {"User-Agent": "Software Heritage dumb Git loader"}
def check_protocol(repo_url: str) -> bool:
"""Checks if a git repository can be cloned using the dumb protocol.
Args:
repo_url: Base URL of a git repository
Returns:
Whether the dumb protocol is supported.
"""
if not repo_url.startswith("http"):
return False
url = urllib.parse.urljoin(
    repo_url.rstrip("/") + "/", "info/refs?service=git-upload-pack"
)
logger.debug("Fetching %s", url)
response = requests.get(url, headers=HEADERS)
content_type = response.headers.get("Content-Type")
return (
response.status_code
in (
200,
304,
)
# header is not mandatory in protocol specification
and (content_type is None or not content_type.startswith("application/x-git-"))
)
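# A smart HTTP server answers this probe with Content-Type
# "application/x-git-upload-pack-advertisement"; any "application/x-git-"
# prefix therefore rules the dumb protocol out, while a missing or generic
# content type is taken as dumb-protocol support.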
class GitObjectsFetcher:
"""Git objects fetcher using dumb HTTP protocol.
Fetches a set of git objects for a repository according to its archival
state by Software Heritage and provides iterators on them.
Args:
repo_url: Base URL of a git repository
base_repo: State of repository archived by Software Heritage
"""
def __init__(self, repo_url: str, base_repo: RepoRepresentation):
self._session = requests.Session()
self.repo_url = repo_url
self.base_repo = base_repo
self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
self.refs = self._get_refs()
self.head = self._get_head() if self.refs else {}
self.packs = self._get_packs()
def fetch_object_ids(self) -> None:
"""Fetches identifiers of git objects to load into the archive."""
wants = self.base_repo.determine_wants(self.refs)
# process refs
commit_objects = []
for ref in wants:
ref_object = self._get_git_object(ref)
- if ref_object.get_type() == Commit.type_num:
+ if ref_object.type_num == Commit.type_num:
commit_objects.append(cast(Commit, ref_object))
self.objects[b"commit"].add(ref)
else:
self.objects[b"tag"].add(ref)
# perform DFS on commits graph
while commit_objects:
commit = commit_objects.pop()
# fetch tree and blob ids recursively
self._fetch_tree_objects(commit.tree)
for parent in commit.parents:
if (
# commit not already seen in the current load
parent not in self.objects[b"commit"]
# commit not already archived by a previous load
and parent not in self.base_repo.heads
):
commit_objects.append(cast(Commit, self._get_git_object(parent)))
self.objects[b"commit"].add(parent)
def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
"""Returns a generator on fetched git objects per type.
Args:
object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"
Returns:
A generator fetching git objects on the fly.
"""
return map(self._get_git_object, self.objects[object_type])
def _http_get(self, path: str) -> SpooledTemporaryFile:
url = urllib.parse.urljoin(self.repo_url.rstrip("/") + "/", path)
logger.debug("Fetching %s", url)
response = self._session.get(url, headers=HEADERS)
buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
buffer.write(chunk)
buffer.flush()
buffer.seek(0)
return buffer
def _get_refs(self) -> Dict[bytes, HexBytes]:
refs = {}
refs_resp_bytes = self._http_get("info/refs")
for ref_line in refs_resp_bytes.readlines():
ref_target, ref_name = ref_line.replace(b"\n", b"").split(b"\t")
refs[ref_name] = ref_target
return refs
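# Hypothetical info/refs payload, one "<hex sha>\t<ref name>" pair per line:
#   b"be8f...\trefs/heads/master\n"
# which is why each line is split on b"\t" after stripping the newline.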
def _get_head(self) -> Dict[bytes, HexBytes]:
head_resp_bytes = self._http_get("HEAD")
_, head_target = head_resp_bytes.readline().replace(b"\n", b"").split(b" ")
return {b"HEAD": head_target}
def _get_pack_data(self, pack_name: str) -> Callable[[], PackData]:
def _pack_data() -> PackData:
pack_data_bytes = self._http_get(f"objects/pack/{pack_name}")
return PackData(pack_name, file=pack_data_bytes)
return _pack_data
def _get_pack_idx(self, pack_idx_name: str) -> Callable[[], PackIndex]:
def _pack_idx() -> PackIndex:
pack_idx_bytes = self._http_get(f"objects/pack/{pack_idx_name}")
return load_pack_index_file(pack_idx_name, pack_idx_bytes)
return _pack_idx
def _get_packs(self) -> List[Pack]:
packs = []
packs_info_bytes = self._http_get("objects/info/packs")
packs_info = packs_info_bytes.read().decode()
for pack_info in packs_info.split("\n"):
if pack_info:
pack_name = pack_info.split(" ")[1]
pack_idx_name = pack_name.replace(".pack", ".idx")
# pack index and data file will be lazily fetched when required
packs.append(
Pack.from_lazy_objects(
self._get_pack_data(pack_name),
self._get_pack_idx(pack_idx_name),
)
)
return packs
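# objects/info/packs lists one line per pack, e.g. "P pack-be8f....pack";
# splitting on " " and taking index 1 yields the pack file name, and the
# companion index is the same name with a .idx suffix.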
def _get_git_object(self, sha: bytes) -> ShaFile:
# try to get the object from a pack file first to avoid flooding
# git server with numerous HTTP requests
for pack in list(self.packs):
try:
if sha in pack:
return pack[sha]
except (NotGitRepository, struct.error):
# missing (dulwich http client raises NotGitRepository on 404)
# or invalid pack index/content, remove it from global packs list
logger.debug("A pack file is missing or its content is invalid")
self.packs.remove(pack)
# fetch it from objects/ directory otherwise
sha_hex = sha.decode()
object_path = f"objects/{sha_hex[:2]}/{sha_hex[2:]}"
return ShaFile.from_file(self._http_get(object_path))
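# Loose objects use git's two-character fan-out: a 40-character hex sha like
# "be8f..." resolves to "objects/be/8f...".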
def _fetch_tree_objects(self, sha: bytes) -> None:
if sha not in self.objects[b"tree"]:
tree = cast(Tree, self._get_git_object(sha))
self.objects[b"tree"].add(sha)
for item in tree.items():
if item.mode == S_IFGITLINK:
# skip submodules as objects are not stored in repository
continue
if item.mode & stat.S_IFDIR:
self._fetch_tree_objects(item.sha)
else:
self.objects[b"blob"].add(item.sha)