F7437740 · rDLDG Git loader · text/x-diff · 18 KB
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index cb176ca..ea9ccf3 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,329 +1,330 @@
# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert dulwich objects to dictionaries suitable for swh.storage"""
import logging
import re
from typing import Any, Dict, Optional, cast
import attr
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, _parse_message
from swh.model.hashutil import (
DEFAULT_ALGORITHMS,
MultiHash,
git_object_header,
hash_to_bytes,
hash_to_hex,
)
from swh.model.model import (
BaseContent,
Content,
Directory,
DirectoryEntry,
HashableObject,
ObjectType,
Person,
Release,
Revision,
RevisionType,
SkippedContent,
TargetType,
Timestamp,
TimestampWithTimezone,
)
COMMIT_MODE_MASK = 0o160000
"""Mode/perms of tree entries that point to a commit.
They are normally equal to this mask, but may have more bits set to 1."""
TREE_MODE_MASK = 0o040000
"""Mode/perms of tree entries that point to a tree.
They are normally equal to this mask, but may have more bits set to 1."""
AUTHORSHIP_LINE_RE = re.compile(rb"^.*> (?P<timestamp>\S+) (?P<timezone>\S+)$")
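# Example: for the (hypothetical) authorship line
#   b"Jane Doe <jane@example.org> 1640995200 +0200"
# this regex captures timestamp b"1640995200" and timezone b"+0200", i.e. the
# raw bytes that dulwich's own parsing would otherwise normalize away.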
logger = logging.getLogger(__name__)
class HashMismatch(Exception):
pass
def check_id(obj: HashableObject) -> None:
real_id = obj.compute_hash()
if obj.id != real_id:
raise HashMismatch(
f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, "
f"got {real_id.hex()}"
)
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
"""Convert a dulwich blob to a Software Heritage content id"""
if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
size = blob.raw_length()
data = blob.as_raw_string()
hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
if hashes["sha1_git"] != blob.sha().digest():
raise HashMismatch(
f"Expected Content hash to be {blob.sha().digest().hex()}, "
f"got {hashes['sha1_git'].hex()}"
)
hashes["length"] = size
return hashes
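# Usage sketch (hypothetical data; Blob.from_string is regular dulwich API):
#   blob = Blob.from_string(b"hello world\n")
#   ids = dulwich_blob_to_content_id(blob)
# `ids` then maps each of DEFAULT_ALGORITHMS ("sha1", "sha1_git", "sha256",
# "blake2s256") to its digest, plus the blob size under "length".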
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
"""Convert a dulwich blob to a Software Heritage content"""
if obj.type_name != b"blob":
raise ValueError("Argument is not a blob.")
blob = cast(Blob, obj)
hashes = dulwich_blob_to_content_id(blob)
if max_content_size is not None and hashes["length"] >= max_content_size:
return SkippedContent(
status="absent",
reason="Content too large",
**hashes,
)
else:
return Content(
data=blob.as_raw_string(),
status="visible",
**hashes,
)
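# For instance, with the same hypothetical 12-byte blob,
# dulwich_blob_to_content(blob, max_content_size=8) returns a SkippedContent
# (status "absent", hashes and length only), while the default call returns a
# visible Content that carries the data itself.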
def dulwich_tree_to_directory(obj: ShaFile) -> Directory:
"""Format a tree as a directory"""
if obj.type_name != b"tree":
raise ValueError("Argument is not a tree.")
tree = cast(Tree, obj)
entries = []
for entry in tree.iteritems():
if entry.mode & COMMIT_MODE_MASK == COMMIT_MODE_MASK:
type_ = "rev"
elif entry.mode & TREE_MODE_MASK == TREE_MODE_MASK:
type_ = "dir"
else:
type_ = "file"
entries.append(
DirectoryEntry(
type=type_,
perms=entry.mode,
name=entry.path.replace(
b"/", b"_"
), # '/' is very rare, and invalid in SWH.
target=hash_to_bytes(entry.sha.decode("ascii")),
)
)
dir_ = Directory(
id=tree.sha().digest(),
entries=tuple(entries),
)
if dir_.compute_hash() != dir_.id:
expected_id = dir_.id
actual_id = dir_.compute_hash()
logger.warning(
"Expected directory to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = tree.as_raw_string()
dir_ = attr.evolve(
dir_, raw_manifest=git_object_header("tree", len(raw_string)) + raw_string
)
check_id(dir_)
return dir_
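# Mode classification sketch: a gitlink entry for a submodule has mode
# 0o160000 (matching COMMIT_MODE_MASK, hence type "rev"), a subdirectory has
# mode 0o040000 (TREE_MODE_MASK, type "dir"), and regular or executable files
# (0o100644 / 0o100755) fall through to type "file".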
def parse_author(name_email: bytes) -> Person:
"""Parse an author line"""
return Person.from_fullname(name_email)
def dulwich_tsinfo_to_timestamp(
timestamp,
timezone: int,
timezone_neg_utc: bool,
timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
"""Convert the dulwich timestamp information to a structure compatible with
Software Heritage."""
ts = Timestamp(
seconds=int(timestamp),
microseconds=0,
)
if timezone_bytes is None:
# Failed to parse from the raw manifest, fallback to what Dulwich managed to
# parse.
return TimestampWithTimezone.from_numeric_offset(
timestamp=ts,
offset=timezone // 60,
negative_utc=timezone_neg_utc,
)
else:
return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes)
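# Example: tsinfo (1640995200, 7200, False, b"+0200") keeps the raw b"+0200"
# as offset_bytes; with timezone_bytes=None the numeric fallback is used
# instead and 7200 seconds east of UTC become a 120-minute offset.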
def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
if obj.type_name != b"commit":
raise ValueError("Argument is not a commit.")
commit = cast(Commit, obj)
author_timezone = None
committer_timezone = None
+ assert commit._chunked_text is not None # to keep mypy happy
for (field, value) in _parse_message(commit._chunked_text):
if field == b"author":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
author_timezone = m.group("timezone")
elif field == b"committer":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
committer_timezone = m.group("timezone")
extra_headers = []
if commit.encoding is not None:
extra_headers.append((b"encoding", commit.encoding))
if commit.mergetag:
for mergetag in commit.mergetag:
raw_string = mergetag.as_raw_string()
assert raw_string.endswith(b"\n")
extra_headers.append((b"mergetag", raw_string[:-1]))
if commit.extra:
extra_headers.extend((k, v) for k, v in commit.extra)
if commit.gpgsig:
extra_headers.append((b"gpgsig", commit.gpgsig))
rev = Revision(
id=commit.sha().digest(),
author=parse_author(commit.author),
date=dulwich_tsinfo_to_timestamp(
commit.author_time,
commit.author_timezone,
commit._author_timezone_neg_utc,
author_timezone,
),
committer=parse_author(commit.committer),
committer_date=dulwich_tsinfo_to_timestamp(
commit.commit_time,
commit.commit_timezone,
commit._commit_timezone_neg_utc,
committer_timezone,
),
type=RevisionType.GIT,
directory=bytes.fromhex(commit.tree.decode()),
message=commit.message,
metadata=None,
extra_headers=tuple(extra_headers),
synthetic=False,
parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents),
)
if rev.compute_hash() != rev.id:
expected_id = rev.id
actual_id = rev.compute_hash()
logger.warning(
"Expected revision to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = commit.as_raw_string()
rev = attr.evolve(
rev, raw_manifest=git_object_header("commit", len(raw_string)) + raw_string
)
check_id(rev)
return rev
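# Headers with no dedicated Revision field are carried in extra_headers: a
# commit declaring `encoding ISO-8859-1` contributes (b"encoding",
# b"ISO-8859-1"), and mergetag/gpgsig payloads are kept verbatim, which is
# what lets check_id() recompute the exact original hash.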
DULWICH_TARGET_TYPES = {
b"blob": TargetType.CONTENT,
b"tree": TargetType.DIRECTORY,
b"commit": TargetType.REVISION,
b"tag": TargetType.RELEASE,
}
DULWICH_OBJECT_TYPES = {
b"blob": ObjectType.CONTENT,
b"tree": ObjectType.DIRECTORY,
b"commit": ObjectType.REVISION,
b"tag": ObjectType.RELEASE,
}
def dulwich_tag_to_release(obj: ShaFile) -> Release:
if obj.type_name != b"tag":
raise ValueError("Argument is not a tag.")
tag = cast(Tag, obj)
tagger_timezone = None
# FIXME: _parse_message is a private function from Dulwich.
for (field, value) in _parse_message(tag.as_raw_chunks()):
if field == b"tagger":
m = AUTHORSHIP_LINE_RE.match(value)
if m:
tagger_timezone = m.group("timezone")
target_type, target = tag.object
if tag.tagger:
author: Optional[Person] = parse_author(tag.tagger)
if tag.tag_time is None:
date = None
else:
date = dulwich_tsinfo_to_timestamp(
tag.tag_time,
tag.tag_timezone,
tag._tag_timezone_neg_utc,
tagger_timezone,
)
else:
author = date = None
message = tag.message
if tag.signature:
message += tag.signature
rel = Release(
id=tag.sha().digest(),
author=author,
date=date,
name=tag.name,
target=bytes.fromhex(target.decode()),
target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
message=message,
metadata=None,
synthetic=False,
)
if rel.compute_hash() != rel.id:
expected_id = rel.id
actual_id = rel.compute_hash()
logger.warning(
"Expected release to have id %s, but got %s. Recording raw_manifest.",
hash_to_hex(expected_id),
hash_to_hex(actual_id),
)
raw_string = tag.as_raw_string()
rel = attr.evolve(
rel, raw_manifest=git_object_header("tag", len(raw_string)) + raw_string
)
check_id(rel)
return rel
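# Usage sketch tying the converters together (hypothetical local clone;
# dulwich.repo.Repo and item lookup are regular dulwich API):
#   from dulwich.repo import Repo
#   repo = Repo("/tmp/some-clone")
#   commit = repo[repo.head()]
#   rev = dulwich_commit_to_revision(commit)
#   root = dulwich_tree_to_directory(repo[commit.tree])
# Annotated tags go through dulwich_tag_to_release, with the target type
# resolved via DULWICH_OBJECT_TYPES.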
diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py
index c34c19b..35826e9 100644
--- a/swh/loader/git/dumb.py
+++ b/swh/loader/git/dumb.py
@@ -1,204 +1,204 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
from collections import defaultdict
import logging
import stat
import struct
from tempfile import SpooledTemporaryFile
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast
import urllib.parse
from dulwich.errors import NotGitRepository
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
import requests
from swh.loader.git.utils import HexBytes
if TYPE_CHECKING:
from .loader import RepoRepresentation
logger = logging.getLogger(__name__)
HEADERS = {"User-Agent": "Software Heritage dumb Git loader"}
def check_protocol(repo_url: str) -> bool:
"""Checks if a git repository can be cloned using the dumb protocol.
Args:
repo_url: Base URL of a git repository
Returns:
Whether the dumb protocol is supported.
"""
if not repo_url.startswith("http"):
return False
url = urllib.parse.urljoin(
    repo_url.rstrip("/") + "/", "info/refs?service=git-upload-pack"
)
logger.debug("Fetching %s", url)
response = requests.get(url, headers=HEADERS)
content_type = response.headers.get("Content-Type")
return (
response.status_code
in (
200,
304,
)
# header is not mandatory in protocol specification
and (content_type is None or not content_type.startswith("application/x-git-"))
)
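# A smart HTTP server answers this probe with Content-Type
# "application/x-git-upload-pack-advertisement"; any "application/x-git-"
# prefix therefore rules the dumb protocol out, while a missing or generic
# content type is taken as dumb-protocol support.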
class GitObjectsFetcher:
"""Git objects fetcher using dumb HTTP protocol.
Fetches a set of git objects for a repository according to its archival
state by Software Heritage and provides iterators on them.
Args:
repo_url: Base URL of a git repository
base_repo: State of repository archived by Software Heritage
"""
def __init__(self, repo_url: str, base_repo: RepoRepresentation):
self._session = requests.Session()
self.repo_url = repo_url
self.base_repo = base_repo
self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
self.refs = self._get_refs()
self.head = self._get_head() if self.refs else {}
self.packs = self._get_packs()
def fetch_object_ids(self) -> None:
"""Fetches identifiers of git objects to load into the archive."""
wants = self.base_repo.determine_wants(self.refs)
# process refs
commit_objects = []
for ref in wants:
ref_object = self._get_git_object(ref)
- if ref_object.get_type() == Commit.type_num:
+ if ref_object.type_num == Commit.type_num:
commit_objects.append(cast(Commit, ref_object))
self.objects[b"commit"].add(ref)
else:
self.objects[b"tag"].add(ref)
# perform DFS on commits graph
while commit_objects:
commit = commit_objects.pop()
# fetch tree and blob ids recursively
self._fetch_tree_objects(commit.tree)
for parent in commit.parents:
if (
# commit not already seen in the current load
parent not in self.objects[b"commit"]
# commit not already archived by a previous load
and parent not in self.base_repo.heads
):
commit_objects.append(cast(Commit, self._get_git_object(parent)))
self.objects[b"commit"].add(parent)
def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
"""Returns a generator on fetched git objects per type.
Args:
object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"
Returns:
A generator fetching git objects on the fly.
"""
return map(self._get_git_object, self.objects[object_type])
def _http_get(self, path: str) -> SpooledTemporaryFile:
url = urllib.parse.urljoin(self.repo_url.rstrip("/") + "/", path)
logger.debug("Fetching %s", url)
response = self._session.get(url, headers=HEADERS)
buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
buffer.write(chunk)
buffer.flush()
buffer.seek(0)
return buffer
def _get_refs(self) -> Dict[bytes, HexBytes]:
refs = {}
refs_resp_bytes = self._http_get("info/refs")
for ref_line in refs_resp_bytes.readlines():
ref_target, ref_name = ref_line.replace(b"\n", b"").split(b"\t")
refs[ref_name] = ref_target
return refs
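# Hypothetical info/refs payload, one "<hex sha>\t<ref name>" pair per line:
#   b"be8f...\trefs/heads/master\n"
# which is why each line is split on b"\t" after stripping the newline.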
def _get_head(self) -> Dict[bytes, HexBytes]:
head_resp_bytes = self._http_get("HEAD")
_, head_target = head_resp_bytes.readline().replace(b"\n", b"").split(b" ")
return {b"HEAD": head_target}
def _get_pack_data(self, pack_name: str) -> Callable[[], PackData]:
def _pack_data() -> PackData:
pack_data_bytes = self._http_get(f"objects/pack/{pack_name}")
return PackData(pack_name, file=pack_data_bytes)
return _pack_data
def _get_pack_idx(self, pack_idx_name: str) -> Callable[[], PackIndex]:
def _pack_idx() -> PackIndex:
pack_idx_bytes = self._http_get(f"objects/pack/{pack_idx_name}")
return load_pack_index_file(pack_idx_name, pack_idx_bytes)
return _pack_idx
def _get_packs(self) -> List[Pack]:
packs = []
packs_info_bytes = self._http_get("objects/info/packs")
packs_info = packs_info_bytes.read().decode()
for pack_info in packs_info.split("\n"):
if pack_info:
pack_name = pack_info.split(" ")[1]
pack_idx_name = pack_name.replace(".pack", ".idx")
# pack index and data file will be lazily fetched when required
packs.append(
Pack.from_lazy_objects(
self._get_pack_data(pack_name),
self._get_pack_idx(pack_idx_name),
)
)
return packs
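# objects/info/packs lists one line per pack, e.g. "P pack-be8f....pack";
# splitting on " " and taking index 1 yields the pack file name, and the
# companion index is the same name with a .idx suffix.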
def _get_git_object(self, sha: bytes) -> ShaFile:
# try to get the object from a pack file first to avoid flooding
# git server with numerous HTTP requests
for pack in list(self.packs):
try:
if sha in pack:
return pack[sha]
except (NotGitRepository, struct.error):
# missing (dulwich http client raises NotGitRepository on 404)
# or invalid pack index/content, remove it from global packs list
logger.debug("A pack file is missing or its content is invalid")
self.packs.remove(pack)
# fetch it from objects/ directory otherwise
sha_hex = sha.decode()
object_path = f"objects/{sha_hex[:2]}/{sha_hex[2:]}"
return ShaFile.from_file(self._http_get(object_path))
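# Loose objects use git's two-character fan-out: a 40-character hex sha like
# "be8f..." resolves to "objects/be/8f...".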
def _fetch_tree_objects(self, sha: bytes) -> None:
if sha not in self.objects[b"tree"]:
tree = cast(Tree, self._get_git_object(sha))
self.objects[b"tree"].add(sha)
for item in tree.items():
if item.mode == S_IFGITLINK:
# skip submodules as objects are not stored in repository
continue
if item.mode & stat.S_IFDIR:
self._fetch_tree_objects(item.sha)
else:
self.objects[b"blob"].add(item.sha)