Differential D6901 Diff 25021 swh/loader/git/dumb.py

Changeset View

Standalone View

swh/loader/git/dumb.py

# Copyright (C) 2021 The Software Heritage developers		# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

from __future__ import annotations		from __future__ import annotations

from collections import defaultdict		from collections import defaultdict
import logging		import logging
import stat		import stat
import struct		import struct
from tempfile import SpooledTemporaryFile		from tempfile import SpooledTemporaryFile
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast		from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast
		import urllib.parse

from dulwich.client import HttpGitClient
from dulwich.errors import NotGitRepository		from dulwich.errors import NotGitRepository
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree		from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file		from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
from urllib3.response import HTTPResponse		import requests

from swh.loader.git.utils import HexBytes		from swh.loader.git.utils import HexBytes

if TYPE_CHECKING:		if TYPE_CHECKING:
from .loader import RepoRepresentation		from .loader import RepoRepresentation

logger = logging.getLogger(__name__)		logger = logging.getLogger(__name__)


# HTTP headers passed to the requests.get() calls of this module; they only
# set the User-Agent identifying the Software Heritage dumb Git loader.
HEADERS: Dict[str, str] = {"User-Agent": "Software Heritage dumb Git loader"}


def check_protocol(repo_url: str) -> bool:
    """Checks if a git repository can be cloned using the dumb protocol.

    Sends a GET request for ``info/refs`` (with a ``service`` query parameter)
    below the repository URL and inspects the response status code and its
    ``Content-Type`` header: a content type starting with
    ``application/x-git-`` is not considered a dumb protocol answer.

    Args:
        repo_url: Base URL of a git repository

    Returns:
        Whether the dumb protocol is supported.

    """
    if not repo_url.startswith("http"):
        return False
    # urljoin() replaces the last path segment of a base URL that does not end
    # with "/" (".../repo.git" + "info/refs" -> ".../info/refs"), which would
    # drop the repository name, so normalise the base URL first.
    base_url = repo_url.rstrip("/") + "/"
    # NOTE(review): the trailing "/" after the query string is kept exactly as
    # written in the revision under review -- confirm it is intentional.
    url = urllib.parse.urljoin(base_url, "info/refs?service=git-upload-pack/")
    logger.debug("Fetching %s", url)
    response = requests.get(url, headers=HEADERS)
    content_type = response.headers.get("Content-Type")
    return (
        response.status_code in (200, 304)
        # header is not mandatory in protocol specification
        and (content_type is None or not content_type.startswith("application/x-git-"))
    )


class GitObjectsFetcher:		class GitObjectsFetcher:
"""Git objects fetcher using dumb HTTP protocol.		"""Git objects fetcher using dumb HTTP protocol.

Fetches a set of git objects for a repository according to its archival		Fetches a set of git objects for a repository according to its archival
state by Software Heritage and provides iterators on them.		state by Software Heritage and provides iterators on them.

Args:		Args:
repo_url: Base URL of a git repository		repo_url: Base URL of a git repository
base_repo: State of repository archived by Software Heritage		base_repo: State of repository archived by Software Heritage
"""		"""

def __init__(self, repo_url: str, base_repo: RepoRepresentation):
    """Stores the repository parameters, then queries its refs, head and packs.

    Args:
        repo_url: Base URL of a git repository
        base_repo: State of repository archived by Software Heritage
    """
    self.repo_url = repo_url
    self.base_repo = base_repo
    # identifiers of the objects to fetch, grouped under a bytes key
    self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
    self.refs = self._get_refs()
    # the head is only looked up when some refs were found
    if self.refs:
        self.head = self._get_head()
    else:
        self.head = {}
    self.packs = self._get_packs()

def fetch_object_ids(self) -> None:		def fetch_object_ids(self) -> None:
"""Fetches identifiers of git objects to load into the archive.		"""Fetches identifiers of git objects to load into the archive.
Show All 32 Lines	def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"		object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"

Returns:		Returns:
A generator fetching git objects on the fly.		A generator fetching git objects on the fly.
"""		"""
return map(self._get_git_object, self.objects[object_type])		return map(self._get_git_object, self.objects[object_type])

def _http_get(self, path: str) -> SpooledTemporaryFile:
    """Fetches a file of the repository over HTTP into a temporary buffer.

    Args:
        path: Path of the file to fetch, relative to the repository base URL

    Returns:
        A temporary file holding the response body, rewound to its start.
        It is created with a 100 MiB ``max_size`` in-memory threshold.
    """
    # urljoin() replaces the last path segment of a base URL that does not end
    # with "/" (".../repo.git" + "info/refs" -> ".../info/refs"), which would
    # drop the repository name, so normalise the base URL first.
    url = urllib.parse.urljoin(self.repo_url.rstrip("/") + "/", path)
    logger.debug("Fetching %s", url)
    # NOTE(review): as raised in the code review, numerous requests can be sent
    # with the dumb protocol; a requests.Session created once in the class
    # constructor should be used here instead of a one-off requests.get().
    # NOTE(review): the HTTP status code is not checked, so the body of an
    # error response is returned like any other content -- confirm callers
    # cope with that.
    response = requests.get(url, headers=HEADERS)
    buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
    buffer.write(response.content)
    buffer.flush()
    buffer.seek(0)
    return buffer

def _get_refs(self) -> Dict[bytes, HexBytes]:		def _get_refs(self) -> Dict[bytes, HexBytes]:
refs = {}		refs = {}
refs_resp_bytes = self._http_get("info/refs")		refs_resp_bytes = self._http_get("info/refs")
for ref_line in refs_resp_bytes.readlines():		for ref_line in refs_resp_bytes.readlines():
▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines