diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -17,5 +17,8 @@
 [mypy-pytest.*]
 ignore_missing_imports = True
 
+[mypy-urllib3.*]
+ignore_missing_imports = True
+
 [mypy-swh.loader.*]
 ignore_missing_imports = True
diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/git/dumb.py
@@ -0,0 +1,210 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from __future__ import annotations
+
+from collections import defaultdict
+import logging
+import stat
+from tempfile import SpooledTemporaryFile
+from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast
+
+from dulwich.client import HttpGitClient
+from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
+from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
+from urllib3.response import HTTPResponse
+
+if TYPE_CHECKING:
+    from .loader import RepoRepresentation
+
+logger = logging.getLogger(__name__)
+
+
+class DumbHttpGitClient(HttpGitClient):
+    """Simple wrapper around dulwich.client.HttpGitClient
+    """
+
+    def __init__(self, base_url: str):
+        super().__init__(base_url)
+        self.user_agent = "Software Heritage dumb Git loader"
+
+    def get(self, url: str) -> HTTPResponse:
+        """Sends a GET request for ``url`` with the loader user agent."""
+        logger.debug("Fetching %s", url)
+        response, _ = self._http_request(url, headers={"User-Agent": self.user_agent})
+        return response
+
+
+def check_protocol(repo_url: str) -> bool:
+    """Checks if a git repository can be cloned using the dumb protocol.
+
+    Args:
+        repo_url: Base URL of a git repository
+
+    Returns:
+        Whether the dumb protocol is supported.
+
+    """
+    if not repo_url.startswith("http"):
+        return False
+    http_client = DumbHttpGitClient(repo_url)
+    url = http_client.get_url("info/refs?service=git-upload-pack")
+    response = http_client.get(url)
+    content_type = response.content_type
+    # the status check must guard both content type alternatives, hence the
+    # explicit parentheses: "and" binds tighter than "or"
+    return response.status in (200, 304) and (
+        # header is not mandatory in protocol specification
+        content_type is None
+        or not content_type.startswith("application/x-git-")
+    )
+
+
+class GitObjectsFetcher:
+    """Git objects fetcher using dumb HTTP protocol.
+
+    Fetches a set of git objects for a repository according to its archival
+    state by Software Heritage and provides iterators on them.
+
+    Args:
+        repo_url: Base URL of a git repository
+        base_repo: State of repository archived by Software Heritage
+    """
+
+    def __init__(self, repo_url: str, base_repo: RepoRepresentation):
+        self.http_client = DumbHttpGitClient(repo_url)
+        self.base_repo = base_repo
+        self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
+        self.refs = self._get_refs()
+        self.head = self._get_head()
+        self.packs = self._get_packs()
+
+    def fetch_object_ids(self) -> None:
+        """Fetches identifiers of git objects to load into the archive.
+        """
+        wants = self.base_repo.determine_wants(self.refs)
+
+        # process refs
+        commit_objects = []
+        for ref in wants:
+            ref_object = self._get_git_object(ref)
+            if ref_object.get_type() == Commit.type_num:
+                commit_objects.append(cast(Commit, ref_object))
+                self.objects[b"commit"].add(ref)
+            else:
+                self.objects[b"tag"].add(ref)
+
+        # perform DFS on commits graph
+        while commit_objects:
+            commit = commit_objects.pop()
+            # fetch tree and blob ids recursively
+            self._fetch_tree_objects(commit.tree)
+            for parent in commit.parents:
+                if (
+                    # commit not already seen in the current load
+                    parent not in self.objects[b"commit"]
+                    # commit not already archived by a previous load
+                    and parent not in self.base_repo.heads
+                ):
+                    commit_objects.append(cast(Commit, self._get_git_object(parent)))
+                    self.objects[b"commit"].add(parent)
+
+    def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
+        """Returns a generator on fetched git objects per type.
+
+        Args:
+            object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"
+
+        Returns:
+            A generator fetching git objects on the fly.
+        """
+        return map(self._get_git_object, self.objects[object_type])
+
+    def _http_get(self, path: str) -> SpooledTemporaryFile:
+        """Fetches ``path`` relative to the repository URL into a rewound buffer."""
+        url = self.http_client.get_url(path)
+        response = self.http_client.get(url)
+        buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+        buffer.write(response.data)
+        buffer.flush()
+        buffer.seek(0)
+        return buffer
+
+    def _get_refs(self) -> Dict[bytes, bytes]:
+        """Parses the info/refs file into a mapping of ref name to target."""
+        refs = {}
+        refs_resp_bytes = self._http_get("info/refs")
+        for ref_line in refs_resp_bytes.readlines():
+            ref_target, ref_name = ref_line.replace(b"\n", b"").split(b"\t")
+            refs[ref_name] = ref_target
+        return refs
+
+    def _get_head(self) -> Dict[bytes, bytes]:
+        """Parses the HEAD file, expected to hold a ``ref: <ref name>`` line."""
+        head_resp_bytes = self._http_get("HEAD")
+        _, head_target = head_resp_bytes.readline().replace(b"\n", b"").split(b" ")
+        return {b"HEAD": head_target}
+
+    def _get_pack_data(self, pack_name: str) -> Callable[[], PackData]:
+        """Returns a callable fetching and loading a pack data file on demand."""
+
+        def _pack_data() -> PackData:
+            pack_data_bytes = self._http_get(f"objects/pack/{pack_name}")
+            return PackData(pack_name, file=pack_data_bytes)
+
+        return _pack_data
+
+    def _get_pack_idx(self, pack_idx_name: str) -> Callable[[], PackIndex]:
+        """Returns a callable fetching and loading a pack index file on demand."""
+
+        def _pack_idx() -> PackIndex:
+            pack_idx_bytes = self._http_get(f"objects/pack/{pack_idx_name}")
+            return load_pack_index_file(pack_idx_name, pack_idx_bytes)
+
+        return _pack_idx
+
+    def _get_packs(self) -> List[Pack]:
+        """Lists packs advertised in objects/info/packs as lazily loaded objects."""
+        packs = []
+        packs_info_bytes = self._http_get("objects/info/packs")
+        packs_info = packs_info_bytes.read().decode()
+        for pack_info in packs_info.split("\n"):
+            if pack_info:
+                pack_name = pack_info.split(" ")[1]
+                pack_idx_name = pack_name.replace(".pack", ".idx")
+                # pack index and data file will be lazily fetched when required
+                packs.append(
+                    Pack.from_lazy_objects(
+                        self._get_pack_data(pack_name),
+                        self._get_pack_idx(pack_idx_name),
+                    )
+                )
+        return packs
+
+    def _get_git_object(self, sha: bytes) -> ShaFile:
+        """Returns the git object identified by the hexadecimal ``sha``."""
+        # try to get the object from a pack file first to avoid flooding
+        # git server with numerous HTTP requests
+        for pack in self.packs:
+            if sha in pack:
+                return pack[sha]
+        # fetch it from object storage otherwise
+        sha_hex = sha.decode()
+        object_path = f"objects/{sha_hex[:2]}/{sha_hex[2:]}"
+        return ShaFile.from_file(self._http_get(object_path))
+
+    def _fetch_tree_objects(self, sha: bytes) -> None:
+        """Recursively records ids of the tree ``sha``, its sub-trees and blobs."""
+        if sha not in self.objects[b"tree"]:
+            tree = cast(Tree, self._get_git_object(sha))
+            self.objects[b"tree"].add(sha)
+            for item in tree.items():
+                # skip submodules
+                if item.mode == S_IFGITLINK:
+                    continue
+                if item.mode & stat.S_IFDIR:
+                    self._fetch_tree_objects(item.sha)
+                else:
+                    self.objects[b"blob"].add(item.sha)
diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -34,7 +34,7 @@
 from swh.storage.algos.snapshot import snapshot_get_latest
 from swh.storage.interface import StorageInterface
 
-from . import converters, utils
+from . import converters, dumb, utils
 
 logger = logging.getLogger(__name__)
 
@@ -143,17 +143,12 @@
     def fetch_pack_from_origin(
         self,
         origin_url: str,
-        base_snapshot: Optional[Snapshot],
+        base_repo: RepoRepresentation,
         do_activity: Callable[[bytes], None],
     ) -> FetchPackReturn:
         """Fetch a pack from the origin"""
-        pack_buffer = SpooledTemporaryFile(max_size=self.temp_file_cutoff)
 
-        base_repo = self.repo_representation(
-            storage=self.storage,
-            base_snapshot=base_snapshot,
-            ignore_history=self.ignore_history,
-        )
+        pack_buffer = SpooledTemporaryFile(max_size=self.temp_file_cutoff)
 
         # Hardcode the use of the tcp transport (for GitHub origins)
 
@@ -207,6 +202,11 @@
 
         logger.debug("Fetched pack size: %s", pack_size)
 
+        # check if repository only supports git dumb transfer protocol,
+        # fetched pack file will be empty in that case as dulwich does
+        # not support it and does not fetch any refs
+        self.dumb = transport_url.startswith("http") and client.dumb
+
         return FetchPackReturn(
             remote_refs=utils.filter_refs(remote_refs),
             symbolic_refs=utils.filter_refs(symbolic_refs),
@@ -242,13 +242,19 @@
     def fetch_data(self) -> bool:
         assert self.origin is not None
 
+        base_repo = self.repo_representation(
+            storage=self.storage,
+            base_snapshot=self.base_snapshot,
+            ignore_history=self.ignore_history,
+        )
+
         def do_progress(msg: bytes) -> None:
             sys.stderr.buffer.write(msg)
             sys.stderr.flush()
 
         try:
             fetch_info = self.fetch_pack_from_origin(
-                self.origin.url, self.base_snapshot, do_progress
+                self.origin.url, base_repo, do_progress
             )
         except NotGitRepository as e:
             raise NotFound(e)
@@ -264,15 +270,29 @@
                 raise NotFound(e)
             # otherwise transmit the error
             raise
+        except (AttributeError, NotImplementedError, ValueError):
+            # with old dulwich versions, those exceptions types can be raised
+            # by the fetch_pack operation when encountering a repository with
+            # dumb transfer protocol so we check if the repository supports it
+            # here to continue the loading if it is the case
+            self.dumb = dumb.check_protocol(self.origin_url)
+            if not self.dumb:
+                raise
+
+        if self.dumb:
+            logger.debug("Fetching objects with HTTP dumb transfer protocol")
+            self.dumb_fetcher = dumb.GitObjectsFetcher(self.origin_url, base_repo)
+            self.dumb_fetcher.fetch_object_ids()
+            self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs)
+            self.symbolic_refs = self.dumb_fetcher.head
+        else:
+            self.pack_buffer = fetch_info.pack_buffer
+            self.pack_size = fetch_info.pack_size
+            self.remote_refs = fetch_info.remote_refs
+            self.symbolic_refs = fetch_info.symbolic_refs
 
-        self.pack_buffer = fetch_info.pack_buffer
-        self.pack_size = fetch_info.pack_size
-
-        self.remote_refs = fetch_info.remote_refs
         self.ref_object_types = {sha1: None for sha1 in self.remote_refs.values()}
 
-        self.symbolic_refs = fetch_info.symbolic_refs
-
         self.log.info(
             "Listed %d refs for repo %s" % (len(self.remote_refs), self.origin.url),
             extra={
@@ -310,13 +330,16 @@
 
     def iter_objects(self, object_type: bytes) -> Iterator[ShaFile]:
         """Read all the objects of type `object_type` from the packfile"""
-        self.pack_buffer.seek(0)
-        for obj in PackInflater.for_pack_data(
-            PackData.from_file(self.pack_buffer, self.pack_size)
-        ):
-            if obj.type_name != object_type:
-                continue
-            yield obj
+        if self.dumb:
+            yield from self.dumb_fetcher.iter_objects(object_type)
+        else:
+            self.pack_buffer.seek(0)
+            for obj in PackInflater.for_pack_data(
+                PackData.from_file(self.pack_buffer, self.pack_size)
+            ):
+                if obj.type_name != object_type:
+                    continue
+                yield obj
 
     def get_contents(self) -> Iterable[BaseContent]:
         """Format the blobs from the git repository as swh contents"""
diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py
--- a/swh/loader/git/tests/test_loader.py
+++ b/swh/loader/git/tests/test_loader.py
@@ -3,9 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from functools import partial
+from http.server import HTTPServer, SimpleHTTPRequestHandler
 import os
+import subprocess
+from threading import Thread
 
 from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException
+from dulwich.porcelain import push
 import dulwich.repo
 import pytest
 
@@ -117,3 +122,93 @@
         base_url = f"base://{self.repo_url}"
         self.loader = GitLoader(swh_storage, self.repo_url, base_url=base_url)
         self.repo = dulwich.repo.Repo(self.destination_path)
+
+
+class TestDumbGitLoader(FullGitLoaderTests):
+    """Prepare a git repository to be loaded using the HTTP dumb transfer protocol.
+    """
+
+    @pytest.fixture(autouse=True)
+    def init(self, swh_storage, datadir, tmp_path):
+        # remove any proxy settings in order to successfully spawn a local HTTP server
+        http_proxy = os.environ.get("http_proxy")
+        https_proxy = os.environ.get("https_proxy")
+        if http_proxy:
+            del os.environ["http_proxy"]
+        if https_proxy:
+            del os.environ["https_proxy"]
+
+        # prepare test base repository using smart transfer protocol
+        archive_name = "testrepo"
+        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+        tmp_path = str(tmp_path)
+        base_repo_url = prepare_repository_from_archive(
+            archive_path, archive_name, tmp_path=tmp_path
+        )
+        destination_path = os.path.join(tmp_path, archive_name)
+        self.destination_path = destination_path
+
+        # create a bare clone of that repository in another folder
+        bare_repo_path = os.path.join(tmp_path, archive_name + "_bare")
+        subprocess.run(
+            ["git", "clone", "--bare", base_repo_url, bare_repo_path], check=True,
+        )
+
+        # spawn local HTTP server that will serve the bare repository files
+        hostname = "localhost"
+        handler = partial(SimpleHTTPRequestHandler, directory=bare_repo_path)
+        httpd = HTTPServer((hostname, 0), handler, bind_and_activate=True)
+
+        def serve_forever(httpd):
+            with httpd:
+                httpd.serve_forever()
+
+        thread = Thread(target=serve_forever, args=(httpd,))
+        thread.start()
+
+        repo = dulwich.repo.Repo(self.destination_path)
+
+        class DumbGitLoaderTest(GitLoader):
+            def load(self):
+                """
+                Override load method to ensure the bare repository will be synchronized
+                with the base one as tests can modify its content.
+                """
+                # ensure HEAD ref will be the same for both repositories
+                with open(os.path.join(bare_repo_path, "HEAD"), "wb") as fw:
+                    with open(os.path.join(destination_path, ".git/HEAD"), "rb") as fr:
+                        head_ref = fr.read()
+                        fw.write(head_ref)
+
+                # push possibly modified refs in the base repository to the bare one
+                for ref in repo.refs.allkeys():
+                    if ref != b"HEAD" or head_ref in repo.refs:
+                        push(
+                            repo,
+                            remote_location=f"file://{bare_repo_path}",
+                            refspecs=ref,
+                        )
+
+                # generate or update the info/refs file used in dumb protocol
+                subprocess.run(
+                    ["git", "-C", bare_repo_path, "update-server-info"], check=True,
+                )
+
+                return super().load()
+
+        # bare repository with dumb protocol only URL
+        self.repo_url = f"http://{httpd.server_name}:{httpd.server_port}"
+        self.loader = DumbGitLoaderTest(swh_storage, self.repo_url)
+        self.repo = repo
+
+        yield
+
+        # shutdown HTTP server
+        httpd.shutdown()
+        thread.join()
+
+        # restore HTTP proxy settings if any
+        if http_proxy:
+            os.environ["http_proxy"] = http_proxy
+        if https_proxy:
+            os.environ["https_proxy"] = https_proxy