Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/dumb.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | from __future__ import annotations | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import logging | import logging | ||||
import stat | import stat | ||||
import struct | import struct | ||||
from tempfile import SpooledTemporaryFile | from tempfile import SpooledTemporaryFile | ||||
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast | from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast | ||||
import urllib.parse | |||||
from dulwich.client import HttpGitClient | |||||
from dulwich.errors import NotGitRepository | from dulwich.errors import NotGitRepository | ||||
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree | from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree | ||||
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file | from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file | ||||
from urllib3.response import HTTPResponse | import requests | ||||
from swh.loader.git.utils import HexBytes | from swh.loader.git.utils import HexBytes | ||||
if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
from .loader import RepoRepresentation | from .loader import RepoRepresentation | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class DumbHttpGitClient(HttpGitClient): | HEADERS = {"User-Agent": "Software Heritage dumb Git loader"} | ||||
"""Simple wrapper around dulwich.client.HTTPGitClient | |||||
""" | |||||
def __init__(self, base_url: str): | |||||
super().__init__(base_url) | |||||
self.user_agent = "Software Heritage dumb Git loader" | |||||
def get(self, url: str) -> HTTPResponse: | |||||
logger.debug("Fetching %s", url) | |||||
response, _ = self._http_request(url, headers={"User-Agent": self.user_agent}) | |||||
return response | |||||
def check_protocol(repo_url: str) -> bool: | def check_protocol(repo_url: str) -> bool: | ||||
"""Checks if a git repository can be cloned using the dumb protocol. | """Checks if a git repository can be cloned using the dumb protocol. | ||||
Args: | Args: | ||||
repo_url: Base URL of a git repository | repo_url: Base URL of a git repository | ||||
Returns: | Returns: | ||||
Whether the dumb protocol is supported. | Whether the dumb protocol is supported. | ||||
""" | """ | ||||
if not repo_url.startswith("http"): | if not repo_url.startswith("http"): | ||||
return False | return False | ||||
http_client = DumbHttpGitClient(repo_url) | url = urllib.parse.urljoin(repo_url, "info/refs?service=git-upload-pack/") | ||||
url = http_client.get_url("info/refs?service=git-upload-pack") | response = requests.get(url, headers=HEADERS) | ||||
response = http_client.get(url) | content_type = response.headers.get("Content-Type") | ||||
content_type = response.getheader("Content-Type") | |||||
return ( | return ( | ||||
response.status in (200, 304,) | response.status_code in (200, 304,) | ||||
# header is not mandatory in protocol specification | # header is not mandatory in protocol specification | ||||
and content_type is None | and (content_type is None or not content_type.startswith("application/x-git-")) | ||||
or not content_type.startswith("application/x-git-") | |||||
) | ) | ||||
class GitObjectsFetcher: | class GitObjectsFetcher: | ||||
"""Git objects fetcher using dumb HTTP protocol. | """Git objects fetcher using dumb HTTP protocol. | ||||
Fetches a set of git objects for a repository according to its archival | Fetches a set of git objects for a repository according to its archival | ||||
state by Software Heritage and provides iterators on them. | state by Software Heritage and provides iterators on them. | ||||
Args: | Args: | ||||
repo_url: Base URL of a git repository | repo_url: Base URL of a git repository | ||||
base_repo: State of repository archived by Software Heritage | base_repo: State of repository archived by Software Heritage | ||||
""" | """ | ||||
def __init__(self, repo_url: str, base_repo: RepoRepresentation): | def __init__(self, repo_url: str, base_repo: RepoRepresentation): | ||||
self.http_client = DumbHttpGitClient(repo_url) | self.repo_url = repo_url | ||||
self.base_repo = base_repo | self.base_repo = base_repo | ||||
self.objects: Dict[bytes, Set[bytes]] = defaultdict(set) | self.objects: Dict[bytes, Set[bytes]] = defaultdict(set) | ||||
self.refs = self._get_refs() | self.refs = self._get_refs() | ||||
self.head = self._get_head() if self.refs else {} | self.head = self._get_head() if self.refs else {} | ||||
self.packs = self._get_packs() | self.packs = self._get_packs() | ||||
def fetch_object_ids(self) -> None: | def fetch_object_ids(self) -> None: | ||||
"""Fetches identifiers of git objects to load into the archive. | """Fetches identifiers of git objects to load into the archive. | ||||
Show All 32 Lines | def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]: | ||||
object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree" | object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree" | ||||
Returns: | Returns: | ||||
A generator fetching git objects on the fly. | A generator fetching git objects on the fly. | ||||
""" | """ | ||||
return map(self._get_git_object, self.objects[object_type]) | return map(self._get_git_object, self.objects[object_type]) | ||||
def _http_get(self, path: str) -> SpooledTemporaryFile: | def _http_get(self, path: str) -> SpooledTemporaryFile: | ||||
url = self.http_client.get_url(path) | url = urllib.parse.urljoin(self.repo_url, path) | ||||
response = self.http_client.get(url) | response = requests.get(url, headers=HEADERS) | ||||
anlambert: Use a `requests.Session` initialized in the class constructor instead, as numerous requests can be sent with the dumb protocol. | |||||
vlorentz (Author, Unsubmitted, Done): Oh, I assumed there was a default session, but it looks like there isn't. | |||||
buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024) | buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024) | ||||
buffer.write(response.data) | buffer.write(response.content) | ||||
buffer.flush() | buffer.flush() | ||||
buffer.seek(0) | buffer.seek(0) | ||||
return buffer | return buffer | ||||
def _get_refs(self) -> Dict[bytes, HexBytes]: | def _get_refs(self) -> Dict[bytes, HexBytes]: | ||||
refs = {} | refs = {} | ||||
refs_resp_bytes = self._http_get("info/refs") | refs_resp_bytes = self._http_get("info/refs") | ||||
for ref_line in refs_resp_bytes.readlines(): | for ref_line in refs_resp_bytes.readlines(): | ||||
▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines |
Use a requests.Session initialized in the class constructor instead as numerous requests can be sent with the dumb protocol.