Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/loader.py
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass | from dataclasses import dataclass | ||||
import datetime | import datetime | ||||
from io import BytesIO | |||||
import logging | import logging | ||||
import os | import os | ||||
import pickle | import pickle | ||||
import sys | import sys | ||||
from tempfile import SpooledTemporaryFile | |||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Type | from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Type | ||||
import dulwich.client | import dulwich.client | ||||
from dulwich.errors import GitProtocolError, NotGitRepository | from dulwich.errors import GitProtocolError, NotGitRepository | ||||
from dulwich.object_store import ObjectStoreGraphWalker | from dulwich.object_store import ObjectStoreGraphWalker | ||||
from dulwich.objects import ShaFile | from dulwich.objects import ShaFile | ||||
from dulwich.pack import PackData, PackInflater | from dulwich.pack import PackData, PackInflater | ||||
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | def determine_wants(self, refs: Dict[bytes, bytes]) -> List[bytes]: | ||||
return list(remote_heads - local_heads) | return list(remote_heads - local_heads) | ||||
@dataclass
class FetchPackReturn:
    """Group of values returned by ``GitLoader.fetch_pack_from_origin``.

    The field set is the one declared in the post-change side of the diff:
    the pack buffer is a ``SpooledTemporaryFile`` rather than a ``BytesIO``.
    """

    # NOTE(review): presumably maps remote ref names to the object ids they
    # point at -- confirm against the code that fills it in.
    remote_refs: Dict[bytes, bytes]
    # NOTE(review): presumably maps symbolic ref names to their target ref
    # names -- confirm against the code that fills it in.
    symbolic_refs: Dict[bytes, bytes]
    # File-like buffer for the pack; fetch_pack_from_origin creates it as
    # SpooledTemporaryFile(max_size=self.temp_file_cutoff).
    pack_buffer: SpooledTemporaryFile
    # NOTE(review): presumably the number of bytes in pack_buffer -- confirm.
    pack_size: int
class GitLoader(DVCSLoader): | class GitLoader(DVCSLoader): | ||||
"""A bulk loader for a git repository""" | """A bulk loader for a git repository""" | ||||
visit_type = "git" | visit_type = "git" | ||||
def __init__(
    self,
    storage: StorageInterface,
    url: str,
    base_url: Optional[str] = None,
    ignore_history: bool = False,
    repo_representation: Type[RepoRepresentation] = RepoRepresentation,
    pack_size_bytes: int = 4 * 1024 * 1024 * 1024,
    temp_file_cutoff: int = 100 * 1024 * 1024,
    save_data_path: Optional[str] = None,
    max_content_size: Optional[int] = None,
):
    """Initialize the bulk updater.

    Args:
        storage: forwarded to the parent loader's constructor.
        url: stored as ``self.origin_url``.
        base_url: stored as ``self.base_url``.
        ignore_history: stored on the instance and later passed to
            ``repo_representation`` when fetching a pack.
        repo_representation: swh's repository representation
            which is in charge of filtering between known and remote
            data.
        pack_size_bytes: stored as ``self.pack_size_bytes``; the default
            is 4 GiB. NOTE(review): presumably an upper bound on the
            size of the fetched pack -- confirm where it is enforced.
        temp_file_cutoff: stored as ``self.temp_file_cutoff`` and used as
            the ``max_size`` of the ``SpooledTemporaryFile`` that buffers
            the fetched pack; the default is 100 MiB.
        save_data_path: forwarded to the parent loader's constructor.
        max_content_size: forwarded to the parent loader's constructor.
    """
    super().__init__(
        storage=storage,
        save_data_path=save_data_path,
        max_content_size=max_content_size,
    )
    self.origin_url = url
    self.base_url = base_url
    self.ignore_history = ignore_history
    self.repo_representation = repo_representation
    self.pack_size_bytes = pack_size_bytes
    self.temp_file_cutoff = temp_file_cutoff

    # state initialized in fetch_data
    self.remote_refs: Dict[bytes, bytes] = {}
    self.symbolic_refs: Dict[bytes, bytes] = {}
    self.ref_object_types: Dict[bytes, Optional[TargetType]] = {}
def fetch_pack_from_origin( | def fetch_pack_from_origin( | ||||
self, | self, | ||||
origin_url: str, | origin_url: str, | ||||
base_snapshot: Optional[Snapshot], | base_snapshot: Optional[Snapshot], | ||||
do_activity: Callable[[bytes], None], | do_activity: Callable[[bytes], None], | ||||
) -> FetchPackReturn: | ) -> FetchPackReturn: | ||||
"""Fetch a pack from the origin""" | """Fetch a pack from the origin""" | ||||
pack_buffer = BytesIO() | pack_buffer = SpooledTemporaryFile(max_size=self.temp_file_cutoff) | ||||
base_repo = self.repo_representation( | base_repo = self.repo_representation( | ||||
storage=self.storage, | storage=self.storage, | ||||
base_snapshot=base_snapshot, | base_snapshot=base_snapshot, | ||||
ignore_history=self.ignore_history, | ignore_history=self.ignore_history, | ||||
) | ) | ||||
# Hardcode the use of the tcp transport (for GitHub origins) | # Hardcode the use of the tcp transport (for GitHub origins) | ||||
▲ Show 20 Lines • Show All 320 Lines • Show Last 20 Lines |