Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/loader.py
Show All 38 Lines | |||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class RepoRepresentation: | class RepoRepresentation: | ||||
"""Repository representation for a Software Heritage origin.""" | """Repository representation for a Software Heritage origin.""" | ||||
def __init__( | def __init__( | ||||
self, storage, base_snapshot: Optional[Snapshot] = None, ignore_history=False | self, | ||||
storage, | |||||
base_snapshot: Optional[Snapshot] = None, | |||||
incremental: bool = True, | |||||
): | ): | ||||
self.storage = storage | self.storage = storage | ||||
self.ignore_history = ignore_history | self.incremental = incremental | ||||
if base_snapshot and not ignore_history: | if base_snapshot and incremental: | ||||
self.base_snapshot: Snapshot = base_snapshot | self.base_snapshot: Snapshot = base_snapshot | ||||
else: | else: | ||||
self.base_snapshot = Snapshot(branches={}) | self.base_snapshot = Snapshot(branches={}) | ||||
self.heads: Set[HexBytes] = set() | self.heads: Set[HexBytes] = set() | ||||
def get_parents(self, commit: bytes) -> List[bytes]: | def get_parents(self, commit: bytes) -> List[bytes]: | ||||
"""This method should return the list of known parents""" | """This method should return the list of known parents""" | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | class GitLoader(DVCSLoader): | ||||
visit_type = "git" | visit_type = "git" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
base_url: Optional[str] = None, | base_url: Optional[str] = None, | ||||
ignore_history: bool = False, | incremental: bool = True, | ||||
repo_representation: Type[RepoRepresentation] = RepoRepresentation, | repo_representation: Type[RepoRepresentation] = RepoRepresentation, | ||||
pack_size_bytes: int = 4 * 1024 * 1024 * 1024, | pack_size_bytes: int = 4 * 1024 * 1024 * 1024, | ||||
temp_file_cutoff: int = 100 * 1024 * 1024, | temp_file_cutoff: int = 100 * 1024 * 1024, | ||||
save_data_path: Optional[str] = None, | save_data_path: Optional[str] = None, | ||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
): | ): | ||||
"""Initialize the bulk updater. | """Initialize the bulk updater. | ||||
Args: | Args: | ||||
repo_representation: swh's repository representation | repo_representation: swh's repository representation | ||||
which is in charge of filtering between known and remote | which is in charge of filtering between known and remote | ||||
data. | data. | ||||
... | |||||
incremental: If True (the default), start from the references of the last | |||||
known snapshot (if any). Otherwise, load the full repository. | |||||
""" | """ | ||||
super().__init__( | super().__init__( | ||||
storage=storage, | storage=storage, | ||||
save_data_path=save_data_path, | save_data_path=save_data_path, | ||||
max_content_size=max_content_size, | max_content_size=max_content_size, | ||||
) | ) | ||||
self.origin_url = url | self.origin_url = url | ||||
self.base_url = base_url | self.base_url = base_url | ||||
self.ignore_history = ignore_history | self.incremental = incremental | ||||
self.repo_representation = repo_representation | self.repo_representation = repo_representation | ||||
self.pack_size_bytes = pack_size_bytes | self.pack_size_bytes = pack_size_bytes | ||||
self.temp_file_cutoff = temp_file_cutoff | self.temp_file_cutoff = temp_file_cutoff | ||||
# state initialized in fetch_data | # state initialized in fetch_data | ||||
self.remote_refs: Dict[bytes, HexBytes] = {} | self.remote_refs: Dict[bytes, HexBytes] = {} | ||||
self.symbolic_refs: Dict[bytes, HexBytes] = {} | self.symbolic_refs: Dict[bytes, HexBytes] = {} | ||||
self.ref_object_types: Dict[bytes, Optional[TargetType]] = {} | self.ref_object_types: Dict[bytes, Optional[TargetType]] = {} | ||||
▲ Show 20 Lines • Show All 82 Lines • ▼ Show 20 Lines | class GitLoader(DVCSLoader): | ||||
def get_full_snapshot(self, origin_url) -> Optional[Snapshot]: | def get_full_snapshot(self, origin_url) -> Optional[Snapshot]: | ||||
return snapshot_get_latest(self.storage, origin_url) | return snapshot_get_latest(self.storage, origin_url) | ||||
def prepare(self) -> None: | def prepare(self) -> None: | ||||
assert self.origin is not None | assert self.origin is not None | ||||
prev_snapshot: Optional[Snapshot] = None | prev_snapshot: Optional[Snapshot] = None | ||||
if not self.ignore_history: | if self.incremental: | ||||
prev_snapshot = self.get_full_snapshot(self.origin.url) | prev_snapshot = self.get_full_snapshot(self.origin.url) | ||||
if self.base_url and prev_snapshot is None: | if self.base_url and prev_snapshot is None: | ||||
base_origin = list(self.storage.origin_get([self.base_url]))[0] | base_origin = list(self.storage.origin_get([self.base_url]))[0] | ||||
if base_origin: | if base_origin: | ||||
prev_snapshot = self.get_full_snapshot(base_origin.url) | prev_snapshot = self.get_full_snapshot(base_origin.url) | ||||
if prev_snapshot is not None: | if prev_snapshot is not None: | ||||
self.base_snapshot = prev_snapshot | self.base_snapshot = prev_snapshot | ||||
else: | else: | ||||
self.base_snapshot = Snapshot(branches={}) | self.base_snapshot = Snapshot(branches={}) | ||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
assert self.origin is not None | assert self.origin is not None | ||||
base_repo = self.repo_representation( | base_repo = self.repo_representation( | ||||
storage=self.storage, | storage=self.storage, | ||||
base_snapshot=self.base_snapshot, | base_snapshot=self.base_snapshot, | ||||
ignore_history=self.ignore_history, | incremental=self.incremental, | ||||
) | ) | ||||
def do_progress(msg: bytes) -> None: | def do_progress(msg: bytes) -> None: | ||||
sys.stderr.buffer.write(msg) | sys.stderr.buffer.write(msg) | ||||
sys.stderr.flush() | sys.stderr.flush() | ||||
try: | try: | ||||
fetch_info = self.fetch_pack_from_origin( | fetch_info = self.fetch_pack_from_origin( | ||||
▲ Show 20 Lines • Show All 237 Lines • ▼ Show 20 Lines | if __name__ == "__main__": | ||||
@click.command() | @click.command() | ||||
@click.option("--origin-url", help="Origin url", required=True) | @click.option("--origin-url", help="Origin url", required=True) | ||||
@click.option("--base-url", default=None, help="Optional Base url") | @click.option("--base-url", default=None, help="Optional Base url") | ||||
@click.option( | @click.option( | ||||
"--ignore-history/--no-ignore-history", | "--ignore-history/--no-ignore-history", | ||||
help="Ignore the repository history", | help="Ignore the repository history", | ||||
default=False, | default=False, | ||||
) | ) | ||||
def main(origin_url: str, base_url: str, ignore_history: bool) -> Dict[str, Any]: | def main(origin_url: str, base_url: str, incremental: bool) -> Dict[str, Any]: | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
storage = get_storage(cls="memory") | storage = get_storage(cls="memory") | ||||
loader = GitLoader( | loader = GitLoader( | ||||
storage, origin_url, base_url=base_url, ignore_history=ignore_history, | storage, origin_url, base_url=base_url, incremental=incremental, | ||||
) | ) | ||||
return loader.load() | return loader.load() | ||||
main() | main() |