Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/loader.py
# Copyright (C) 2016-2021 The Software Heritage developers | # Copyright (C) 2016-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | |||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import os | import os | ||||
import pickle | import pickle | ||||
import sys | import sys | ||||
from tempfile import SpooledTemporaryFile | from tempfile import SpooledTemporaryFile | ||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Type | from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Type | ||||
▲ Show 20 Lines • Show All 121 Lines • ▼ Show 20 Lines | ): | ||||
self.ignore_history = ignore_history | self.ignore_history = ignore_history | ||||
self.repo_representation = repo_representation | self.repo_representation = repo_representation | ||||
self.pack_size_bytes = pack_size_bytes | self.pack_size_bytes = pack_size_bytes | ||||
self.temp_file_cutoff = temp_file_cutoff | self.temp_file_cutoff = temp_file_cutoff | ||||
# state initialized in fetch_data | # state initialized in fetch_data | ||||
self.remote_refs: Dict[bytes, HexBytes] = {} | self.remote_refs: Dict[bytes, HexBytes] = {} | ||||
self.symbolic_refs: Dict[bytes, HexBytes] = {} | self.symbolic_refs: Dict[bytes, HexBytes] = {} | ||||
self.ref_object_types: Dict[bytes, Optional[TargetType]] = {} | self.ref_object_types: Dict[bytes, Optional[TargetType]] = {} | ||||
self.objects: Dict[bytes, Set[ShaFile]] = {} | |||||
def fetch_pack_from_origin( | def fetch_pack_from_origin( | ||||
self, | self, | ||||
origin_url: str, | origin_url: str, | ||||
base_repo: RepoRepresentation, | base_repo: RepoRepresentation, | ||||
do_activity: Callable[[bytes], None], | do_activity: Callable[[bytes], None], | ||||
) -> FetchPackReturn: | ) -> FetchPackReturn: | ||||
"""Fetch a pack from the origin""" | """Fetch a pack from the origin""" | ||||
▲ Show 20 Lines • Show All 135 Lines • ▼ Show 20 Lines | def fetch_data(self) -> bool: | ||||
self.dumb_fetcher.fetch_object_ids() | self.dumb_fetcher.fetch_object_ids() | ||||
self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) # type: ignore | self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) # type: ignore | ||||
self.symbolic_refs = self.dumb_fetcher.head | self.symbolic_refs = self.dumb_fetcher.head | ||||
else: | else: | ||||
self.pack_buffer = fetch_info.pack_buffer | self.pack_buffer = fetch_info.pack_buffer | ||||
self.pack_size = fetch_info.pack_size | self.pack_size = fetch_info.pack_size | ||||
self.remote_refs = fetch_info.remote_refs | self.remote_refs = fetch_info.remote_refs | ||||
self.symbolic_refs = fetch_info.symbolic_refs | self.symbolic_refs = fetch_info.symbolic_refs | ||||
# Read the pack file once and group objects per type so we can drop the | |||||
# reference early | |||||
self.objects = self.group_objects_per_type() | |||||
self.ref_object_types = {sha1: None for sha1 in self.remote_refs.values()} | self.ref_object_types = {sha1: None for sha1 in self.remote_refs.values()} | ||||
self.log.info( | self.log.info( | ||||
"Listed %d refs for repo %s", | "Listed %d refs for repo %s", | ||||
len(self.remote_refs), | len(self.remote_refs), | ||||
self.origin.url, | self.origin.url, | ||||
extra={ | extra={ | ||||
Show All 24 Lines | def save_data(self) -> None: | ||||
break | break | ||||
f.write(r) | f.write(r) | ||||
self.pack_buffer.seek(0) | self.pack_buffer.seek(0) | ||||
with open(os.path.join(pack_dir, refs_name), "xb") as f: | with open(os.path.join(pack_dir, refs_name), "xb") as f: | ||||
pickle.dump(self.remote_refs, f) | pickle.dump(self.remote_refs, f) | ||||
def group_objects_per_type(self) -> Dict[bytes, Set[ShaFile]]:
    """Group the objects of the fetched packfile by git object type.

    The pack buffer is parsed a single time and every inflated object is kept
    in memory under its ``type_name``, so that per-type iteration no longer has
    to re-read the packfile.

    Returns:
        A mapping whose keys are object type names and whose values are the
        sets of objects (not just their ids) of that type. The mapping is a
        ``defaultdict``: looking up a type absent from the pack yields an empty
        set.

    """
    # Rewind so the pack is parsed from its first byte, wherever a previous
    # reader left the buffer position.
    self.pack_buffer.seek(0)
    pack_data = PackData.from_file(self.pack_buffer, self.pack_size)

    objs: Dict[bytes, Set[ShaFile]] = defaultdict(set)
    for obj in PackInflater.for_pack_data(pack_data):
        objs[obj.type_name].add(obj)
    return objs
def iter_objects(self, object_type: bytes) -> Iterator[ShaFile]:
    """Yield all the objects of type `object_type`.

    With the dumb transport, iteration is delegated to the dumb fetcher.
    Otherwise objects are served from ``self.objects``, the in-memory per-type
    grouping of the packfile.

    Args:
        object_type: type name of the objects to yield (e.g. ``b"blob"``)

    Yields:
        the objects of the requested type; nothing when no object of that type
        is known.

    """
    if self.dumb:
        yield from self.dumb_fetcher.iter_objects(object_type)
    else:
        # Use ``.get`` instead of indexing: a plain dict (the initial value of
        # ``self.objects``) would raise KeyError on a missing type, and a
        # defaultdict would grow a new empty entry on every miss.
        yield from self.objects.get(object_type, ())
def get_contents(self) -> Iterable[BaseContent]: | def get_contents(self) -> Iterable[BaseContent]: | ||||
"""Format the blobs from the git repository as swh contents""" | """Format the blobs from the git repository as swh contents""" | ||||
for raw_obj in self.iter_objects(b"blob"): | for raw_obj in self.iter_objects(b"blob"): | ||||
if raw_obj.id in self.ref_object_types: | if raw_obj.id in self.ref_object_types: | ||||
self.ref_object_types[raw_obj.id] = TargetType.CONTENT | self.ref_object_types[raw_obj.id] = TargetType.CONTENT | ||||
yield converters.dulwich_blob_to_content( | yield converters.dulwich_blob_to_content( | ||||
▲ Show 20 Lines • Show All 151 Lines • Show Last 20 Lines |