Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/loader.py
Show First 20 Lines • Show All 83 Lines • ▼ Show 20 Lines | def determine_wants(self, refs: Dict[bytes, HexBytes]) -> List[HexBytes]: | ||||
# Get the remote heads that we want to fetch | # Get the remote heads that we want to fetch | ||||
remote_heads: Set[HexBytes] = set() | remote_heads: Set[HexBytes] = set() | ||||
for ref_name, ref_target in refs.items(): | for ref_name, ref_target in refs.items(): | ||||
if utils.ignore_branch_name(ref_name): | if utils.ignore_branch_name(ref_name): | ||||
continue | continue | ||||
remote_heads.add(ref_target) | remote_heads.add(ref_target) | ||||
return list(remote_heads - local_heads) | logger.debug("local_heads_count=%s", len(local_heads)) | ||||
logger.debug("remote_heads_count=%s", len(remote_heads)) | |||||
wanted_refs = list(remote_heads - local_heads) | |||||
logger.debug("wanted_refs_count=%s", len(wanted_refs)) | |||||
return wanted_refs | |||||
@dataclass | @dataclass | ||||
class FetchPackReturn: | class FetchPackReturn: | ||||
remote_refs: Dict[bytes, HexBytes] | remote_refs: Dict[bytes, HexBytes] | ||||
symbolic_refs: Dict[bytes, HexBytes] | symbolic_refs: Dict[bytes, HexBytes] | ||||
pack_buffer: SpooledTemporaryFile | pack_buffer: SpooledTemporaryFile | ||||
pack_size: int | pack_size: int | ||||
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | ) -> FetchPackReturn: | ||||
# In contrast, the Dulwich TCP transport just gives us the read handle | # In contrast, the Dulwich TCP transport just gives us the read handle | ||||
# on the underlying socket, doing no processing or copying of the bytes. | # on the underlying socket, doing no processing or copying of the bytes. | ||||
# We can interrupt it as soon as we've received too many bytes. | # We can interrupt it as soon as we've received too many bytes. | ||||
transport_url = origin_url | transport_url = origin_url | ||||
if transport_url.startswith("https://github.com/"): | if transport_url.startswith("https://github.com/"): | ||||
transport_url = "git" + transport_url[5:] | transport_url = "git" + transport_url[5:] | ||||
logger.debug("Transport url to communicate with server: %s", transport_url) | |||||
client, path = dulwich.client.get_transport_and_path( | client, path = dulwich.client.get_transport_and_path( | ||||
transport_url, thin_packs=False | transport_url, thin_packs=False | ||||
) | ) | ||||
logger.debug("Client %s to fetch pack at %s", client, path) | |||||
size_limit = self.pack_size_bytes | size_limit = self.pack_size_bytes | ||||
def do_pack(data: bytes) -> None: | def do_pack(data: bytes) -> None: | ||||
cur_size = pack_buffer.tell() | cur_size = pack_buffer.tell() | ||||
would_write = len(data) | would_write = len(data) | ||||
if cur_size + would_write > size_limit: | if cur_size + would_write > size_limit: | ||||
raise IOError( | raise IOError( | ||||
f"Pack file too big for repository {origin_url}, " | f"Pack file too big for repository {origin_url}, " | ||||
Show All 13 Lines | ) -> FetchPackReturn: | ||||
remote_refs = pack_result.refs or {} | remote_refs = pack_result.refs or {} | ||||
symbolic_refs = pack_result.symrefs or {} | symbolic_refs = pack_result.symrefs or {} | ||||
pack_buffer.flush() | pack_buffer.flush() | ||||
pack_size = pack_buffer.tell() | pack_size = pack_buffer.tell() | ||||
pack_buffer.seek(0) | pack_buffer.seek(0) | ||||
logger.debug("Fetched pack size: %s", pack_size) | logger.debug("fetched_pack_size=%s", pack_size) | ||||
# check if repository only supports git dumb transfer protocol, | # check if repository only supports git dumb transfer protocol, | ||||
# fetched pack file will be empty in that case as dulwich do | # fetched pack file will be empty in that case as dulwich do | ||||
# not support it and do not fetch any refs | # not support it and do not fetch any refs | ||||
self.dumb = transport_url.startswith("http") and client.dumb | self.dumb = transport_url.startswith("http") and client.dumb | ||||
return FetchPackReturn( | return FetchPackReturn( | ||||
remote_refs=utils.filter_refs(remote_refs), | remote_refs=utils.filter_refs(remote_refs), | ||||
anlambert: I would rather put that debug log before line 295 | |||||
Done Inline Actions: right, adapted, thanks. ardumont: right, adapted, thanks. | |||||
symbolic_refs=utils.filter_refs(symbolic_refs), | symbolic_refs=utils.filter_refs(symbolic_refs), | ||||
pack_buffer=pack_buffer, | pack_buffer=pack_buffer, | ||||
pack_size=pack_size, | pack_size=pack_size, | ||||
) | ) | ||||
def prepare_origin_visit(self) -> None: | def prepare_origin_visit(self) -> None: | ||||
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) | self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) | ||||
self.origin = Origin(url=self.origin_url) | self.origin = Origin(url=self.origin_url) | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | def fetch_data(self) -> bool: | ||||
# with old dulwich versions, those exceptions types can be raised | # with old dulwich versions, those exceptions types can be raised | ||||
# by the fetch_pack operation when encountering a repository with | # by the fetch_pack operation when encountering a repository with | ||||
# dumb transfer protocol so we check if the repository supports it | # dumb transfer protocol so we check if the repository supports it | ||||
# here to continue the loading if it is the case | # here to continue the loading if it is the case | ||||
self.dumb = dumb.check_protocol(self.origin_url) | self.dumb = dumb.check_protocol(self.origin_url) | ||||
if not self.dumb: | if not self.dumb: | ||||
raise | raise | ||||
logger.debug( | |||||
"Protocol used for communication: %s", "dumb" if self.dumb else "smart" | |||||
) | |||||
if self.dumb: | if self.dumb: | ||||
logger.debug("Fetching objects with HTTP dumb transfer protocol") | |||||
self.dumb_fetcher = dumb.GitObjectsFetcher(self.origin_url, base_repo) | self.dumb_fetcher = dumb.GitObjectsFetcher(self.origin_url, base_repo) | ||||
Not Done Inline Actions: To remove if you apply my previous inline comment. anlambert: To remove if you apply my previous inline comment. | |||||
self.dumb_fetcher.fetch_object_ids() | self.dumb_fetcher.fetch_object_ids() | ||||
self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) # type: ignore | self.remote_refs = utils.filter_refs(self.dumb_fetcher.refs) # type: ignore | ||||
self.symbolic_refs = self.dumb_fetcher.head | self.symbolic_refs = self.dumb_fetcher.head | ||||
else: | else: | ||||
self.pack_buffer = fetch_info.pack_buffer | self.pack_buffer = fetch_info.pack_buffer | ||||
self.pack_size = fetch_info.pack_size | self.pack_size = fetch_info.pack_size | ||||
self.remote_refs = fetch_info.remote_refs | self.remote_refs = fetch_info.remote_refs | ||||
self.symbolic_refs = fetch_info.symbolic_refs | self.symbolic_refs = fetch_info.symbolic_refs | ||||
Show All 38 Lines | def save_data(self) -> None: | ||||
pickle.dump(self.remote_refs, f) | pickle.dump(self.remote_refs, f) | ||||
def iter_objects(self, object_type: bytes) -> Iterator[ShaFile]: | def iter_objects(self, object_type: bytes) -> Iterator[ShaFile]: | ||||
"""Read all the objects of type `object_type` from the packfile""" | """Read all the objects of type `object_type` from the packfile""" | ||||
if self.dumb: | if self.dumb: | ||||
yield from self.dumb_fetcher.iter_objects(object_type) | yield from self.dumb_fetcher.iter_objects(object_type) | ||||
else: | else: | ||||
self.pack_buffer.seek(0) | self.pack_buffer.seek(0) | ||||
count = 0 | |||||
for obj in PackInflater.for_pack_data( | for obj in PackInflater.for_pack_data( | ||||
PackData.from_file(self.pack_buffer, self.pack_size) | PackData.from_file(self.pack_buffer, self.pack_size) | ||||
): | ): | ||||
if obj.type_name != object_type: | if obj.type_name != object_type: | ||||
continue | continue | ||||
yield obj | yield obj | ||||
count += 1 | |||||
logger.debug("packfile_read_count_%s=%s", object_type.decode(), count) | |||||
def get_contents(self) -> Iterable[BaseContent]: | def get_contents(self) -> Iterable[BaseContent]: | ||||
"""Format the blobs from the git repository as swh contents""" | """Format the blobs from the git repository as swh contents""" | ||||
for raw_obj in self.iter_objects(b"blob"): | for raw_obj in self.iter_objects(b"blob"): | ||||
if raw_obj.id in self.ref_object_types: | if raw_obj.id in self.ref_object_types: | ||||
self.ref_object_types[raw_obj.id] = TargetType.CONTENT | self.ref_object_types[raw_obj.id] = TargetType.CONTENT | ||||
yield converters.dulwich_blob_to_content( | yield converters.dulwich_blob_to_content( | ||||
▲ Show 20 Lines • Show All 151 Lines • Show Last 20 Lines |
I would rather put that debug log before line 295