diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py --- a/swh/loader/git/dumb.py +++ b/swh/loader/git/dumb.py @@ -8,10 +8,12 @@ from collections import defaultdict import logging import stat +import struct from tempfile import SpooledTemporaryFile from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast from dulwich.client import HttpGitClient +from dulwich.errors import NotGitRepository from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file from urllib3.response import HTTPResponse @@ -177,10 +179,16 @@ def _get_git_object(self, sha: bytes) -> ShaFile: # try to get the object from a pack file first to avoid flooding # git server with numerous HTTP requests - for pack in self.packs: - if sha in pack: - return pack[sha] - # fetch it from object/ directory otherwise + for pack in list(self.packs): + try: + if sha in pack: + return pack[sha] + except (NotGitRepository, struct.error): + # missing (dulwich http client raises NotGitRepository on 404) + # or invalid pack index/content, remove it from global packs list + logger.debug("A pack file is missing or its content is invalid") + self.packs.remove(pack) + # fetch it from objects/ directory otherwise sha_hex = sha.decode() object_path = f"objects/{sha_hex[:2]}/{sha_hex[2:]}" return ShaFile.from_file(self._http_get(object_path)) diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -7,6 +7,7 @@ from http.server import HTTPServer, SimpleHTTPRequestHandler import os import subprocess +from tempfile import SpooledTemporaryFile from threading import Thread from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException @@ -287,6 +288,34 @@ def setup_class(cls): cls.with_pack_files = True + def test_load_with_missing_pack(self, mocker): + """Some dumb git servers might reference a no longer existing pack file + while it is possible to load a repository without it. + """ + + class GitObjectsFetcherMissingPack(dumb.GitObjectsFetcher): + def _http_get(self, path: str) -> SpooledTemporaryFile: + buffer = super()._http_get(path) + if path == "objects/info/packs": + # prepend a non existing pack to the returned packs list + packs = buffer.read().decode("utf-8") + buffer.seek(0) + buffer.write( + ( + "P pack-a70762ba1a901af3a0e76de02fc3a99226842745.pack\n" + + packs + ).encode() + ) + buffer.flush() + buffer.seek(0) + return buffer + + mocker.patch.object(dumb, "GitObjectsFetcher", GitObjectsFetcherMissingPack) + + res = self.loader.load() + + assert res == {"status": "eventful"} + class TestDumbGitLoaderWithoutPack(DumbGitLoaderTestBase): @classmethod