diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 The Software Heritage developers +# Copyright (C) 2016-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,11 +14,13 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type import dulwich.client +from dulwich.errors import GitProtocolError, ObjectFormatException from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater from swh.core.config import merge_configs from swh.loader.core.loader import DVCSLoader +from swh.loader.exception import NotFound from swh.model import hashutil from swh.model.model import ( BaseContent, @@ -235,9 +237,21 @@ sys.stderr.buffer.write(msg) sys.stderr.flush() - fetch_info = self.fetch_pack_from_origin( - self.origin.url, self.base_snapshot, do_progress - ) + try: + fetch_info = self.fetch_pack_from_origin( + self.origin.url, self.base_snapshot, do_progress + ) + except (IOError, ObjectFormatException) as e: + # Problem during the fetch pack communication (e.g. pack file too big, ...) + # https://sentry.softwareheritage.org/share/issue/12a3987c9a3049c881a0ec536e2fc373/ # noqa + raise ValueError(e) + except GitProtocolError as e: + # Failure to communicate in various forms (e.g. repository not found, + # 401, ...)
+ # https://sentry.softwareheritage.org/share/issue/421322f985c648e8a1cd75151a9d3543/ # noqa + # https://sentry.softwareheritage.org/share/issue/42e11d3abdb94632b9eb95e76cf41f4e/ # noqa + # https://sentry.softwareheritage.org/share/issue/8b951b6ab67c4ef28d785959029bc87a/ # noqa + raise NotFound(e) self.pack_buffer = fetch_info.pack_buffer self.pack_size = fetch_info.pack_size diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,15 +6,70 @@ import os from unittest import TestCase +from dulwich.errors import GitProtocolError, ObjectFormatException import dulwich.repo import pytest from swh.loader.git.loader import GitLoader from swh.loader.git.tests.test_from_disk import FullGitLoaderTests -from swh.loader.tests import prepare_repository_from_archive +from swh.loader.tests import assert_last_visit_matches, prepare_repository_from_archive -class GitLoaderTest(TestCase, FullGitLoaderTests): +class CommonGitLoaderNotFound: + @pytest.fixture(autouse=True) + def __inject_fixtures(self, mocker): + """Inject required fixtures in unittest.TestCase class + + """ + self.mocker = mocker + + def test_load_visit_not_found(self): + """Ingesting an unknown url results in a visit with not_found status + + """ + # simulate an initial communication error (e.g. no repository found, ...)
+ mock = self.mocker.patch( + "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" + ) + mock.side_effect = GitProtocolError + + res = self.loader.load() + assert res == {"status": "uneventful"} + + assert_last_visit_matches( + self.loader.storage, + self.repo_url, + status="not_found", + type="git", + snapshot=None, + ) + + def test_load_visit_failure(self): + """A failure during the fetch pack step results in a failed visit + + """ + for failure_exception in [IOError, ObjectFormatException]: + # simulate a fetch communication error after the initial connection server + # error (e.g. IOError, ObjectFormatException, ...) + mock = self.mocker.patch( + "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" + ) + + mock.side_effect = failure_exception + + res = self.loader.load() + assert res == {"status": "failed"} + + assert_last_visit_matches( + self.loader.storage, + self.repo_url, + status="failed", + type="git", + snapshot=None, + ) + + +class GitLoaderTest(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound): """Prepare a git directory repository to be loaded through a GitLoader. This tests all git loader scenario. @@ -34,7 +89,7 @@ self.repo = dulwich.repo.Repo(self.destination_path) -class GitLoader2Test(TestCase, FullGitLoaderTests): +class GitLoader2Test(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound): """Mostly the same loading scenario but with a base-url different than the repo-url. To walk slightly different paths, the end result should stay the same.