diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2020 The Software Heritage developers
+# Copyright (C) 2016-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -14,11 +14,13 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type
 
 import dulwich.client
+from dulwich.errors import GitProtocolError, NotGitRepository
 from dulwich.object_store import ObjectStoreGraphWalker
 from dulwich.pack import PackData, PackInflater
 
 from swh.core.config import merge_configs
 from swh.loader.core.loader import DVCSLoader
+from swh.loader.exception import NotFound
 from swh.model import hashutil
 from swh.model.model import (
     BaseContent,
@@ -235,9 +237,27 @@
             sys.stderr.buffer.write(msg)
             sys.stderr.flush()
 
-        fetch_info = self.fetch_pack_from_origin(
-            self.origin.url, self.base_snapshot, do_progress
-        )
+        try:
+            fetch_info = self.fetch_pack_from_origin(
+                self.origin.url, self.base_snapshot, do_progress
+            )
+        except NotGitRepository as e:
+            raise NotFound(e) from e
+        except GitProtocolError as e:
+            # unfortunately, that kind of error is not specific to a not found
+            # scenario... It depends on the value of message within the exception.
+            # Match on str(e) rather than e.args[0]: an exception carrying no
+            # argument or a non-str one must not mask the original error.
+            error_msg = str(e)
+            for msg in [
+                "Repository unavailable",  # e.g DMCA takedown
+                "Repository not found",
+                "unexpected http resp 401",
+            ]:
+                if msg in error_msg:
+                    raise NotFound(e) from e
+            # otherwise transmit the error
+            raise
 
         self.pack_buffer = fetch_info.pack_buffer
         self.pack_size = fetch_info.pack_size
diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py
--- a/swh/loader/git/tests/test_loader.py
+++ b/swh/loader/git/tests/test_loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,15 +6,90 @@
 import os
 from unittest import TestCase
 
+from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException
 import dulwich.repo
 import pytest
 
 from swh.loader.git.loader import GitLoader
 from swh.loader.git.tests.test_from_disk import FullGitLoaderTests
-from swh.loader.tests import prepare_repository_from_archive
+from swh.loader.tests import assert_last_visit_matches, prepare_repository_from_archive
 
 
-class GitLoaderTest(TestCase, FullGitLoaderTests):
+class CommonGitLoaderNotFound:
+    """Mixin checking the visit status when the initial fetch fails.
+
+    The concrete test class must provide ``self.loader`` and ``self.repo_url``.
+
+    """
+
+    @pytest.fixture(autouse=True)
+    def __inject_fixtures(self, mocker):
+        """Inject required fixtures in unittest.TestCase class
+
+        """
+        self.mocker = mocker
+
+    def test_load_visit_not_found(self):
+        """Ingesting an unknown url results in a visit with not_found status
+
+        """
+        for failure_exception in [
+            GitProtocolError("Repository unavailable"),  # e.g DMCA takedown
+            GitProtocolError("Repository not found"),
+            GitProtocolError("unexpected http resp 401"),
+            NotGitRepository("not a git repo"),
+        ]:
+            with self.subTest(failure_exception=failure_exception):
+                # simulate an initial communication error (e.g no repository found, ...)
+                mock = self.mocker.patch(
+                    "swh.loader.git.loader.GitLoader.fetch_pack_from_origin"
+                )
+                mock.side_effect = failure_exception
+
+                res = self.loader.load()
+                assert res == {"status": "uneventful"}
+
+                assert_last_visit_matches(
+                    self.loader.storage,
+                    self.repo_url,
+                    status="not_found",
+                    type="git",
+                    snapshot=None,
+                )
+
+    def test_load_visit_failure(self):
+        """Failing during the fetch pack step results in a failed visit
+
+        """
+        for failure_exception in [
+            IOError,
+            ObjectFormatException,
+            OSError,
+            ValueError,
+            GitProtocolError,
+        ]:
+            with self.subTest(failure_exception=failure_exception):
+                # simulate a fetch communication error after the initial connection
+                # server error (e.g IOError, ObjectFormatException, ...)
+                mock = self.mocker.patch(
+                    "swh.loader.git.loader.GitLoader.fetch_pack_from_origin"
+                )
+
+                mock.side_effect = failure_exception("failure")
+
+                res = self.loader.load()
+                assert res == {"status": "failed"}
+
+                assert_last_visit_matches(
+                    self.loader.storage,
+                    self.repo_url,
+                    status="failed",
+                    type="git",
+                    snapshot=None,
+                )
+
+
+class GitLoaderTest(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound):
     """Prepare a git directory repository to be loaded through a GitLoader.
     This tests all git loader scenario.
 
@@ -34,7 +109,7 @@
         self.repo = dulwich.repo.Repo(self.destination_path)
 
 
-class GitLoader2Test(TestCase, FullGitLoaderTests):
+class GitLoader2Test(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound):
     """Mostly the same loading scenario but with a base-url different than the repo-url.
     To walk slightly different paths, the end result should stay the same.
 