Report unreachable Mercurial origins as "not_found" visits.

hglib's CommandError cannot be deserialized after going through
billiard.Queue, so the clone helper queues a CommandErrorWrapper that only
carries the error output ('err'). The loader turns known "origin unreachable"
messages into NotFound and re-raises any other clone error unchanged.

diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py
--- a/swh/loader/mercurial/loader.py
+++ b/swh/loader/mercurial/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020  The Software Heritage developers
+# Copyright (C) 2017-2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -30,10 +30,12 @@
 import billiard
 from dateutil import parser
 import hglib
+from hglib.error import CommandError
 
 from swh.core.config import merge_configs
 from swh.loader.core.loader import DVCSLoader
 from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.exception import NotFound
 from swh.model import identifiers
 from swh.model.hashutil import (
     DEFAULT_ALGORITHMS,
@@ -74,6 +76,19 @@
 HEAD_POINTER_NAME = b"tip"
 
 
+class CommandErrorWrapper(Exception):
+    """Picklable stand-in for the 'CommandError' exception raised by the
+    underlying hglib library.
+
+    This is needed because billiard.Queue serializes the queued object and,
+    as CommandError doesn't have a constructor without parameters, its
+    deserialization fails. Only the error output ('err') is carried over.
+    """
+
+    def __init__(self, err: Optional[bytes]):
+        self.err = err
+
+
 class CloneTimeoutError(Exception):
     pass
 
@@ -183,6 +198,9 @@
         def do_clone(queue, origin, destination):
             try:
                 result = hglib.clone(source=origin, dest=destination)
+            except CommandError as e:
+                # CommandError cannot be unpickled; queue a picklable wrapper
+                queue.put(CommandErrorWrapper(e.err))
             except BaseException as e:
                 queue.put(e)
             else:
@@ -207,6 +225,7 @@
                 continue
 
         process.join()
+
         if isinstance(result, Exception):
             raise result from None
 
@@ -252,9 +271,19 @@
                     self.clone_timeout,
                 )
 
-                self.clone_with_timeout(
-                    self.log, self.origin_url, self.hgdir, self.clone_timeout
-                )
+                try:
+                    self.clone_with_timeout(
+                        self.log, self.origin_url, self.hgdir, self.clone_timeout
+                    )
+                except CommandErrorWrapper as e:
+                    for msg in [
+                        b"does not appear to be an hg repository",
+                        b"404: Not Found",
+                        b"Name or service not known",
+                    ]:
+                        if e.err and msg in e.err:
+                            raise NotFound(e.args[0]) from None
+                    raise
 
             else:  # local repository
                 self.working_directory = None
@@ -262,6 +291,7 @@
 
             self.bundle_path = os.path.join(self.hgdir, self.bundle_filename)
             self.log.debug("Bundling at %s" % self.bundle_path)
+
             with hglib.open(self.hgdir) as repo:
                 self.heads = self.get_heads(repo)
                 repo.bundle(bytes(self.bundle_path, "utf-8"), all=True, type=b"none-v2")
diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py
--- a/swh/loader/mercurial/tests/test_loader.py
+++ b/swh/loader/mercurial/tests/test_loader.py
@@ -9,6 +9,7 @@
 import time
 
 import hglib
+from hglib.error import CommandError
 import pytest
 
 from swh.loader.tests import (
@@ -231,6 +232,53 @@
     )
 
 
+@pytest.mark.parametrize(
+    "error_msg",
+    [
+        b"does not appear to be an hg repository",
+        b"404: Not Found",
+        b" Name or service not known",
+    ],
+)
+def test_visit_error_with_status_not_found(
+    swh_config, datadir, tmp_path, mocker, error_msg
+):
+    """Not reaching the repo leads to a 'not_found' ingestion status"""
+    mock = mocker.patch("hglib.clone")
+    mock.side_effect = CommandError((), 255, b"", error_msg)
+
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgBundle20Loader(repo_url)
+
+    assert loader.load() == {"status": "uneventful"}
+
+    assert_last_visit_matches(
+        loader.storage, repo_url, status="not_found", type="hg", snapshot=None,
+    )
+
+
+def test_visit_error_with_clone_error(swh_config, datadir, tmp_path, mocker):
+    """Testing failures other than 'not_found'"""
+
+    mock = mocker.patch("hglib.clone")
+    mock.side_effect = CommandError((), 255, b"", b"out of disk space")
+
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgBundle20Loader(repo_url)
+
+    assert loader.load() == {"status": "failed"}
+
+    assert_last_visit_matches(
+        loader.storage, repo_url, status="failed", type="hg", snapshot=None,
+    )
+
+
 def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path):
     """Visit a mercurial repository visit transplant operations within should yield a
     snapshot as well.