diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py index 84090c8..a7ea51c 100644 --- a/swh/loader/mercurial/loader.py +++ b/swh/loader/mercurial/loader.py @@ -1,637 +1,667 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This document contains a SWH loader for ingesting repository data from Mercurial version 2 bundle files. """ # NOTE: The code here does expensive work twice in places because of the # intermediate need to check for what is missing before sending to the database # and the desire to not juggle very large amounts of data. # TODO: Decide whether to also serialize to disk and read back more quickly # from there. Maybe only for very large repos and fast drives. # - Avi import datetime import os from queue import Empty import random import re from shutil import rmtree from tempfile import mkdtemp import time from typing import Any, Dict, Iterable, List, Optional import billiard from dateutil import parser import hglib +from hglib.error import CommandError from swh.core.config import merge_configs from swh.loader.core.loader import DVCSLoader from swh.loader.core.utils import clean_dangling_folders +from swh.loader.exception import NotFound from swh.model import identifiers from swh.model.hashutil import ( DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_bytes, hash_to_hex, ) from swh.model.model import ( BaseContent, Content, Directory, ObjectType, Origin, Person, Release, Revision, RevisionType, Sha1Git, SkippedContent, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.storage.algos.origin import origin_get_latest_visit_status from . import converters from .archive_extract import tmp_extract from .bundle20_reader import Bundle20Reader from .converters import PRIMARY_ALGO as ALGO from .objects import SelectiveCache, SimpleTree TAG_PATTERN = re.compile("[0-9A-Fa-f]{40}") TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial." HEAD_POINTER_NAME = b"tip" +class CommandErrorWrapper(Exception): + """This exception is raised in place of a 'CommandError' + exception (raised by the underlying hglib library) + + This is needed because billiard.Queue is serializing the + queued object and as CommandError doesn't have a constructor without + parameters, the deserialization is failing + """ + + def __init__(self, err: Optional[bytes]): + self.err = err + + class CloneTimeoutError(Exception): pass DEFAULT_CONFIG: Dict[str, Any] = { "bundle_filename": "HG20_none_bundle", "reduce_effort": False, "temp_directory": "/tmp", "cache1_size": 800 * 1024 * 1024, "cache2_size": 800 * 1024 * 1024, "clone_timeout_seconds": 7200, } class HgBundle20Loader(DVCSLoader): """Mercurial loader able to deal with remote or local repository. 
""" visit_type = "hg" def __init__( self, url, visit_date=None, directory=None, logging_class="swh.loader.mercurial.Bundle20Loader", ): super().__init__(logging_class=logging_class) self.config = merge_configs(DEFAULT_CONFIG, self.config) self.origin_url = url self.visit_date = visit_date self.directory = directory self.bundle_filename = self.config["bundle_filename"] self.reduce_effort_flag = self.config["reduce_effort"] self.empty_repository = None self.temp_directory = self.config["temp_directory"] self.cache1_size = self.config["cache1_size"] self.cache2_size = self.config["cache2_size"] self.clone_timeout = self.config["clone_timeout_seconds"] self.working_directory = None self.bundle_path = None self.heads = {} self.releases = {} self.last_snapshot_id: Optional[bytes] = None def pre_cleanup(self): """Cleanup potential dangling files from prior runs (e.g. OOM killed tasks) """ clean_dangling_folders( self.temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log, ) def cleanup(self): """Clean temporary working directory """ if self.bundle_path and os.path.exists(self.bundle_path): self.log.debug("Cleanup up working bundle %s" % self.bundle_path) os.unlink(self.bundle_path) if self.working_directory and os.path.exists(self.working_directory): self.log.debug( "Cleanup up working directory %s" % (self.working_directory,) ) rmtree(self.working_directory) def get_heads(self, repo): """Read the closed branches heads (branch, bookmarks) and returns a dict with key the branch_name (bytes) and values the tuple (pointer nature (bytes), mercurial's node id (bytes)). Those needs conversion to swh-ids. This is taken care of in get_revisions. """ b = {} for _, node_hash_id, pointer_nature, branch_name, *_ in repo.heads(): b[branch_name] = (pointer_nature, hash_to_bytes(node_hash_id.decode())) bookmarks = repo.bookmarks() if bookmarks and bookmarks[0]: for bookmark_name, _, target_short in bookmarks[0]: target = repo[target_short].node() b[bookmark_name] = (None, hash_to_bytes(target.decode())) return b def prepare_origin_visit(self, *args, **kwargs) -> None: self.origin = Origin(url=self.origin_url) visit_date = self.visit_date if isinstance(visit_date, str): # visit_date can be string or datetime visit_date = parser.parse(visit_date) self.visit_date = visit_date visit_status = origin_get_latest_visit_status( self.storage, self.origin_url, require_snapshot=True ) self.last_snapshot_id = None if visit_status is None else visit_status.snapshot @staticmethod def clone_with_timeout(log, origin, destination, timeout): queue = billiard.Queue() start = time.monotonic() def do_clone(queue, origin, destination): try: result = hglib.clone(source=origin, dest=destination) + except CommandError as e: + # the queued object need an empty constructor to be deserialized later + queue.put(CommandErrorWrapper(e.err)) except BaseException as e: queue.put(e) else: queue.put(result) process = billiard.Process(target=do_clone, args=(queue, origin, destination)) process.start() while True: try: result = queue.get(timeout=0.1) break except Empty: duration = time.monotonic() - start if timeout and duration > timeout: log.warning( "Timeout cloning `%s` within %s seconds", origin, timeout ) process.terminate() process.join() raise CloneTimeoutError(origin, timeout) continue process.join() + if isinstance(result, Exception): raise result from None return result def prepare(self, *args, **kwargs): """Prepare the necessary steps to load an actual remote or local repository. 
To load a local repository, pass the optional directory parameter as filled with a path to a real local folder. To load a remote repository, pass the optional directory parameter as None. Args: origin_url (str): Origin url to load visit_date (str/datetime): Date of the visit directory (str/None): The local directory to load """ self.branches = {} self.tags = [] self.releases = {} self.node_2_rev = {} self.heads = {} directory = self.directory if not directory: # remote repository self.working_directory = mkdtemp( prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix="-%s" % os.getpid(), dir=self.temp_directory, ) os.makedirs(self.working_directory, exist_ok=True) self.hgdir = self.working_directory self.log.debug( "Cloning %s to %s with timeout %s seconds", self.origin_url, self.hgdir, self.clone_timeout, ) - self.clone_with_timeout( - self.log, self.origin_url, self.hgdir, self.clone_timeout - ) + try: + self.clone_with_timeout( + self.log, self.origin_url, self.hgdir, self.clone_timeout + ) + except CommandErrorWrapper as e: + for msg in [ + b"does not appear to be an hg repository", + b"404: Not Found", + b"Name or service not known", + ]: + if msg in e.err: + raise NotFound(e.args[0]) from None + raise e else: # local repository self.working_directory = None self.hgdir = directory self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) self.log.debug("Bundling at %s" % self.bundle_path) + with hglib.open(self.hgdir) as repo: self.heads = self.get_heads(repo) repo.bundle(bytes(self.bundle_path, "utf-8"), all=True, type=b"none-v2") self.cache_filename1 = os.path.join( self.hgdir, "swh-cache-1-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) ) self.cache_filename2 = os.path.join( self.hgdir, "swh-cache-2-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) ) try: self.br = Bundle20Reader( bundlefile=self.bundle_path, cache_filename=self.cache_filename1, cache_size=self.cache1_size, ) except FileNotFoundError: # Empty repository! Still a successful visit targeting an # empty snapshot self.log.warn("%s is an empty repository!" % self.hgdir) self.empty_repository = True else: self.reduce_effort = set() if self.reduce_effort_flag: now = datetime.datetime.now(tz=datetime.timezone.utc) if (now - self.visit_date).days > 1: # Assuming that self.visit_date would be today for # a new visit, treat older visit dates as # indication of wanting to skip some processing # effort. for header, commit in self.br.yield_all_changesets(): ts = commit["time"].timestamp() if ts < self.visit_date.timestamp(): self.reduce_effort.add(header["node"]) def has_contents(self): return not self.empty_repository def has_directories(self): return not self.empty_repository def has_revisions(self): return not self.empty_repository def has_releases(self): return not self.empty_repository def fetch_data(self): """Fetch the data from the data source.""" pass def get_contents(self) -> Iterable[BaseContent]: """Get the contents that need to be loaded.""" # NOTE: This method generates blobs twice to reduce memory usage # without generating disk writes. 
self.file_node_to_hash = {} hash_to_info = {} self.num_contents = 0 contents: Dict[bytes, BaseContent] = {} missing_contents = set() for blob, node_info in self.br.yield_all_blobs(): self.num_contents += 1 file_name = node_info[0] header = node_info[2] length = len(blob) if header["linknode"] in self.reduce_effort: algorithms = set([ALGO]) else: algorithms = DEFAULT_ALGORITHMS h = MultiHash.from_data(blob, hash_names=algorithms) content = h.digest() content["length"] = length blob_hash = content[ALGO] self.file_node_to_hash[header["node"]] = blob_hash if header["linknode"] in self.reduce_effort: continue hash_to_info[blob_hash] = node_info if self.max_content_size is not None and length >= self.max_content_size: contents[blob_hash] = SkippedContent( status="absent", reason="Content too large", **content ) else: contents[blob_hash] = Content(data=blob, status="visible", **content) if file_name == b".hgtags": # https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model # overwrite until the last one self.tags = (t for t in blob.split(b"\n") if t != b"") if contents: missing_contents = set( self.storage.content_missing( [c.to_dict() for c in contents.values()], key_hash=ALGO ) ) # Clusters needed blobs by file offset and then only fetches the # groups at the needed offsets. focs: Dict[int, Dict[bytes, bytes]] = {} # "file/offset/contents" for blob_hash in missing_contents: _, file_offset, header = hash_to_info[blob_hash] focs.setdefault(file_offset, {}) focs[file_offset][header["node"]] = blob_hash for offset, node_hashes in sorted(focs.items()): for header, data, *_ in self.br.yield_group_objects(group_offset=offset): node = header["node"] if node in node_hashes: blob, meta = self.br.extract_meta_from_blob(data) content = contents.pop(node_hashes[node], None) if content: if ( self.max_content_size is not None and len(blob) >= self.max_content_size ): yield SkippedContent.from_data( blob, reason="Content too large" ) else: yield Content.from_data(blob) def load_directories(self): """This is where the work is done to convert manifest deltas from the repository bundle into SWH directories. 
""" self.mnode_to_tree_id = {} cache_hints = self.br.build_manifest_hints() def tree_size(t): return t.size() self.trees = SelectiveCache( cache_hints=cache_hints, size_function=tree_size, filename=self.cache_filename2, max_size=self.cache2_size, ) tree = SimpleTree() for header, added, removed in self.br.yield_all_manifest_deltas(cache_hints): node = header["node"] basenode = header["basenode"] tree = self.trees.fetch(basenode) or tree # working tree for path in removed.keys(): tree = tree.remove_tree_node_for_path(path) for path, info in added.items(): file_node, is_symlink, perms_code = info tree = tree.add_blob( path, self.file_node_to_hash[file_node], is_symlink, perms_code ) if header["linknode"] in self.reduce_effort: self.trees.store(node, tree) else: new_dirs = [] self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs) self.trees.store(node, tree) yield header, tree, new_dirs def get_directories(self) -> Iterable[Directory]: """Compute directories to load """ dirs: Dict[Sha1Git, Directory] = {} self.num_directories = 0 for _, _, new_dirs in self.load_directories(): for d in new_dirs: self.num_directories += 1 dirs[d["id"]] = Directory.from_dict(d) missing_dirs: List[Sha1Git] = list(dirs.keys()) if missing_dirs: missing_dirs = list(self.storage.directory_missing(missing_dirs)) for _id in missing_dirs: yield dirs[_id] def get_revisions(self) -> Iterable[Revision]: """Compute revisions to load """ revisions = {} self.num_revisions = 0 for header, commit in self.br.yield_all_changesets(): if header["node"] in self.reduce_effort: continue self.num_revisions += 1 date_dict = identifiers.normalize_timestamp(int(commit["time"].timestamp())) author_dict = converters.parse_author(commit["user"]) if commit["manifest"] == Bundle20Reader.NAUGHT_NODE: directory_id = SimpleTree().hash_changed() else: directory_id = self.mnode_to_tree_id[commit["manifest"]] extra_headers = [ ( b"time_offset_seconds", str(commit["time_offset_seconds"]).encode("utf-8"), ) ] extra = commit.get("extra") if extra: for e in extra.split(b"\x00"): k, v = e.split(b":", 1) # transplant_source stores binary reference to a changeset # prefer to dump hexadecimal one in the revision metadata if k == b"transplant_source": v = hash_to_bytehex(v) extra_headers.append((k, v)) parents = [] p1 = self.node_2_rev.get(header["p1"]) p2 = self.node_2_rev.get(header["p2"]) if p1: parents.append(p1) if p2: parents.append(p2) revision = Revision( author=Person.from_dict(author_dict), date=TimestampWithTimezone.from_dict(date_dict), committer=Person.from_dict(author_dict), committer_date=TimestampWithTimezone.from_dict(date_dict), type=RevisionType.MERCURIAL, directory=directory_id, message=commit["message"], metadata={"node": hash_to_hex(header["node"]),}, extra_headers=tuple(extra_headers), synthetic=False, parents=tuple(parents), ) self.node_2_rev[header["node"]] = revision.id revisions[revision.id] = revision # Converts heads to use swh ids self.heads = { branch_name: (pointer_nature, self.node_2_rev[node_id]) for branch_name, (pointer_nature, node_id) in self.heads.items() } missing_revs = set(revisions.keys()) if missing_revs: missing_revs = set(self.storage.revision_missing(list(missing_revs))) for rev in missing_revs: yield revisions[rev] self.mnode_to_tree_id = None def _read_tag(self, tag, split_byte=b" "): node, *name = tag.split(split_byte) name = split_byte.join(name) return node, name def get_releases(self) -> Iterable[Release]: """Get the releases that need to be loaded.""" self.num_releases = 0 releases = {} 
missing_releases = set() for t in self.tags: self.num_releases += 1 node, name = self._read_tag(t) node = node.decode() node_bytes = hash_to_bytes(node) if not TAG_PATTERN.match(node): self.log.warn("Wrong pattern (%s) found in tags. Skipping" % (node,)) continue if node_bytes not in self.node_2_rev: self.log.warn( "No matching revision for tag %s " "(hg changeset: %s). Skipping" % (name.decode(), node) ) continue tgt_rev = self.node_2_rev[node_bytes] release = Release( name=name, target=tgt_rev, target_type=ObjectType.REVISION, message=None, metadata=None, synthetic=False, author=Person(name=None, email=None, fullname=b""), date=None, ) missing_releases.add(release.id) releases[release.id] = release self.releases[name] = release.id if missing_releases: missing_releases = set(self.storage.release_missing(list(missing_releases))) for _id in missing_releases: yield releases[_id] def get_snapshot(self) -> Snapshot: """Get the snapshot that need to be loaded.""" branches: Dict[bytes, Optional[SnapshotBranch]] = {} for name, (pointer_nature, target) in self.heads.items(): branches[name] = SnapshotBranch( target=target, target_type=TargetType.REVISION ) if pointer_nature == HEAD_POINTER_NAME: branches[b"HEAD"] = SnapshotBranch( target=name, target_type=TargetType.ALIAS ) for name, target in self.releases.items(): branches[name] = SnapshotBranch( target=target, target_type=TargetType.RELEASE ) self.snapshot = Snapshot(branches=branches) return self.snapshot def get_fetch_history_result(self): """Return the data to store in fetch_history.""" return { "contents": self.num_contents, "directories": self.num_directories, "revisions": self.num_revisions, "releases": self.num_releases, } def load_status(self): snapshot = self.get_snapshot() load_status = "eventful" if self.last_snapshot_id is not None and self.last_snapshot_id == snapshot.id: load_status = "uneventful" return { "status": load_status, } class HgArchiveBundle20Loader(HgBundle20Loader): """Mercurial loader for repository wrapped within archives. 
""" def __init__(self, url, visit_date=None, archive_path=None): super().__init__( url, visit_date=visit_date, logging_class="swh.loader.mercurial.HgArchiveBundle20Loader", ) self.temp_dir = None self.archive_path = archive_path def prepare(self, *args, **kwargs): self.temp_dir = tmp_extract( archive=self.archive_path, dir=self.temp_directory, prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix=".dump-%s" % os.getpid(), log=self.log, source=self.origin_url, ) repo_name = os.listdir(self.temp_dir)[0] self.directory = os.path.join(self.temp_dir, repo_name) super().prepare(*args, **kwargs) def cleanup(self): if self.temp_dir and os.path.exists(self.temp_dir): rmtree(self.temp_dir) super().cleanup() diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py index 8df2c0f..b8b0cb4 100644 --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -1,320 +1,368 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import logging import os import time import hglib +from hglib.error import CommandError import pytest from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) from swh.model.hashutil import hash_to_bytes from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(repo_url) assert loader.load() == {"status": "eventful"} tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" expected_snapshot = Snapshot( id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), branches={ b"develop": SnapshotBranch( target=hash_to_bytes(tip_revision_develop), target_type=TargetType.REVISION, ), b"default": SnapshotBranch( target=hash_to_bytes(tip_revision_default), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), }, ) assert_last_visit_matches( loader.storage, repo_url, status="full", type="hg", snapshot=expected_snapshot.id, ) check_snapshot(expected_snapshot, loader.storage) stats = get_stats(loader.storage) assert stats == { "content": 2, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 58, "skipped_content": 0, "snapshot": 1, } # Ensure archive loader yields the same snapshot loader2 = HgArchiveBundle20Loader( url=archive_path, archive_path=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader2.load() assert actual_load_status == {"status": "eventful"} stats2 = get_stats(loader2.storage) expected_stats = copy.deepcopy(stats) expected_stats["origin"] += 1 expected_stats["origin_visit"] += 1 assert stats2 == expected_stats # That visit yields the same snapshot assert_last_visit_matches( loader2.storage, archive_path, status="full", type="hg", snapshot=expected_snapshot.id, ) def 
test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # then stats = get_stats(loader.storage) assert stats == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } # cf. test_loader.org for explaining from where those hashes tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") release = loader.storage.release_get([tip_release])[0] assert release is not None tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") revision = loader.storage.revision_get([tip_revision_default])[0] assert revision is not None expected_snapshot = Snapshot( id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), branches={ b"default": SnapshotBranch( target=tip_revision_default, target_type=TargetType.REVISION, ), b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), }, ) check_snapshot(expected_snapshot, loader.storage) assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full", snapshot=expected_snapshot.id, ) # Ensure archive loader yields the same snapshot loader2 = HgArchiveBundle20Loader( url=archive_path, archive_path=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader2.load() assert actual_load_status == {"status": "eventful"} stats2 = get_stats(loader2.storage) expected_stats = copy.deepcopy(stats) expected_stats["origin"] += 1 expected_stats["origin_visit"] += 1 assert stats2 == expected_stats # That visit yields the same snapshot assert_last_visit_matches( loader2.storage, archive_path, status="full", type="hg", snapshot=expected_snapshot.id, ) def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): """Failure to decompress should fail early, no data is ingested""" mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") mock_patoo.side_effect = ValueError archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") loader = HgArchiveBundle20Loader( url=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert stats == { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } # That visit yields the same snapshot assert_last_visit_matches( loader.storage, archive_path, status="failed", type="hg", snapshot=None ) def test_visit_error_with_snapshot_partial(swh_config, datadir, tmp_path, mocker): """Incomplete ingestion leads to a 'partial' ingestion status""" mock = mocker.patch("swh.loader.mercurial.loader.HgBundle20Loader.store_metadata") mock.side_effect = ValueError archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(repo_url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( loader.storage, 
repo_url, status="partial", type="hg", snapshot=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), ) +@pytest.mark.parametrize( + "error_msg", + [ + b"does not appear to be an hg repository", + b"404: Not Found", + b" Name or service not known", + ], +) +def test_visit_error_with_status_not_found( + swh_config, datadir, tmp_path, mocker, error_msg +): + """Not reaching the repo leads to a 'not_found' ingestion status""" + mock = mocker.patch("hglib.clone") + mock.side_effect = CommandError((), 255, b"", error_msg) + + archive_name = "the-sandbox" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + + loader = HgBundle20Loader(repo_url) + + assert loader.load() == {"status": "uneventful"} + + assert_last_visit_matches( + loader.storage, repo_url, status="not_found", type="hg", snapshot=None, + ) + + +def test_visit_error_with_clone_error(swh_config, datadir, tmp_path, mocker): + """Testing failures other than 'not_found'""" + + mock = mocker.patch("hglib.clone") + mock.side_effect = CommandError((), 255, b"", b"out of disk space") + + archive_name = "the-sandbox" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + + loader = HgBundle20Loader(repo_url) + + assert loader.load() == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, repo_url, status="failed", type="hg", snapshot=None, + ) + + def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): """Visit a mercurial repository visit transplant operations within should yield a snapshot as well. """ archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) # load hg repository actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # collect swh revisions assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" ) revisions = [] snapshot = snapshot_get_latest(loader.storage, repo_url) for branch in snapshot.branches.values(): if branch.target_type.value != "revision": continue revisions.append(branch.target) # extract original changesets info and the transplant sources hg_changesets = set() transplant_sources = set() for rev in loader.storage.revision_log(revisions): hg_changesets.add(rev["metadata"]["node"]) for k, v in rev["extra_headers"]: if k == b"transplant_source": transplant_sources.add(v.decode("ascii")) # check extracted data are valid assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources.issubset(hg_changesets) def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def clone_timeout(source, dest): time.sleep(60) monkeypatch.setattr(hglib, "clone", clone_timeout) with pytest.raises(CloneTimeoutError): HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) for record in caplog.records: assert record.levelname == "WARNING" assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage() assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1) def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def 
clone_return(source, dest): return (source, dest) monkeypatch.setattr(hglib, "clone", clone_return) assert HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path) def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def clone_return(source, dest): raise ValueError("Test exception") monkeypatch.setattr(hglib, "clone", clone_return) with pytest.raises(ValueError) as excinfo: HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) assert "Test exception" in excinfo.value.args[0]
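
Note, outside the patch itself: the CommandErrorWrapper introduced above exists because billiard.Queue serializes whatever is put on it, and a pickled exception is rebuilt on the receiving side by calling its class with the stored args; hglib's CommandError requires four constructor arguments, so that rebuild fails. The standalone sketch below illustrates the point, with the standard pickle module standing in for billiard's serialization (an assumption, since billiard is only exercised indirectly here) and the same CommandError construction as the new tests use.

# Minimal sketch (not part of the patch): why CommandError cannot round-trip
# through billiard.Queue, and why the small wrapper exception can.
import pickle

from hglib.error import CommandError


class CommandErrorWrapper(Exception):
    # Same shape as the wrapper added in the patch: a single-argument
    # constructor, so pickle can rebuild the exception from its stored args.
    def __init__(self, err):
        self.err = err


# Same construction as in test_visit_error_with_status_not_found: (args, ret, out, err)
original = CommandError((), 255, b"", b"404: Not Found")

try:
    # dumps() succeeds, but loads() rebuilds the exception by calling
    # CommandError(*exc.args), which is missing the other required
    # positional parameters and raises TypeError.
    pickle.loads(pickle.dumps(original))
except TypeError as exc:
    print("CommandError does not survive deserialization:", exc)

# The wrapper only carries the stderr payload and round-trips cleanly; the
# parent process can then match known messages (404, unknown host, ...)
# against wrapped.err and raise NotFound, as prepare() does above.
wrapped = pickle.loads(pickle.dumps(CommandErrorWrapper(original.err)))
print(wrapped.err)  # b'404: Not Found'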