diff --git a/setup.py b/setup.py index a1613bd..dc9699f 100755 --- a/setup.py +++ b/setup.py @@ -1,73 +1,74 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.loader.mercurial", description="Software Heritage Mercurial Loader", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLDHG/", packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements("swh"), setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, entry_points=""" [swh.workers] loader.mercurial=swh.loader.mercurial:register + loader.mercurial_from_disk=swh.loader.mercurial:register_from_disk [console_scripts] swh-hg-identify=swh.loader.mercurial.identify:main """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 4 - Beta", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": ("https://forge.softwareheritage.org/source/swh-loader-mercurial"), "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-mercurial/", # NoQA: E501 }, ) diff --git a/swh/loader/mercurial/__init__.py b/swh/loader/mercurial/__init__.py index 7f81f85..5494685 100644 --- a/swh/loader/mercurial/__init__.py +++ b/swh/loader/mercurial/__init__.py @@ -1,17 +1,27 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Mapping def register() -> Mapping[str, Any]: """Register the current worker module's definition""" from .loader import HgBundle20Loader return { "task_modules": [f"{__name__}.tasks"], "loader": HgBundle20Loader, } + + +def register_from_disk() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .from_disk import HgLoaderFromDisk + + return { + "task_modules": [f"{__name__}.tasks_from_disk"], + "loader": HgLoaderFromDisk, + } diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py new file mode 100644 index 0000000..cfc0aac --- /dev/null +++ b/swh/loader/mercurial/from_disk.py @@ -0,0 +1,467 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from collections import deque +from datetime import datetime, timezone +from shutil import rmtree +from tempfile import mkdtemp +from typing import Any, Deque, Dict, Optional, Tuple, Union + +import dateutil + +from swh.core.config import merge_configs +from swh.loader.core.loader import BaseLoader +from swh.loader.core.utils import clean_dangling_folders +from swh.model.from_disk import Content, DentryPerms, Directory +from swh.model.hashutil import MultiHash, hash_to_bytehex +from swh.model.model import Content as ModelContent +from swh.model.model import ( + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Sha1Git, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +from . import hgutil +from .archive_extract import tmp_extract +from .hgutil import HgNodeId + +FLAG_PERMS = { + b"l": DentryPerms.symlink, + b"x": DentryPerms.executable_content, + b"": DentryPerms.content, +} # type: Dict[bytes, DentryPerms] +DEFAULT_CONFIG: Dict[str, Any] = { + "temp_directory": "/tmp", + "clone_timeout_seconds": 7200, + "content_cache_size": 10_000, +} +TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" + + +def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]: + """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime]. + + `HgLoaderFromDisk` accepts `str` and `datetime` as visit date + while `BaseLoader` only deals with `datetime`. + """ + if visit_date is None: + return None + + if isinstance(visit_date, datetime): + return visit_date + + if visit_date == "now": + return datetime.now(tz=timezone.utc) + + if isinstance(visit_date, str): + return dateutil.parser.parse(visit_date) + + return ValueError(f"invalid visit date {visit_date!r}") + + +class HgDirectory(Directory): + """A directory that creates parent directories if missing.""" + + def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: + if b"/" in path: + head, tail = path.split(b"/", 1) + + directory = self.get(head) + if directory is None: + directory = HgDirectory() + self[head] = directory + + directory[tail] = value + else: + super().__setitem__(path, value) + + +class HgLoaderFromDisk(BaseLoader): + """Load a mercurial repository from a local repository.""" + + CONFIG_BASE_FILENAME = "loader/mercurial" + + visit_type = "hg" + + def __init__( + self, + url: str, + directory: Optional[str] = None, + logging_class: str = "swh.loader.mercurial.LoaderFromDisk", + visit_date: Optional[Union[datetime, str]] = None, + config: Optional[Dict[str, Any]] = None, + ): + """Initialize the loader. + + Args: + url: url of the repository. + directory: directory of the local repository. + logging_class: class of the loader logger. + visit_date: visit date of the repository + config: loader configuration + """ + super().__init__(logging_class=logging_class, config=config or {}) + + self.config = merge_configs(DEFAULT_CONFIG, self.config) + self._temp_directory = self.config["temp_directory"] + self._clone_timeout = self.config["clone_timeout_seconds"] + + self.origin_url = url + self.visit_date = parse_visit_date(visit_date) + self.directory = directory + + self._repo: Optional[hgutil.Repository] = None + self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + self._repo_directory: Optional[str] = None + + def pre_cleanup(self) -> None: + """As a first step, will try and check for dangling data to cleanup. + This should do its best to avoid raising issues. + + """ + clean_dangling_folders( + self._temp_directory, + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log, + ) + + def cleanup(self) -> None: + """Last step executed by the loader.""" + if self._repo_directory and os.path.exists(self._repo_directory): + self.log.debug(f"Cleanup up repository {self._repo_directory}") + rmtree(self._repo_directory) + + def prepare_origin_visit(self, *args, **kwargs) -> None: + """First step executed by the loader to prepare origin and visit + references. Set/update self.origin, and + optionally self.origin_url, self.visit_date. + + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self, *args, **kwargs) -> None: + """Second step executed by the loader to prepare some state needed by + the loader. + + """ + + def fetch_data(self) -> bool: + """Fetch the data from the source the loader is currently loading + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. + + """ + if not self.directory: # no local repository + self._repo_directory = mkdtemp( + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix=f"-{os.getpid()}", + dir=self._temp_directory, + ) + self.log.debug( + f"Cloning {self.origin_url} to {self.directory} " + f"with timeout {self._clone_timeout} seconds" + ) + hgutil.clone(self.origin_url, self._repo_directory, self._clone_timeout) + else: # existing local repository + # Allow to load on disk repository without cloning + # for testing purpose. + self._repo_directory = self.directory + + self._repo = hgutil.repository(self._repo_directory) + + return False + + def store_data(self): + """Store fetched data in the database.""" + for rev in self._repo: + self.store_revision(self._repo[rev]) + + branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in hgutil.branches(self._repo).items() + } + tags_by_name: Dict[bytes, HgNodeId] = self._repo.tags() + tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in tags_by_name.items() + } + + snapshot_branches: Dict[bytes, SnapshotBranch] = {} + + for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): + tag_name = tags_by_hg_nodeid.get(hg_nodeid) + + # tip is listed in the tags by the mercurial api + # but its not a tag defined by the user in `.hgtags` + if tag_name and tag_name != b"tip": + snapshot_branches[tag_name] = SnapshotBranch( + target=self.store_release(tag_name, revision_swhid), + target_type=TargetType.RELEASE, + ) + + if hg_nodeid in branch_by_hg_nodeid: + name = branch_by_hg_nodeid[hg_nodeid] + snapshot_branches[name] = SnapshotBranch( + target=revision_swhid, target_type=TargetType.REVISION, + ) + + # The tip is mapped to `HEAD` to match + # the historical implementation + if hg_nodeid == tags_by_name[b"tip"]: + snapshot_branches[b"HEAD"] = SnapshotBranch( + target=name, target_type=TargetType.ALIAS, + ) + + snapshot = Snapshot(branches=snapshot_branches) + self.storage.snapshot_add([snapshot]) + + self.flush() + self.loaded_snapshot_id = snapshot.id + + def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Return the swhid of a revision given its hg nodeid. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the revision. + """ + return self._revision_nodeid_to_swhid[hg_nodeid] + + def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]: + """Return the swhids of the parent revisions. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhids of the parent revisions. + """ + parents = [] + for parent_ctx in rev_ctx.parents(): + parent_hg_nodeid = parent_ctx.node() + # nullid is the value of a parent that does not exist + if parent_hg_nodeid == hgutil.NULLID: + continue + parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)) + + return tuple(parents) + + def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: + """Store a revision given its hg nodeid. + + Args: + rev_ctx: the he revision context. + + Returns: + the swhid of the stored revision. + """ + hg_nodeid = rev_ctx.node() + + root_swhid = self.store_directories(rev_ctx) + + # `Person.from_fullname` is compatible with mercurial's freeform author + # as fullname is what is used in revision hash when available. + author = Person.from_fullname(rev_ctx.user()) + + (timestamp, offset) = rev_ctx.date() + + # TimestampWithTimezone.from_dict will change name + # as it accept more than just dicts + rev_date = TimestampWithTimezone.from_dict(int(timestamp)) + + extra_headers = [ + (b"time_offset_seconds", str(offset).encode(),), + ] + for key, value in rev_ctx.extra().items(): + # The default branch is skipped to match + # the historical implementation + if key == b"branch" and value == b"default": + continue + + # transplant_source is converted to match + # the historical implementation + if key == b"transplant_source": + value = hash_to_bytehex(value) + extra_headers.append((key, value)) + + revision = Revision( + author=author, + date=rev_date, + committer=author, + committer_date=rev_date, + type=RevisionType.MERCURIAL, + directory=root_swhid, + message=rev_ctx.description(), + metadata={"node": hg_nodeid.hex()}, + extra_headers=tuple(extra_headers), + synthetic=False, + parents=self.get_revision_parents(rev_ctx), + ) + + self._revision_nodeid_to_swhid[hg_nodeid] = revision.id + self.storage.revision_add([revision]) + + def store_release(self, name: bytes, target=Sha1Git) -> Sha1Git: + """Store a release given its name and its target. + + A release correspond to a user defined tag in mercurial. + The mercurial api as a `tip` tag that must be ignored. + + Args: + name: name of the release. + target: swhid of the target revision. + + Returns: + the swhid of the stored release. + """ + release = Release( + name=name, + target=target, + target_type=ObjectType.REVISION, + message=None, + metadata=None, + synthetic=False, + author=Person(name=None, email=None, fullname=b""), + date=None, + ) + + self.storage.release_add([release]) + + return release.id + + def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content: + """Store a revision content hg nodeid and file path. + + Content is a mix of file content at a given revision + and its permissions found in the changeset's manifest. + + Args: + rev_ctx: the he revision context. + file_path: the hg path of the content. + + Returns: + the swhid of the top level directory. + """ + hg_nodeid = rev_ctx.node() + file_ctx = rev_ctx[file_path] + + perms = FLAG_PERMS[file_ctx.flags()] + data = file_ctx.data() # caching is simple and will come in the next revision. + + content_data = MultiHash.from_data(data).digest() + content_data["length"] = len(data) + content_data["perms"] = perms + content_data["data"] = data + content_data["status"] = "visible" + content = Content(content_data) + + model = content.to_model() + if isinstance(model, ModelContent): + self.storage.content_add([model]) + else: + raise ValueError( + f"{file_path!r} at rev {hg_nodeid.hex()!r} " + "produced {type(model)!r} instead of {ModelContent!r}" + ) + + # Here we make sure to return only necessary data. + return Content({"sha1_git": content.hash, "perms": perms}) + + def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: + """Store a revision directories given its hg nodeid. + + Mercurial as no directory as in git. A Git like tree must be build + from file paths to obtain each directory hash. + + Args: + rev_ctx: the he revision context. + + Returns: + the swhid of the top level directory. + """ + root = HgDirectory() + for file_path in rev_ctx.manifest(): + content = self.store_content(rev_ctx, file_path) + root[file_path] = content + + directories: Deque[Directory] = deque([root]) + while directories: + directory = directories.pop() + self.storage.directory_add([directory.to_model()]) + directories.extend( + [item for item in directory.values() if isinstance(item, Directory)] + ) + + return root.hash + + +class HgArchiveLoaderFromDisk(HgLoaderFromDisk): + """Mercurial loader for repository wrapped within tarballs.""" + + def __init__( + self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None + ): + super().__init__( + url, + visit_date=visit_date, + logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk", + ) + self.temp_dir = None + self.archive_path = archive_path + + def prepare(self, *args, **kwargs): + """Extract the archive instead of cloning.""" + self._temp_directory = tmp_extract( + archive=self.archive_path, + dir=self._temp_directory, + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix=f".dump-{os.getpid()}", + log=self.log, + source=self.origin_url, + ) + + repo_name = os.listdir(self.temp_dir)[0] + self.directory = os.path.join(self.temp_dir, repo_name) + super().prepare(*args, **kwargs) + + def cleanup(self) -> None: + """Remove the extracted archive instead of the cloned repository.""" + if self.temp_dir and os.path.exists(self.temp_dir): + rmtree(self.temp_dir) + super().cleanup() + + +# Allow direct usage of the loader from the command line with +# `python -m swh.loader.mercurial.from_disk $ORIGIN_URL` +if __name__ == "__main__": + import logging + + import click + + logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" + ) + + @click.command() + @click.option("--origin-url", help="origin url") + @click.option("--hg-directory", help="Path to mercurial repository to load") + @click.option("--visit-date", default=None, help="Visit date") + def main(origin_url, hg_directory, visit_date): + return HgLoaderFromDisk( + origin_url, directory=hg_directory, visit_date=visit_date + ).load() + + main() diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py new file mode 100644 index 0000000..9fdc1a4 --- /dev/null +++ b/swh/loader/mercurial/hgutil.py @@ -0,0 +1,77 @@ +import io +import traceback +from multiprocessing import Process, Queue +from typing import Dict, NewType + +# The internal Mercurial API is not guaranteed to be stable. +import mercurial.ui # type: ignore +from mercurial import context, hg + +NULLID = mercurial.node.nullid +HgNodeId = NewType("HgNodeId", bytes) +Repository = hg.localrepo +BaseContext = context.basectx + + +def repository(path: str) -> hg.localrepo: + ui = mercurial.ui.ui.load() + return hg.repository(ui, path.encode()) + + +def branches(repo: hg.localrepo) -> Dict[bytes, HgNodeId]: + """List repository named branches and their tip node.""" + result = {} + for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): + if isclosed: + continue + result[tag] = tip + return result + + +class CloneTimeout(Exception): + pass + + +class CloneFailure(Exception): + pass + + +def _clone_task(src: str, dest: str, errors: Queue) -> None: + """Clone task to run in a subprocess. + + Args: + src: clone source + dest: clone destination + errors: message queue to communicate errors + """ + try: + hg.clone(mercurial.ui.ui.load(), {}, src.encode(), dest.encode()) + except Exception as e: + exc_buffer = io.StringIO() + traceback.print_exc(file=exc_buffer) + errors.put_nowait(exc_buffer.getvalue()) + raise e + + +def clone(src: str, dest: str, timeout: int) -> None: + """Clone a repository with timeout. + + Args: + src: clone source + dest: clone destination + timeout: timeout in seconds + """ + errors: Queue = Queue() + process = Process(target=_clone_task, args=(src, dest, errors)) + process.start() + process.join(timeout) + + if process.is_alive(): + process.terminate() + process.join(1) + if process.is_alive(): + process.kill() + raise CloneTimeout(src, timeout) + + if not errors.empty(): + raise CloneFailure(src, dest, errors.get()) diff --git a/swh/loader/mercurial/tasks_from_disk.py b/swh/loader/mercurial/tasks_from_disk.py new file mode 100644 index 0000000..56c01eb --- /dev/null +++ b/swh/loader/mercurial/tasks_from_disk.py @@ -0,0 +1,33 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from .from_disk import HgArchiveLoaderFromDisk, HgLoaderFromDisk + + +@shared_task(name=__name__ + ".LoadMercurialFromDisk") +def load_hg(*, url, directory=None, visit_date=None): + """Mercurial repository loading + + Import a mercurial tarball into swh. + + Args: see :func:`DepositLoader.load`. + + """ + loader = HgLoaderFromDisk(url, directory=directory, visit_date=visit_date) + return loader.load() + + +@shared_task(name=__name__ + ".LoadArchiveMercurialFromDisk") +def load_hg_from_archive(*, url, archive_path=None, visit_date=None): + """Import a mercurial tarball into swh. + + Args: see :func:`DepositLoader.load`. + """ + loader = HgArchiveLoaderFromDisk( + url, archive_path=archive_path, visit_date=visit_date + ) + return loader.load() diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_from_disk.py similarity index 56% copy from swh/loader/mercurial/tests/test_loader.py copy to swh/loader/mercurial/tests/test_from_disk.py index c84d27e..bfb0ebe 100644 --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -1,310 +1,205 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import copy -import logging import os -import time - -import hglib -import pytest from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) +from swh.model.from_disk import Content from swh.model.hashutil import hash_to_bytes from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest -from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader +from ..from_disk import HgDirectory, HgLoaderFromDisk from .loader_checker import ExpectedSwhids, LoaderChecker +def test_hg_directory_creates_missing_directories(): + directory = HgDirectory() + directory[b"path/to/some/content"] = Content() + + +# Those tests assert expectations on repository loading +# by reading expected values from associated json files +# produced by the `swh-hg-identify` command line utility. +# +# It has more granularity than historical tests. +# Assertions will tell if the error comes from the directories +# revisions or release rather than only checking the snapshot. +# +# With more work it should event be possible to know which part +# of an object is faulty. def test_examples(swh_config, datadir, tmp_path): for archive_name in ("hello", "transplant", "the-sandbox", "example"): archive_path = os.path.join(datadir, f"{archive_name}.tgz") json_path = os.path.join(datadir, f"{archive_name}.json") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) LoaderChecker( - loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path), + loader=HgLoaderFromDisk(repo_url), expected=ExpectedSwhids.load(json_path), ).check() +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(repo_url) + loader = HgLoaderFromDisk(url=repo_url) assert loader.load() == {"status": "eventful"} tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" expected_snapshot = Snapshot( id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), branches={ b"develop": SnapshotBranch( target=hash_to_bytes(tip_revision_develop), target_type=TargetType.REVISION, ), b"default": SnapshotBranch( target=hash_to_bytes(tip_revision_default), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), }, ) assert_last_visit_matches( loader.storage, repo_url, status="full", type="hg", snapshot=expected_snapshot.id, ) check_snapshot(expected_snapshot, loader.storage) stats = get_stats(loader.storage) assert stats == { "content": 2, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 58, "skipped_content": 0, "snapshot": 1, } - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" + archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) + loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00") actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # then stats = get_stats(loader.storage) assert stats == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } # cf. test_loader.org for explaining from where those hashes tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") release = loader.storage.release_get([tip_release])[0] assert release is not None tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") revision = loader.storage.revision_get([tip_revision_default])[0] assert revision is not None expected_snapshot = Snapshot( id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), branches={ b"default": SnapshotBranch( target=tip_revision_default, target_type=TargetType.REVISION, ), b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), }, ) check_snapshot(expected_snapshot, loader.storage) assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full", snapshot=expected_snapshot.id, ) - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - - -def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): - """Failure to decompress should fail early, no data is ingested""" - mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") - mock_patoo.side_effect = ValueError - - archive_name = "hello" - archive_path = os.path.join(datadir, f"{archive_name}.tgz") - # Ensure archive loader yields the same snapshot - loader = HgArchiveBundle20Loader( - url=archive_path, visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader.load() - assert actual_load_status == {"status": "failed"} - - stats = get_stats(loader.storage) - assert stats == { - "content": 0, - "directory": 0, - "origin": 1, - "origin_visit": 1, - "release": 0, - "revision": 0, - "skipped_content": 0, - "snapshot": 0, - } - # That visit yields the same snapshot - assert_last_visit_matches( - loader.storage, archive_path, status="partial", type="hg", snapshot=None - ) - +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): """Visit a mercurial repository visit transplant operations within should yield a snapshot as well. """ archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) + + loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00") # load hg repository actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # collect swh revisions assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" ) revisions = [] snapshot = snapshot_get_latest(loader.storage, repo_url) for branch in snapshot.branches.values(): if branch.target_type.value != "revision": continue revisions.append(branch.target) # extract original changesets info and the transplant sources hg_changesets = set() transplant_sources = set() for rev in loader.storage.revision_log(revisions): hg_changesets.add(rev["metadata"]["node"]) for k, v in rev["extra_headers"]: if k == b"transplant_source": transplant_sources.add(v.decode("ascii")) # check extracted data are valid assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources.issubset(hg_changesets) - - -def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_timeout(source, dest): - time.sleep(60) - - monkeypatch.setattr(hglib, "clone", clone_timeout) - - with pytest.raises(CloneTimeoutError): - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - - for record in caplog.records: - assert record.levelname == "WARNING" - assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage() - assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1) - - -def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_return(source, dest): - return (source, dest) - - monkeypatch.setattr(hglib, "clone", clone_return) - - assert HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path) - - -def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_return(source, dest): - raise ValueError("Test exception") - - monkeypatch.setattr(hglib, "clone", clone_return) - - with pytest.raises(ValueError) as excinfo: - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - assert "Test exception" in excinfo.value.args[0] diff --git a/swh/loader/mercurial/tests/test_hgutil.py b/swh/loader/mercurial/tests/test_hgutil.py new file mode 100644 index 0000000..514e139 --- /dev/null +++ b/swh/loader/mercurial/tests/test_hgutil.py @@ -0,0 +1,46 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import time +import traceback + +import pytest +from mercurial import hg # type: ignore + +from .. import hgutil + + +def test_clone_timeout(monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + timeout = 1 + + def clone(*args): + time.sleep(5) + + monkeypatch.setattr(hg, "clone", clone) + + with pytest.raises(hgutil.CloneTimeout) as e: + hgutil.clone(src, dest, timeout) + assert e.value.args == (src, timeout) + + +def test_clone_error(caplog, tmp_path, monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + expected_traceback = "Some traceback" + + def clone(*args): + raise ValueError() + + def print_exc(file): + file.write(expected_traceback) + + monkeypatch.setattr(hg, "clone", clone) + monkeypatch.setattr(traceback, "print_exc", print_exc) + + with pytest.raises(hgutil.CloneFailure) as e: + hgutil.clone(src, dest, 1) + assert e.value.args == (src, dest, expected_traceback) diff --git a/swh/loader/mercurial/tests/test_loader.org b/swh/loader/mercurial/tests/test_loader.org deleted file mode 100644 index cfd7c8f..0000000 --- a/swh/loader/mercurial/tests/test_loader.org +++ /dev/null @@ -1,121 +0,0 @@ -#+title: Where the loader test data comes from - -Mercurial repositories are archived within the folder -swh/loader/mercurial/tests/resources. They contain mercurial -repository. - -The following demonstrates the commands executed from within the -repository to retrieve information. - -* the-sandbox - -Archive: the-sandbox.tgz - -** branches - -Listing of branches and their tip: -#+BEGIN_SRC sh -$ hg branches -develop 57:76cc0882284d -default 2:2f13849f14f5 (inactive) -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..57}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 e2e117569b086ceabeeedee4acd95f35298d4553 -1 9cd8160c67ac4b0bc97e2e2cd918a580425167d3 -2 180bd57623a7c2c47a8c43514a5f4d903503d0aa -3 180bd57623a7c2c47a8c43514a5f4d903503d0aa -4 180bd57623a7c2c47a8c43514a5f4d903503d0aa -5 180bd57623a7c2c47a8c43514a5f4d903503d0aa -6 180bd57623a7c2c47a8c43514a5f4d903503d0aa -7 180bd57623a7c2c47a8c43514a5f4d903503d0aa -8 180bd57623a7c2c47a8c43514a5f4d903503d0aa -9 180bd57623a7c2c47a8c43514a5f4d903503d0aa -10 180bd57623a7c2c47a8c43514a5f4d903503d0aa -11 180bd57623a7c2c47a8c43514a5f4d903503d0aa -12 180bd57623a7c2c47a8c43514a5f4d903503d0aa -13 180bd57623a7c2c47a8c43514a5f4d903503d0aa -14 180bd57623a7c2c47a8c43514a5f4d903503d0aa -15 180bd57623a7c2c47a8c43514a5f4d903503d0aa -16 180bd57623a7c2c47a8c43514a5f4d903503d0aa -17 180bd57623a7c2c47a8c43514a5f4d903503d0aa -18 180bd57623a7c2c47a8c43514a5f4d903503d0aa -19 180bd57623a7c2c47a8c43514a5f4d903503d0aa -20 180bd57623a7c2c47a8c43514a5f4d903503d0aa -21 180bd57623a7c2c47a8c43514a5f4d903503d0aa -22 180bd57623a7c2c47a8c43514a5f4d903503d0aa -23 180bd57623a7c2c47a8c43514a5f4d903503d0aa -24 180bd57623a7c2c47a8c43514a5f4d903503d0aa -25 180bd57623a7c2c47a8c43514a5f4d903503d0aa -26 180bd57623a7c2c47a8c43514a5f4d903503d0aa -27 180bd57623a7c2c47a8c43514a5f4d903503d0aa -28 180bd57623a7c2c47a8c43514a5f4d903503d0aa -29 180bd57623a7c2c47a8c43514a5f4d903503d0aa -30 180bd57623a7c2c47a8c43514a5f4d903503d0aa -31 180bd57623a7c2c47a8c43514a5f4d903503d0aa -32 180bd57623a7c2c47a8c43514a5f4d903503d0aa -33 180bd57623a7c2c47a8c43514a5f4d903503d0aa -34 180bd57623a7c2c47a8c43514a5f4d903503d0aa -35 180bd57623a7c2c47a8c43514a5f4d903503d0aa -36 180bd57623a7c2c47a8c43514a5f4d903503d0aa -37 180bd57623a7c2c47a8c43514a5f4d903503d0aa -38 180bd57623a7c2c47a8c43514a5f4d903503d0aa -39 180bd57623a7c2c47a8c43514a5f4d903503d0aa -40 180bd57623a7c2c47a8c43514a5f4d903503d0aa -41 180bd57623a7c2c47a8c43514a5f4d903503d0aa -42 180bd57623a7c2c47a8c43514a5f4d903503d0aa -43 180bd57623a7c2c47a8c43514a5f4d903503d0aa -44 180bd57623a7c2c47a8c43514a5f4d903503d0aa -45 180bd57623a7c2c47a8c43514a5f4d903503d0aa -46 180bd57623a7c2c47a8c43514a5f4d903503d0aa -47 180bd57623a7c2c47a8c43514a5f4d903503d0aa -48 180bd57623a7c2c47a8c43514a5f4d903503d0aa -49 180bd57623a7c2c47a8c43514a5f4d903503d0aa -50 180bd57623a7c2c47a8c43514a5f4d903503d0aa -51 180bd57623a7c2c47a8c43514a5f4d903503d0aa -52 180bd57623a7c2c47a8c43514a5f4d903503d0aa -53 180bd57623a7c2c47a8c43514a5f4d903503d0aa -54 180bd57623a7c2c47a8c43514a5f4d903503d0aa -55 180bd57623a7c2c47a8c43514a5f4d903503d0aa -56 180bd57623a7c2c47a8c43514a5f4d903503d0aa -57 180bd57623a7c2c47a8c43514a5f4d903503d0aa -#+END_SRC - -Note: swh-hashtree is a cli tool defined in swh-model/bin/swh-hashtree - -* hello - -Archive: hello.tgz - -** branches - -#+BEGIN_SRC sh -$ hg branches -default 1:82e55d328c8c -#+END_SRC - -** tags - -I added a tag to have some more data to load (1st repository has no tags): -#+BEGIN_SRC sh -$ hg tags -tip 2:b985ae4a07e1 -0.1 1:82e55d328c8c -#+END_SRC - -#+BEGIN_SRC sh -$ cat .hgtags -82e55d328c8ca4ee16520036c0aaace03a5beb65 0.1 -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..1}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 43d727f2f3f2f7cb3b098ddad1d7038464a4cee2 -1 b3f85f210ff86d334575f64cb01c5bf49895b63e -2 8f2be433c945384c85920a8e60f2a68d2c0f20fb -#+END_SRC diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py index c84d27e..b36fc34 100644 --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -1,310 +1,298 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import logging import os import time import hglib import pytest from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) from swh.model.hashutil import hash_to_bytes from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader -from .loader_checker import ExpectedSwhids, LoaderChecker - - -def test_examples(swh_config, datadir, tmp_path): - for archive_name in ("hello", "transplant", "the-sandbox", "example"): - archive_path = os.path.join(datadir, f"{archive_name}.tgz") - json_path = os.path.join(datadir, f"{archive_name}.json") - repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - - LoaderChecker( - loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path), - ).check() def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(repo_url) assert loader.load() == {"status": "eventful"} tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" expected_snapshot = Snapshot( id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), branches={ b"develop": SnapshotBranch( target=hash_to_bytes(tip_revision_develop), target_type=TargetType.REVISION, ), b"default": SnapshotBranch( target=hash_to_bytes(tip_revision_default), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), }, ) assert_last_visit_matches( loader.storage, repo_url, status="full", type="hg", snapshot=expected_snapshot.id, ) check_snapshot(expected_snapshot, loader.storage) stats = get_stats(loader.storage) assert stats == { "content": 2, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 58, "skipped_content": 0, "snapshot": 1, } # Ensure archive loader yields the same snapshot loader2 = HgArchiveBundle20Loader( url=archive_path, archive_path=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader2.load() assert actual_load_status == {"status": "eventful"} stats2 = get_stats(loader2.storage) expected_stats = copy.deepcopy(stats) expected_stats["origin"] += 1 expected_stats["origin_visit"] += 1 assert stats2 == expected_stats # That visit yields the same snapshot assert_last_visit_matches( loader2.storage, archive_path, status="full", type="hg", snapshot=expected_snapshot.id, ) def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # then stats = get_stats(loader.storage) assert stats == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } # cf. test_loader.org for explaining from where those hashes tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") release = loader.storage.release_get([tip_release])[0] assert release is not None tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") revision = loader.storage.revision_get([tip_revision_default])[0] assert revision is not None expected_snapshot = Snapshot( id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), branches={ b"default": SnapshotBranch( target=tip_revision_default, target_type=TargetType.REVISION, ), b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), }, ) check_snapshot(expected_snapshot, loader.storage) assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full", snapshot=expected_snapshot.id, ) # Ensure archive loader yields the same snapshot loader2 = HgArchiveBundle20Loader( url=archive_path, archive_path=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader2.load() assert actual_load_status == {"status": "eventful"} stats2 = get_stats(loader2.storage) expected_stats = copy.deepcopy(stats) expected_stats["origin"] += 1 expected_stats["origin_visit"] += 1 assert stats2 == expected_stats # That visit yields the same snapshot assert_last_visit_matches( loader2.storage, archive_path, status="full", type="hg", snapshot=expected_snapshot.id, ) def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): """Failure to decompress should fail early, no data is ingested""" mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") mock_patoo.side_effect = ValueError archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") # Ensure archive loader yields the same snapshot loader = HgArchiveBundle20Loader( url=archive_path, visit_date="2016-05-03 15:16:32+00", ) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert stats == { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } # That visit yields the same snapshot assert_last_visit_matches( loader.storage, archive_path, status="partial", type="hg", snapshot=None ) def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): """Visit a mercurial repository visit transplant operations within should yield a snapshot as well. """ archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) # load hg repository actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # collect swh revisions assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" ) revisions = [] snapshot = snapshot_get_latest(loader.storage, repo_url) for branch in snapshot.branches.values(): if branch.target_type.value != "revision": continue revisions.append(branch.target) # extract original changesets info and the transplant sources hg_changesets = set() transplant_sources = set() for rev in loader.storage.revision_log(revisions): hg_changesets.add(rev["metadata"]["node"]) for k, v in rev["extra_headers"]: if k == b"transplant_source": transplant_sources.add(v.decode("ascii")) # check extracted data are valid assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources.issubset(hg_changesets) def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def clone_timeout(source, dest): time.sleep(60) monkeypatch.setattr(hglib, "clone", clone_timeout) with pytest.raises(CloneTimeoutError): HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) for record in caplog.records: assert record.levelname == "WARNING" assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage() assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1) def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def clone_return(source, dest): return (source, dest) monkeypatch.setattr(hglib, "clone", clone_return) assert HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path) def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch): log = logging.getLogger("test_clone_with_timeout") def clone_return(source, dest): raise ValueError("Test exception") monkeypatch.setattr(hglib, "clone", clone_return) with pytest.raises(ValueError) as excinfo: HgBundle20Loader.clone_with_timeout( log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 ) assert "Test exception" in excinfo.value.args[0] diff --git a/swh/loader/mercurial/tests/test_tasks_from_disk.py b/swh/loader/mercurial/tests/test_tasks_from_disk.py new file mode 100644 index 0000000..9b5370e --- /dev/null +++ b/swh/loader/mercurial/tests/test_tasks_from_disk.py @@ -0,0 +1,47 @@ +# Copyright (C) 2018-2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_loader( + mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + mock_loader = mocker.patch("swh.loader.mercurial.from_disk.HgLoaderFromDisk.load") + mock_loader.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.mercurial.tasks_from_disk.LoadMercurialFromDisk", + kwargs={"url": "origin_url", "directory": "/some/repo", "visit_date": "now",}, + ) + + assert res + res.wait() + assert res.successful() + + assert res.result == {"status": "eventful"} + mock_loader.assert_called_once_with() + + +def test_archive_loader( + mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + mock_loader = mocker.patch( + "swh.loader.mercurial.from_disk.HgArchiveLoaderFromDisk.load" + ) + mock_loader.return_value = {"status": "uneventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.mercurial.tasks_from_disk.LoadArchiveMercurialFromDisk", + kwargs={ + "url": "another_url", + "archive_path": "/some/tar.tgz", + "visit_date": "now", + }, + ) + assert res + res.wait() + assert res.successful() + + assert res.result == {"status": "uneventful"} + mock_loader.assert_called_once_with()