diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ entry_points=""" [swh.workers] loader.mercurial=swh.loader.mercurial:register + loader.mercurial_from_disk=swh.loader.mercurial:register_from_disk [console_scripts] swh-hg-identify=swh.loader.mercurial.identify:main """, diff --git a/swh/loader/mercurial/__init__.py b/swh/loader/mercurial/__init__.py --- a/swh/loader/mercurial/__init__.py +++ b/swh/loader/mercurial/__init__.py @@ -15,3 +15,13 @@ "task_modules": [f"{__name__}.tasks"], "loader": HgBundle20Loader, } + + +def register_from_disk() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .from_disk import HgLoaderFromDisk + + return { + "task_modules": [f"{__name__}.tasks_from_disk"], + "loader": HgLoaderFromDisk, + } diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/from_disk.py @@ -0,0 +1,475 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from collections import deque +from datetime import datetime, timezone +from shutil import rmtree +from tempfile import mkdtemp +from typing import Any, Deque, Dict, Optional, Tuple, Union + +import dateutil + +from swh.core.config import merge_configs +from swh.loader.core.loader import BaseLoader +from swh.loader.core.utils import clean_dangling_folders +from swh.model.from_disk import Content, DentryPerms, Directory +from swh.model.hashutil import MultiHash, hash_to_bytehex +from swh.model.model import Content as ModelContent +from swh.model.model import ( + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Sha1Git, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +from . 
import hgutil
+from .archive_extract import tmp_extract
+from .hgutil import HgNodeId
+
+FLAG_PERMS = {
+    b"l": DentryPerms.symlink,
+    b"x": DentryPerms.executable_content,
+    b"": DentryPerms.content,
+}  # type: Dict[bytes, DentryPerms]
+DEFAULT_CONFIG: Dict[str, Any] = {
+    "temp_directory": "/tmp",
+    "clone_timeout_seconds": 7200,
+    "content_cache_size": 10_000,
+}
+TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"
+
+
+def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
+    """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
+
+    `HgLoaderFromDisk` accepts `str` and `datetime` as visit date
+    while `BaseLoader` only deals with `datetime`.
+    """
+    if visit_date is None:
+        return None
+
+    if isinstance(visit_date, datetime):
+        return visit_date
+
+    if visit_date == "now":
+        return datetime.now(tz=timezone.utc)
+
+    if isinstance(visit_date, str):
+        return dateutil.parser.parse(visit_date)
+
+    raise ValueError(f"invalid visit date {visit_date!r}")
+
+
+class HgDirectory(Directory):
+    """A directory that creates parent directories if missing."""
+
+    def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None:
+        if b"/" in path:
+            head, tail = path.split(b"/", 1)
+
+            directory = self.get(head)
+            if directory is None:
+                directory = HgDirectory()
+                self[head] = directory
+
+            directory[tail] = value
+        else:
+            super().__setitem__(path, value)
+
+
+class HgLoaderFromDisk(BaseLoader):
+    """Load a mercurial repository from a local repository."""
+
+    CONFIG_BASE_FILENAME = "loader/mercurial"
+
+    visit_type = "hg"
+
+    def __init__(
+        self,
+        url: str,
+        directory: Optional[str] = None,
+        logging_class: str = "swh.loader.mercurial.LoaderFromDisk",
+        visit_date: Optional[Union[datetime, str]] = None,
+        config: Optional[Dict[str, Any]] = None,
+    ):
+        """Initialize the loader.
+
+        Args:
+            url: url of the repository.
+            directory: directory of the local repository.
+ logging_class: class of the loader logger. + visit_date: visit date of the repository + config: loader configuration + """ + super().__init__(logging_class=logging_class, config=config or {}) + + self.config = merge_configs(DEFAULT_CONFIG, self.config) + self._temp_directory = self.config["temp_directory"] + self._clone_timeout = self.config["clone_timeout_seconds"] + + self.origin_url = url + self.visit_date = parse_visit_date(visit_date) + self.directory = directory + + self._repo: Optional[hgutil.Repository] = None + self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + + @property + def repo(self) -> hgutil.Repository: + """A filtered mercurial repository. + + The repository will only return visible changesets. + If needed, use self.repo.unfiltered() to have a new repository + returning all the changesets. + """ + if self._repo is None: + self._repo = hgutil.repository(self.repo_directory) + return self._repo + + def pre_cleanup(self) -> None: + """As a first step, will try and check for dangling data to cleanup. + This should do its best to avoid raising issues. + + """ + clean_dangling_folders( + self._temp_directory, + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log, + ) + + def cleanup(self) -> None: + """Last step executed by the loader.""" + if self.repo_directory and os.path.exists(self.repo_directory): + self.log.debug(f"Cleanup up repository {self.repo_directory}") + rmtree(self.repo_directory) + + def prepare_origin_visit(self, *args, **kwargs) -> None: + """First step executed by the loader to prepare origin and visit + references. Set/update self.origin, and + optionally self.origin_url, self.visit_date. + + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self, *args, **kwargs) -> None: + """Second step executed by the loader to prepare some state needed by + the loader. 
+
+        """
+
+    def fetch_data(self) -> bool:
+        """Fetch the data from the source the loader is currently loading
+
+        Returns:
+            a value that is interpreted as a boolean. If True, fetch_data needs
+            to be called again to complete loading.
+
+        """
+        if not self.directory:  # no local repository
+            self.repo_directory = mkdtemp(
+                prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+                suffix=f"-{os.getpid()}",
+                dir=self._temp_directory,
+            )
+            self.log.debug(
+                f"Cloning {self.origin_url} to {self.repo_directory} "
+                f"with timeout {self._clone_timeout} seconds"
+            )
+            hgutil.clone(self.origin_url, self.repo_directory, self._clone_timeout)
+        else:  # existing local repository
+            # Allow to load on disk repository without cloning
+            # for testing purpose.
+
+            # TODO Allow pull new changesets from remote
+            self.repo_directory = self.directory
+
+        return False
+
+    def store_data(self):
+        """Store fetched data in the database."""
+        for rev in self.repo:
+            self.store_revision(self.repo[rev])
+
+        branch_by_hg_nodeid: Dict[HgNodeId, bytes] = {
+            hg_nodeid: name for name, hg_nodeid in hgutil.branches(self.repo).items()
+        }
+        tags_by_name: Dict[bytes, HgNodeId] = self.repo.tags()
+        tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {
+            hg_nodeid: name for name, hg_nodeid in tags_by_name.items()
+        }
+
+        snapshot_branches: Dict[bytes, SnapshotBranch] = {}
+
+        for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items():
+            tag_name = tags_by_hg_nodeid.get(hg_nodeid)
+
+            # tip is listed in the tags by the mercurial api
+            # but it is not a tag defined by the user in `.hgtags`
+            if tag_name and tag_name != b"tip":
+                snapshot_branches[tag_name] = SnapshotBranch(
+                    target=self.store_release(tag_name, revision_swhid),
+                    target_type=TargetType.RELEASE,
+                )
+
+            if hg_nodeid in branch_by_hg_nodeid:
+                name = branch_by_hg_nodeid[hg_nodeid]
+                snapshot_branches[name] = SnapshotBranch(
+                    target=revision_swhid, target_type=TargetType.REVISION,
+                )
+
+            if hg_nodeid == tags_by_name[b"tip"]:
+                snapshot_branches[b"HEAD"] =
SnapshotBranch( + target=name, target_type=TargetType.ALIAS, + ) + + snapshot = Snapshot(branches=snapshot_branches) + self.storage.snapshot_add([snapshot]) + + self.flush() + self.loaded_snapshot_id = snapshot.id + + def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Return the swhid of a revision given its hg nodeid. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the revision. + """ + return self._revision_nodeid_to_swhid[hg_nodeid] + + def get_revision_parents(self, hg_nodeid: HgNodeId) -> Tuple[Sha1Git, ...]: + """Return the swhids of the parent revisions. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhids of the parent revisions. + """ + rev_ctx = self.repo[hg_nodeid] + + parents = [] + for parent_ctx in rev_ctx.parents(): + parent_hg_nodeid = parent_ctx.node() + # nullid is the value of a parent that does not exist + if parent_hg_nodeid == hgutil.NULLID: + continue + parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)) + + return tuple(parents) + + def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: + """Store a revision given its hg nodeid. + + Args: + rev_ctx: the he revision context. + + Returns: + the swhid of the stored revision. 
+        """
+        hg_nodeid = rev_ctx.node()
+
+        root_swhid = self.store_directories(rev_ctx)
+
+        author = Person.from_fullname(rev_ctx.user())
+        (timestamp, offset) = rev_ctx.date()
+
+        # TimestampWithTimezone.from_dict will change name
+        # as it accepts more than just dicts
+        rev_date = TimestampWithTimezone.from_dict(int(timestamp))
+
+        extra_headers = [
+            (b"time_offset_seconds", str(offset).encode(),),
+        ]
+        for key, value in rev_ctx.extra().items():
+            # The default branch is skipped to match
+            # the historical implementation
+            if key == b"branch" and value == b"default":
+                continue
+
+            # transplant_source is converted to match
+            # the historical implementation
+            if key == b"transplant_source":
+                value = hash_to_bytehex(value)
+            extra_headers.append((key, value))
+
+        revision = Revision(
+            author=author,
+            date=rev_date,
+            committer=author,
+            committer_date=rev_date,
+            type=RevisionType.MERCURIAL,
+            directory=root_swhid,
+            message=rev_ctx.description(),
+            metadata={"node": hg_nodeid.hex()},
+            extra_headers=tuple(extra_headers),
+            synthetic=False,
+            parents=self.get_revision_parents(hg_nodeid),
+        )
+
+        self._revision_nodeid_to_swhid[hg_nodeid] = revision.id
+        self.storage.revision_add([revision])
+
+    def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git:
+        """Store a release given its name and its target.
+
+        A release corresponds to a user defined tag in mercurial.
+        The mercurial api has a `tip` tag that must be ignored.
+
+        Args:
+            name: name of the release.
+            target: swhid of the target revision.
+
+        Returns:
+            the swhid of the stored release.
+        """
+        release = Release(
+            name=name,
+            target=target,
+            target_type=ObjectType.REVISION,
+            message=None,
+            metadata=None,
+            synthetic=False,
+            author=Person(name=None, email=None, fullname=b""),
+            date=None,
+        )
+
+        self.storage.release_add([release])
+
+        return release.id
+
+    def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content:
+        """Store a revision content hg nodeid and file path.
+
+        Content is a mix of file content at a given revision
+        and its permissions found in the changeset's manifest.
+
+        Args:
+            rev_ctx: the hg revision context.
+            file_path: the hg path of the content.
+
+        Returns:
+            a `Content` holding only the `sha1_git` and `perms` of the file.
+        """
+        hg_nodeid = rev_ctx.node()
+        file_ctx = rev_ctx[file_path]
+
+        perms = FLAG_PERMS[file_ctx.flags()]
+        data = file_ctx.data()  # caching is simple and will come in the next revision.
+
+        content_data = MultiHash.from_data(data).digest()
+        content_data["length"] = len(data)
+        content_data["perms"] = perms
+        content_data["data"] = data
+        content_data["status"] = "visible"
+        content = Content(content_data)
+
+        model = content.to_model()
+        if isinstance(model, ModelContent):
+            self.storage.content_add([model])
+        else:
+            raise ValueError(
+                f"{file_path!r} at rev {hg_nodeid.hex()!r} "
+                f"produced {type(model)!r} instead of {ModelContent!r}"
+            )
+
+        # Here we make sure to return only necessary data.
+        return Content({"sha1_git": content.hash, "perms": perms})
+
+    def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git:
+        """Store a revision directories given its hg nodeid.
+
+        Mercurial has no directory as in git. A Git like tree must be built
+        from file paths to obtain each directory hash.
+
+        Args:
+            rev_ctx: the hg revision context.
+
+        Returns:
+            the swhid of the top level directory.
+        """
+        root = HgDirectory()
+        for file_path in rev_ctx.manifest():
+            content = self.store_content(rev_ctx, file_path)
+            root[file_path] = content
+
+        directories: Deque[Directory] = deque([root])
+        while directories:
+            directory = directories.pop()
+            self.storage.directory_add([directory.to_model()])
+            directories.extend(
+                [item for item in directory.values() if isinstance(item, Directory)]
+            )
+
+        return root.hash
+
+
+class HgArchiveLoaderFromDisk(HgLoaderFromDisk):
+    """Mercurial loader for repository wrapped within tarballs."""
+
+    def __init__(
+        self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None
+    ):
+        super().__init__(
+            url,
+            visit_date=visit_date,
+            logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk",
+        )
+        self.temp_dir = None
+        self.archive_path = archive_path
+
+    def prepare(self, *args, **kwargs):
+        """Extract the archive into `self.temp_dir` instead of cloning."""
+        self.temp_dir = tmp_extract(
+            archive=self.archive_path,
+            dir=self._temp_directory,
+            prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+            suffix=f".dump-{os.getpid()}",
+            log=self.log,
+            source=self.origin_url,
+        )
+
+        repo_name = os.listdir(self.temp_dir)[0]
+        self.directory = os.path.join(self.temp_dir, repo_name)
+        super().prepare(*args, **kwargs)
+
+    def cleanup(self) -> None:
+        """Remove the extracted archive instead of the cloned repository."""
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            rmtree(self.temp_dir)
+        super().cleanup()
+
+
+# Allow direct usage of the loader from the command line with
+# `python -m swh.loader.mercurial.from_disk $ORIGIN_URL`
+if __name__ == "__main__":
+    import logging
+
+    import click
+
+    logging.basicConfig(
+        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
+    )
+
+    @click.command()
+    @click.option("--origin-url", help="origin url")
+    @click.option("--hg-directory", help="Path to mercurial repository to load")
+    @click.option("--visit-date", default=None, help="Visit date")
+    def main(origin_url, hg_directory,
visit_date): + return HgLoaderFromDisk( + origin_url, directory=hg_directory, visit_date=visit_date + ).load() + + main() diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/hgutil.py @@ -0,0 +1,77 @@ +import io +import traceback +from multiprocessing import Process, Queue +from typing import Dict, NewType + +# The internal Mercurial API is not guaranteed to be stable. +import mercurial.ui # type: ignore +from mercurial import context, hg + +NULLID = mercurial.node.nullid +HgNodeId = NewType("HgNodeId", bytes) +Repository = hg.localrepo +BaseContext = context.basectx + + +def repository(path: str) -> hg.localrepo: + ui = mercurial.ui.ui.load() + return hg.repository(ui, path.encode()) + + +def branches(repo: hg.localrepo) -> Dict[bytes, HgNodeId]: + """List repository named branches and their tip node.""" + result = {} + for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): + if isclosed: + continue + result[tag] = tip + return result + + +class CloneTimeout(Exception): + pass + + +class CloneFailure(Exception): + pass + + +def _clone_task(src: str, dest: str, errors: Queue) -> None: + """Clone task to run in a subprocess. + + Args: + src: clone source + dest: clone destination + errors: message queue to communicate errors + """ + try: + hg.clone(mercurial.ui.ui.load(), {}, src.encode(), dest.encode()) + except Exception as e: + exc_buffer = io.StringIO() + traceback.print_exc(file=exc_buffer) + errors.put_nowait(exc_buffer.getvalue()) + raise e + + +def clone(src: str, dest: str, timeout: int) -> None: + """Clone a repository with timeout. 
+ + Args: + src: clone source + dest: clone destination + timeout: timeout in seconds + """ + errors: Queue = Queue() + process = Process(target=_clone_task, args=(src, dest, errors)) + process.start() + process.join(timeout) + + if process.is_alive(): + process.terminate() + process.join(1) + if process.is_alive(): + process.kill() + raise CloneTimeout(src, timeout) + + if not errors.empty(): + raise CloneFailure(src, dest, errors.get()) diff --git a/swh/loader/mercurial/tasks_from_disk.py b/swh/loader/mercurial/tasks_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tasks_from_disk.py @@ -0,0 +1,33 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from .from_disk import HgArchiveLoaderFromDisk, HgLoaderFromDisk + + +@shared_task(name=__name__ + ".LoadMercurialFromDisk") +def load_hg(*, url, directory=None, visit_date=None): + """Mercurial repository loading + + Import a mercurial tarball into swh. + + Args: see :func:`DepositLoader.load`. + + """ + loader = HgLoaderFromDisk(url, directory=directory, visit_date=visit_date) + return loader.load() + + +@shared_task(name=__name__ + ".LoadArchiveMercurialFromDisk") +def load_hg_from_archive(*, url, archive_path=None, visit_date=None): + """Import a mercurial tarball into swh. + + Args: see :func:`DepositLoader.load`. 
+ """ + loader = HgArchiveLoaderFromDisk( + url, archive_path=archive_path, visit_date=visit_date + ) + return loader.load() diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_from_disk.py copy from swh/loader/mercurial/tests/test_loader.py copy to swh/loader/mercurial/tests/test_from_disk.py --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -1,15 +1,9 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import copy -import logging import os -import time - -import hglib -import pytest from swh.loader.tests import ( assert_last_visit_matches, @@ -17,14 +11,30 @@ get_stats, prepare_repository_from_archive, ) +from swh.model.from_disk import Content from swh.model.hashutil import hash_to_bytes from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest -from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader +from ..from_disk import HgDirectory, HgLoaderFromDisk from .loader_checker import ExpectedSwhids, LoaderChecker +def test_hg_directory_creates_missing_directories(): + directory = HgDirectory() + directory[b"path/to/some/content"] = Content() + + +# Those tests assert expectations on repository loading +# by reading expected values from associated json files +# produced by the `swh-hg-identify` command line utility. +# +# It has more granularity than historical tests. +# Assertions will tell if the error comes from the directories +# revisions or release rather than only checking the snapshot. +# +# With more work it should event be possible to know which part +# of an object is faulty. 
def test_examples(swh_config, datadir, tmp_path): for archive_name in ("hello", "transplant", "the-sandbox", "example"): archive_path = os.path.join(datadir, f"{archive_name}.tgz") @@ -32,17 +42,21 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) LoaderChecker( - loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path), + loader=HgLoaderFromDisk(repo_url), expected=ExpectedSwhids.load(json_path), ).check() +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(repo_url) + loader = HgLoaderFromDisk(url=repo_url) assert loader.load() == {"status": "eventful"} @@ -84,39 +98,19 @@ "snapshot": 1, } - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. 
+# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" + archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) + loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00") actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} @@ -163,64 +157,11 @@ snapshot=expected_snapshot.id, ) - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - - -def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): - """Failure to decompress should fail early, no data is ingested""" - mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") - mock_patoo.side_effect = ValueError - - archive_name = "hello" - archive_path = os.path.join(datadir, f"{archive_name}.tgz") - # Ensure archive loader yields the same snapshot - loader = HgArchiveBundle20Loader( - url=archive_path, visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader.load() - assert actual_load_status == {"status": "failed"} - - stats = get_stats(loader.storage) - assert stats == { - 
"content": 0, - "directory": 0, - "origin": 1, - "origin_visit": 1, - "release": 0, - "revision": 0, - "skipped_content": 0, - "snapshot": 0, - } - # That visit yields the same snapshot - assert_last_visit_matches( - loader.storage, archive_path, status="partial", type="hg", snapshot=None - ) - +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): """Visit a mercurial repository visit transplant operations within should yield a snapshot as well. @@ -230,7 +171,8 @@ archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) + + loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00") # load hg repository actual_load_status = loader.load() @@ -261,50 +203,3 @@ assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources.issubset(hg_changesets) - - -def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_timeout(source, dest): - time.sleep(60) - - monkeypatch.setattr(hglib, "clone", clone_timeout) - - with pytest.raises(CloneTimeoutError): - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - - for record in caplog.records: - assert record.levelname == "WARNING" - assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage() - assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1) - - -def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def 
clone_return(source, dest): - return (source, dest) - - monkeypatch.setattr(hglib, "clone", clone_return) - - assert HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path) - - -def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_return(source, dest): - raise ValueError("Test exception") - - monkeypatch.setattr(hglib, "clone", clone_return) - - with pytest.raises(ValueError) as excinfo: - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - assert "Test exception" in excinfo.value.args[0] diff --git a/swh/loader/mercurial/tests/test_hgutil.py b/swh/loader/mercurial/tests/test_hgutil.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/test_hgutil.py @@ -0,0 +1,46 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import time +import traceback + +import pytest +from mercurial import hg # type: ignore + +from .. 
import hgutil + + +def test_clone_timeout(monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + timeout = 1 + + def clone(*args): + time.sleep(5) + + monkeypatch.setattr(hg, "clone", clone) + + with pytest.raises(hgutil.CloneTimeout) as e: + hgutil.clone(src, dest, timeout) + assert e.value.args == (src, timeout) + + +def test_clone_error(caplog, tmp_path, monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + expected_traceback = "Some traceback" + + def clone(*args): + raise ValueError() + + def print_exc(file): + file.write(expected_traceback) + + monkeypatch.setattr(hg, "clone", clone) + monkeypatch.setattr(traceback, "print_exc", print_exc) + + with pytest.raises(hgutil.CloneFailure) as e: + hgutil.clone(src, dest, 1) + assert e.value.args == (src, dest, expected_traceback) diff --git a/swh/loader/mercurial/tests/test_loader.org b/swh/loader/mercurial/tests/test_loader.org deleted file mode 100644 --- a/swh/loader/mercurial/tests/test_loader.org +++ /dev/null @@ -1,121 +0,0 @@ -#+title: Where the loader test data comes from - -Mercurial repositories are archived within the folder -swh/loader/mercurial/tests/resources. They contain mercurial -repository. - -The following demonstrates the commands executed from within the -repository to retrieve information. 
- -* the-sandbox - -Archive: the-sandbox.tgz - -** branches - -Listing of branches and their tip: -#+BEGIN_SRC sh -$ hg branches -develop 57:76cc0882284d -default 2:2f13849f14f5 (inactive) -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..57}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 e2e117569b086ceabeeedee4acd95f35298d4553 -1 9cd8160c67ac4b0bc97e2e2cd918a580425167d3 -2 180bd57623a7c2c47a8c43514a5f4d903503d0aa -3 180bd57623a7c2c47a8c43514a5f4d903503d0aa -4 180bd57623a7c2c47a8c43514a5f4d903503d0aa -5 180bd57623a7c2c47a8c43514a5f4d903503d0aa -6 180bd57623a7c2c47a8c43514a5f4d903503d0aa -7 180bd57623a7c2c47a8c43514a5f4d903503d0aa -8 180bd57623a7c2c47a8c43514a5f4d903503d0aa -9 180bd57623a7c2c47a8c43514a5f4d903503d0aa -10 180bd57623a7c2c47a8c43514a5f4d903503d0aa -11 180bd57623a7c2c47a8c43514a5f4d903503d0aa -12 180bd57623a7c2c47a8c43514a5f4d903503d0aa -13 180bd57623a7c2c47a8c43514a5f4d903503d0aa -14 180bd57623a7c2c47a8c43514a5f4d903503d0aa -15 180bd57623a7c2c47a8c43514a5f4d903503d0aa -16 180bd57623a7c2c47a8c43514a5f4d903503d0aa -17 180bd57623a7c2c47a8c43514a5f4d903503d0aa -18 180bd57623a7c2c47a8c43514a5f4d903503d0aa -19 180bd57623a7c2c47a8c43514a5f4d903503d0aa -20 180bd57623a7c2c47a8c43514a5f4d903503d0aa -21 180bd57623a7c2c47a8c43514a5f4d903503d0aa -22 180bd57623a7c2c47a8c43514a5f4d903503d0aa -23 180bd57623a7c2c47a8c43514a5f4d903503d0aa -24 180bd57623a7c2c47a8c43514a5f4d903503d0aa -25 180bd57623a7c2c47a8c43514a5f4d903503d0aa -26 180bd57623a7c2c47a8c43514a5f4d903503d0aa -27 180bd57623a7c2c47a8c43514a5f4d903503d0aa -28 180bd57623a7c2c47a8c43514a5f4d903503d0aa -29 180bd57623a7c2c47a8c43514a5f4d903503d0aa -30 180bd57623a7c2c47a8c43514a5f4d903503d0aa -31 180bd57623a7c2c47a8c43514a5f4d903503d0aa -32 180bd57623a7c2c47a8c43514a5f4d903503d0aa -33 180bd57623a7c2c47a8c43514a5f4d903503d0aa -34 180bd57623a7c2c47a8c43514a5f4d903503d0aa -35 180bd57623a7c2c47a8c43514a5f4d903503d0aa -36 180bd57623a7c2c47a8c43514a5f4d903503d0aa -37 
180bd57623a7c2c47a8c43514a5f4d903503d0aa -38 180bd57623a7c2c47a8c43514a5f4d903503d0aa -39 180bd57623a7c2c47a8c43514a5f4d903503d0aa -40 180bd57623a7c2c47a8c43514a5f4d903503d0aa -41 180bd57623a7c2c47a8c43514a5f4d903503d0aa -42 180bd57623a7c2c47a8c43514a5f4d903503d0aa -43 180bd57623a7c2c47a8c43514a5f4d903503d0aa -44 180bd57623a7c2c47a8c43514a5f4d903503d0aa -45 180bd57623a7c2c47a8c43514a5f4d903503d0aa -46 180bd57623a7c2c47a8c43514a5f4d903503d0aa -47 180bd57623a7c2c47a8c43514a5f4d903503d0aa -48 180bd57623a7c2c47a8c43514a5f4d903503d0aa -49 180bd57623a7c2c47a8c43514a5f4d903503d0aa -50 180bd57623a7c2c47a8c43514a5f4d903503d0aa -51 180bd57623a7c2c47a8c43514a5f4d903503d0aa -52 180bd57623a7c2c47a8c43514a5f4d903503d0aa -53 180bd57623a7c2c47a8c43514a5f4d903503d0aa -54 180bd57623a7c2c47a8c43514a5f4d903503d0aa -55 180bd57623a7c2c47a8c43514a5f4d903503d0aa -56 180bd57623a7c2c47a8c43514a5f4d903503d0aa -57 180bd57623a7c2c47a8c43514a5f4d903503d0aa -#+END_SRC - -Note: swh-hashtree is a cli tool defined in swh-model/bin/swh-hashtree - -* hello - -Archive: hello.tgz - -** branches - -#+BEGIN_SRC sh -$ hg branches -default 1:82e55d328c8c -#+END_SRC - -** tags - -I added a tag to have some more data to load (1st repository has no tags): -#+BEGIN_SRC sh -$ hg tags -tip 2:b985ae4a07e1 -0.1 1:82e55d328c8c -#+END_SRC - -#+BEGIN_SRC sh -$ cat .hgtags -82e55d328c8ca4ee16520036c0aaace03a5beb65 0.1 -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..1}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 43d727f2f3f2f7cb3b098ddad1d7038464a4cee2 -1 b3f85f210ff86d334575f64cb01c5bf49895b63e -2 8f2be433c945384c85920a8e60f2a68d2c0f20fb -#+END_SRC diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -22,18 +22,6 @@ from swh.storage.algos.snapshot import snapshot_get_latest from ..loader import CloneTimeoutError, 
HgArchiveBundle20Loader, HgBundle20Loader -from .loader_checker import ExpectedSwhids, LoaderChecker - - -def test_examples(swh_config, datadir, tmp_path): - for archive_name in ("hello", "transplant", "the-sandbox", "example"): - archive_path = os.path.join(datadir, f"{archive_name}.tgz") - json_path = os.path.join(datadir, f"{archive_name}.json") - repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - - LoaderChecker( - loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path), - ).check() def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): diff --git a/swh/loader/mercurial/tests/test_tasks_from_disk.py b/swh/loader/mercurial/tests/test_tasks_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/test_tasks_from_disk.py @@ -0,0 +1,47 @@ +# Copyright (C) 2018-2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_loader( + mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + mock_loader = mocker.patch("swh.loader.mercurial.from_disk.HgLoaderFromDisk.load") + mock_loader.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.mercurial.tasks_from_disk.LoadMercurialFromDisk", + kwargs={"url": "origin_url", "directory": "/some/repo", "visit_date": "now",}, + ) + + assert res + res.wait() + assert res.successful() + + assert res.result == {"status": "eventful"} + mock_loader.assert_called_once_with() + + +def test_archive_loader( + mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + mock_loader = mocker.patch( + "swh.loader.mercurial.from_disk.HgArchiveLoaderFromDisk.load" + ) + mock_loader.return_value = {"status": "uneventful"} + + res = swh_scheduler_celery_app.send_task( + 
"swh.loader.mercurial.tasks_from_disk.LoadArchiveMercurialFromDisk", + kwargs={ + "url": "another_url", + "archive_path": "/some/tar.tgz", + "visit_date": "now", + }, + ) + assert res + res.wait() + assert res.successful() + + assert res.result == {"status": "uneventful"} + mock_loader.assert_called_once_with()