diff --git a/swh/loader/mercurial/__init__.py b/swh/loader/mercurial/__init__.py --- a/swh/loader/mercurial/__init__.py +++ b/swh/loader/mercurial/__init__.py @@ -9,9 +9,9 @@ def register() -> Mapping[str, Any]: """Register the current worker module's definition""" - from .loader import HgBundle20Loader + from .loader import HgLoader return { "task_modules": [f"{__name__}.tasks"], - "loader": HgBundle20Loader, + "loader": HgLoader, } diff --git a/swh/loader/mercurial/cli.py b/swh/loader/mercurial/cli.py --- a/swh/loader/mercurial/cli.py +++ b/swh/loader/mercurial/cli.py @@ -4,8 +4,8 @@ # See top-level LICENSE file for more information import datetime -from itertools import chain import logging +from itertools import chain import click @@ -44,11 +44,11 @@ visit_date = datetime.datetime.now(tz=datetime.timezone.utc) kwargs = {"visit_date": visit_date, "origin_url": origin_url} if hg_archive: - from .loader import HgArchiveBundle20Loader as HgLoader + from .loader import HgArchiveLoader as HgLoader kwargs["archive_path"] = hg_archive else: - from .loader import HgBundle20Loader as HgLoader + from .loader import HgLoader kwargs["directory"] = hg_directory diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/from_bundle.py copy from swh/loader/mercurial/loader.py copy to swh/loader/mercurial/from_bundle.py --- a/swh/loader/mercurial/loader.py +++ b/swh/loader/mercurial/from_bundle.py @@ -19,17 +19,17 @@ import datetime import os -from queue import Empty import random import re +import time +from queue import Empty from shutil import rmtree from tempfile import mkdtemp -import time from typing import Any, Dict, Iterable, List, Optional import billiard -from dateutil import parser import hglib +from dateutil import parser from swh.core.config import merge_configs from swh.loader.core.loader import DVCSLoader diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/from_disk.py @@ -0,0 +1,467 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from datetime import datetime, timezone +from shutil import rmtree +from tempfile import mkdtemp +from typing import Any, Dict, List, Optional, Union + +import dateutil +from mercurial.util import lrucachedict # type: ignore + +from swh.core.config import merge_configs +from swh.loader.core.loader import BaseLoader +from swh.loader.core.utils import clean_dangling_folders +from swh.model.from_disk import Content, DentryPerms, Directory +from swh.model.hashutil import MultiHash, hash_to_bytehex +from swh.model.model import Content as ModelContent +from swh.model.model import ( + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Sha1Git, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +from . 
import hgutil +from .archive_extract import tmp_extract +from .hgutil import HgNodeId + +FLAG_PERMS = { + b"l": DentryPerms.symlink, + b"x": DentryPerms.executable_content, + b"": DentryPerms.content, +} # type: Dict[bytes, DentryPerms] +DEFAULT_CONFIG: Dict[str, Any] = { + "temp_directory": "/tmp", + "clone_timeout_seconds": 7200, + "content_cache_size": 10_000, +} +TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" + + +def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]: + if visit_date is None: + return None + + if isinstance(visit_date, datetime): + return visit_date + + if visit_date == "now": + return datetime.now(tz=timezone.utc) + + if isinstance(visit_date, str): + return dateutil.parser.parse(visit_date) + + raise ValueError(f"invalid visit date {visit_date!r}") + + +class HgRootDirectory(Directory): + def add_file(self, path: bytes, content: Content) -> None: + """Add a file, creating missing directories if needed.""" + parts = [part for part in path.split(os.path.sep.encode()) if part] + + current_dir = self + while len(parts) > 1: + part = parts.pop(0) + if part not in current_dir: + current_dir[part] = Directory() + current_dir = current_dir[part] + part = parts.pop(0) + current_dir[part] = content + + +class HgLoaderFromDisk(BaseLoader): + """Load a mercurial repository from a local repository.""" + + CONFIG_BASE_FILENAME = "loader/mercurial" + + visit_type = "hg" + + def __init__( + self, + url: str, + directory: Optional[str] = None, + logging_class: str = "swh.loader.mercurial.Loader", + visit_date: Optional[Union[datetime, str]] = None, + config: Optional[Dict[str, Any]] = None, + ): + """Initialize the loader. + + Args: + url: url of the repository. + directory: directory of the local repository. + logging_class: class of the loader logger. + visit_date: visit date of the repository + config: loader configuration + """ + super().__init__(logging_class=logging_class, config=config or {}) + + self.config = merge_configs(DEFAULT_CONFIG, self.config) + self.temp_directory = self.config["temp_directory"] + self.clone_timeout = self.config["clone_timeout_seconds"] + + self.origin_url = url + self.visit_date = parse_visit_date(visit_date) + self.directory = directory + + self._repo: Optional[hgutil.Repository] = None + self.revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + + # Cache the content hash across revisions to avoid recalculation. + self.content_hash_cache: lrucachedict = lrucachedict( + self.config["content_cache_size"], + ) + + @property + def repo(self) -> hgutil.Repository: + """A filtered mercurial repository. + + The repository will only return visible changesets. + If needed, use self.repo.unfiltered() to have a new repository + returning all the changesets. + """ + if self._repo is None: + self._repo = hgutil.repository(self.repo_directory) + return self._repo + + def pre_cleanup(self) -> None: + """As a first step, will try and check for dangling data to clean up. + This should do its best to avoid raising issues. + + """ + clean_dangling_folders( + self.temp_directory, + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log, + ) + + def cleanup(self) -> None: + """Last step executed by the loader.""" + if self.repo_directory and os.path.exists(self.repo_directory): + self.log.debug(f"Cleaning up repository {self.repo_directory}") + rmtree(self.repo_directory) + + def prepare_origin_visit(self, *args, **kwargs) -> None: + """First step executed by the loader to prepare origin and visit + references.
Set/update self.origin, and + optionally self.origin_url, self.visit_date. + + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self, *args, **kwargs) -> None: + """Second step executed by the loader to prepare some state needed by + the loader. + + """ + + def fetch_data(self) -> bool: + """Fetch the data from the source the loader is currently loading + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. + + """ + if not self.directory: # no local repository + self.repo_directory = mkdtemp( + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix=f"-{os.getpid()}", + dir=self.temp_directory, + ) + self.log.debug( + f"Cloning {self.origin_url} to {self.repo_directory} " + f"with timeout {self.clone_timeout} seconds" + ) + hgutil.clone(self.origin_url, self.repo_directory, self.clone_timeout) + else: # existing local repository + self.repo_directory = self.directory + + return False + + def store_data(self): + """Store fetched data in the database.""" + for rev in self.repo: + hg_nodeid = self.repo[rev].node() + revision_swhid = self.store_revision(hg_nodeid) + self.revision_nodeid_to_swhid[hg_nodeid] = revision_swhid + + branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in hgutil.branches(self.repo).items() + } + tags_by_name: Dict[bytes, HgNodeId] = self.repo.tags() + tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in tags_by_name.items() + } + + snapshot_branches: Dict[bytes, SnapshotBranch] = {} + + for hg_nodeid, revision_swhid in self.revision_nodeid_to_swhid.items(): + tag_name = tags_by_hg_nodeid.get(hg_nodeid) + + # tip is listed in the tags by the mercurial api + # but it's not a tag defined by the user in `.hgtags` + if tag_name and tag_name != b"tip": + snapshot_branches[tag_name] = SnapshotBranch( + target=self.store_release(tag_name, revision_swhid), + target_type=TargetType.RELEASE, + ) + + if hg_nodeid in branch_by_hg_nodeid: + name = branch_by_hg_nodeid[hg_nodeid] + snapshot_branches[name] = SnapshotBranch( + target=revision_swhid, target_type=TargetType.REVISION, + ) + + if hg_nodeid == tags_by_name[b"tip"]: + snapshot_branches[b"HEAD"] = SnapshotBranch( + target=name, target_type=TargetType.ALIAS, + ) + + snapshot = Snapshot(branches=snapshot_branches) + self.storage.snapshot_add([snapshot]) + + self.flush() + self.loaded_snapshot_id = snapshot.id + + def store_revision(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Store a revision given its hg nodeid. + + If a parent revision of the revision to store is not yet stored, + it is recursively stored. Revisions are stored from oldest to newest, + thus the recursion limit will not be reached. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the stored revision.
+ """ + rev_ctx = self.repo[hg_nodeid] + + root_swhid = self.store_directories(hg_nodeid) + + author = Person.from_fullname(rev_ctx.user()) + (timestamp, offset) = rev_ctx.date() + + # TimestampWithTimezone.from_dict will change name + # as it accept more than just dicts + rev_date = TimestampWithTimezone.from_dict(int(timestamp)) + + extra_headers = [ + (b"time_offset_seconds", str(offset).encode(),), + ] + for key, value in rev_ctx.extra().items(): + # The default branch is skipped to match + # the historical implementation + if key == b"branch" and value == b"default": + continue + + # transplant_source is converted to match + # the historical implementation + if key == b"transplant_source": + value = hash_to_bytehex(value) + extra_headers.append((key, value)) + + parents = [] + for parent_ctx in rev_ctx.parents(): + parent_hg_nodeid = parent_ctx.node() + # nullid is the value of a parent that does not exist + if parent_hg_nodeid == hgutil.NULLID: + continue + + if parent_hg_nodeid not in self.revision_nodeid_to_swhid: + parent_revision_swhid = self.store_revision(parent_hg_nodeid) + self.revision_nodeid_to_swhid[parent_hg_nodeid] = parent_revision_swhid + + parents.append(self.revision_nodeid_to_swhid[parent_hg_nodeid]) + + revision = Revision( + author=author, + date=rev_date, + committer=author, + committer_date=rev_date, + type=RevisionType.MERCURIAL, + directory=root_swhid, + message=rev_ctx.description(), + metadata={"node": hg_nodeid.hex()}, + extra_headers=tuple(extra_headers), + synthetic=False, + parents=tuple(parents), + ) + + self.storage.revision_add([revision]) + + return revision.id + + def store_release(self, name: bytes, target=Sha1Git) -> Sha1Git: + """Store a release given its name and its target. + + Args: + name: name of the release. + target: swhid of the target revision. + + Returns: + the swhid of the stored release. + """ + release = Release( + name=name, + target=target, + target_type=ObjectType.REVISION, + message=None, + metadata=None, + synthetic=False, + author=Person(name=None, email=None, fullname=b""), + date=None, + ) + + self.storage.release_add([release]) + + return release.id + + def store_content(self, hg_nodeid: HgNodeId, file_path: bytes) -> Content: + """Store a revision content hg nodeid and file path. + + Args: + hg_nodeid: the hg nodeid of the revision. + file_path: the hg path of the content. + + Returns: + the swhid of the top level directory. + """ + rev_ctx = self.repo[hg_nodeid] + file_ctx = rev_ctx[file_path] + + file_nodeid = file_ctx._fileid + perms = FLAG_PERMS[file_ctx.flags()] + + # Key is file_nodeid + perms because permissions does not participate + # in content hash in hg while it is the case in swh. 
+ cache_key = (file_nodeid, perms) + + sha1_git = self.content_hash_cache.get(cache_key) + if sha1_git is not None: + return Content({"sha1_git": sha1_git, "perms": perms}) + + data = file_ctx.data() + + content_data = MultiHash.from_data(data).digest() + content_data["length"] = len(data) + content_data["perms"] = perms + content_data["data"] = data + content_data["status"] = "visible" + content = Content(content_data) + + model = content.to_model() + if isinstance(model, ModelContent): + self.storage.content_add([model]) + else: + raise ValueError( + f"{file_path!r} at rev {hg_nodeid.hex()!r} " + f"produced {type(model)!r} instead of {ModelContent!r}" + ) + + self.content_hash_cache[cache_key] = content.hash + + return content + + def store_directories(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Store the directories of a revision given its hg nodeid. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the top level directory. + """ + rev_ctx = self.repo[hg_nodeid] + + root = HgRootDirectory() + for file_path in rev_ctx.manifest(): + content = self.store_content(hg_nodeid, file_path) + root.add_file(file_path, content) + + directories: List[Directory] = [root] + while directories: + directory = directories.pop(0) + self.storage.directory_add([directory.to_model()]) + directories.extend( + [item for item in directory.values() if isinstance(item, Directory)] + ) + + return root.hash + + +class HgArchiveLoaderFromDisk(HgLoaderFromDisk): + def __init__( + self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None + ): + super().__init__( + url, + visit_date=visit_date, + logging_class="swh.loader.mercurial.ArchiveLoader", + ) + self.temp_dir = None + self.archive_path = archive_path + + def prepare(self, *args, **kwargs): + self.temp_dir = tmp_extract( + archive=self.archive_path, + dir=self.temp_directory, + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + suffix=f".dump-{os.getpid()}", + log=self.log, + source=self.origin_url, + ) + + repo_name = os.listdir(self.temp_dir)[0] + self.directory = os.path.join(self.temp_dir, repo_name) + super().prepare(*args, **kwargs) + + def cleanup(self) -> None: + if self.temp_dir and os.path.exists(self.temp_dir): + rmtree(self.temp_dir) + super().cleanup() + + +def get_hg_branches(repo) -> Dict[bytes, HgNodeId]: + """Equivalent of `hg branches`. + + For some reason, there is no direct way to get the same result of `hg branches` in + the localrepository interface. + """ + result = {} + for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): + if isclosed: + continue + result[tag] = tip + return result + + +if __name__ == "__main__": + import logging + + import click + + logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" + ) + + @click.command() + @click.option("--origin-url", help="origin url") + @click.option("--hg-directory", help="Path to mercurial repository to load") + @click.option("--visit-date", default=None, help="Visit date") + def main(origin_url, hg_directory, visit_date): + return HgLoaderFromDisk( + url=origin_url, directory=hg_directory, visit_date=visit_date + ).load() + + main() diff --git a/swh/loader/mercurial/hgutil.py b/swh/loader/mercurial/hgutil.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/hgutil.py @@ -0,0 +1,66 @@ +import io +import traceback +from multiprocessing import Process, Queue +from typing import Dict, NewType + +# The internal Mercurial API is not guaranteed to be stable.
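+# Only a few internals are used here: opening a repository, reading its +# branchmap and cloning with a timeout in a separate process.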
+import mercurial.ui # type: ignore +from mercurial import hg + +NULLID = mercurial.node.nullid +HgNodeId = NewType("HgNodeId", bytes) +Repository = hg.localrepo + + +def repository(path: str) -> hg.localrepo: + ui = mercurial.ui.ui.load() + return hg.repository(ui, path.encode()) + + +def branches(repo: hg.localrepo) -> Dict[bytes, HgNodeId]: + """Equivalent of `hg branches`. + + For some reason, there is no direct way to get the same result of `hg branches` in + the localrepository interface. + """ + result = {} + for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): + if isclosed: + continue + result[tag] = tip + return result + + +class CloneTimeout(Exception): + pass + + +class CloneFailure(Exception): + pass + + +def _clone_task(src: str, dest: str, errors: Queue) -> None: + try: + hg.clone(mercurial.ui.ui.load(), {}, src.encode(), dest.encode()) + except Exception as e: + exc_buffer = io.StringIO() + traceback.print_exc(file=exc_buffer) + errors.put_nowait(exc_buffer.getvalue()) + raise e + + +def clone(src: str, dest: str, timeout: int) -> None: + errors: Queue = Queue() + process = Process(target=_clone_task, args=(src, dest, errors)) + process.start() + process.join(timeout) + + if process.is_alive(): + process.terminate() + process.join(1) + if process.is_alive(): + process.kill() + raise CloneTimeout(src, timeout) + + if not errors.empty(): + raise CloneFailure(src, dest, errors.get()) diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py --- a/swh/loader/mercurial/loader.py +++ b/swh/loader/mercurial/loader.py @@ -1,641 +1,6 @@ -# Copyright (C) 2017-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information +# from .from_bundle import HgArchiveBundle20Loader as HgArchiveLoader +# from .from_bundle import HgBundle20Loader as HgLoader +from .from_disk import HgArchiveLoaderFromDisk as HgArchiveLoader +from .from_disk import HgLoaderFromDisk as HgLoader -"""This document contains a SWH loader for ingesting repository data -from Mercurial version 2 bundle files. - -""" - -# NOTE: The code here does expensive work twice in places because of the -# intermediate need to check for what is missing before sending to the database -# and the desire to not juggle very large amounts of data. - -# TODO: Decide whether to also serialize to disk and read back more quickly -# from there. Maybe only for very large repos and fast drives. -# - Avi - - -import datetime -import os -from queue import Empty -import random -import re -from shutil import rmtree -from tempfile import mkdtemp -import time -from typing import Any, Dict, Iterable, List, Optional - -import billiard -from dateutil import parser -import hglib - -from swh.core.config import merge_configs -from swh.loader.core.loader import DVCSLoader -from swh.loader.core.utils import clean_dangling_folders -from swh.model import identifiers -from swh.model.hashutil import ( - DEFAULT_ALGORITHMS, - MultiHash, - hash_to_bytehex, - hash_to_bytes, - hash_to_hex, -) -from swh.model.model import ( - BaseContent, - Content, - Directory, - ObjectType, - Origin, - Person, - Release, - Revision, - RevisionType, - Sha1Git, - SkippedContent, - Snapshot, - SnapshotBranch, - TargetType, - TimestampWithTimezone, -) -from swh.storage.algos.origin import origin_get_latest_visit_status - -from . 
import converters -from .archive_extract import tmp_extract -from .bundle20_reader import Bundle20Reader -from .converters import PRIMARY_ALGO as ALGO -from .objects import SelectiveCache, SimpleTree - -TAG_PATTERN = re.compile("[0-9A-Fa-f]{40}") - -TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial." - -HEAD_POINTER_NAME = b"tip" - - -class CloneTimeoutError(Exception): - pass - - -DEFAULT_CONFIG: Dict[str, Any] = { - "bundle_filename": "HG20_none_bundle", - "reduce_effort": False, - "temp_directory": "/tmp", - "cache1_size": 800 * 1024 * 1024, - "cache2_size": 800 * 1024 * 1024, - "clone_timeout_seconds": 7200, -} - - -class HgBundle20Loader(DVCSLoader): - """Mercurial loader able to deal with remote or local repository. - - """ - - visit_type = "hg" - - def __init__( - self, - url, - visit_date=None, - directory=None, - logging_class="swh.loader.mercurial.Bundle20Loader", - ): - super().__init__(logging_class=logging_class) - self.config = merge_configs(DEFAULT_CONFIG, self.config) - self.origin_url = url - self.visit_date = visit_date - self.directory = directory - self.bundle_filename = self.config["bundle_filename"] - self.reduce_effort_flag = self.config["reduce_effort"] - self.empty_repository = None - self.temp_directory = self.config["temp_directory"] - self.cache1_size = self.config["cache1_size"] - self.cache2_size = self.config["cache2_size"] - self.clone_timeout = self.config["clone_timeout_seconds"] - self.working_directory = None - self.bundle_path = None - self.heads = {} - self.releases = {} - self.last_snapshot_id: Optional[bytes] = None - - def pre_cleanup(self): - """Cleanup potential dangling files from prior runs (e.g. OOM killed - tasks) - - """ - clean_dangling_folders( - self.temp_directory, - pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, - log=self.log, - ) - - def cleanup(self): - """Clean temporary working directory - - """ - if self.bundle_path and os.path.exists(self.bundle_path): - self.log.debug("Cleanup up working bundle %s" % self.bundle_path) - os.unlink(self.bundle_path) - if self.working_directory and os.path.exists(self.working_directory): - self.log.debug( - "Cleanup up working directory %s" % (self.working_directory,) - ) - rmtree(self.working_directory) - - def get_heads(self, repo): - """Read the closed branches heads (branch, bookmarks) and returns a - dict with key the branch_name (bytes) and values the tuple - (pointer nature (bytes), mercurial's node id - (bytes)). Those needs conversion to swh-ids. This is taken - care of in get_revisions. 
- - """ - b = {} - for _, node_hash_id, pointer_nature, branch_name, *_ in repo.heads(): - b[branch_name] = (pointer_nature, hash_to_bytes(node_hash_id.decode())) - - bookmarks = repo.bookmarks() - if bookmarks and bookmarks[0]: - for bookmark_name, _, target_short in bookmarks[0]: - target = repo[target_short].node() - b[bookmark_name] = (None, hash_to_bytes(target.decode())) - - return b - - def prepare_origin_visit(self, *args, **kwargs) -> None: - self.origin = Origin(url=self.origin_url) - visit_date = self.visit_date - if isinstance(visit_date, str): # visit_date can be string or datetime - visit_date = parser.parse(visit_date) - self.visit_date = visit_date - visit_and_status = origin_get_latest_visit_status( - self.storage, self.origin_url, require_snapshot=True - ) - if visit_and_status is None: - self.last_snapshot_id = None - else: - _, visit_status = visit_and_status - self.last_snapshot_id = visit_status.snapshot - - @staticmethod - def clone_with_timeout(log, origin, destination, timeout): - queue = billiard.Queue() - start = time.monotonic() - - def do_clone(queue, origin, destination): - try: - result = hglib.clone(source=origin, dest=destination) - except BaseException as e: - queue.put(e) - else: - queue.put(result) - - process = billiard.Process(target=do_clone, args=(queue, origin, destination)) - process.start() - - while True: - try: - result = queue.get(timeout=0.1) - break - except Empty: - duration = time.monotonic() - start - if timeout and duration > timeout: - log.warning( - "Timeout cloning `%s` within %s seconds", origin, timeout - ) - process.terminate() - process.join() - raise CloneTimeoutError(origin, timeout) - continue - - process.join() - if isinstance(result, Exception): - raise result from None - - return result - - def prepare(self, *args, **kwargs): - """Prepare the necessary steps to load an actual remote or local - repository. - - To load a local repository, pass the optional directory - parameter as filled with a path to a real local folder. - - To load a remote repository, pass the optional directory - parameter as None. 
- - Args: - origin_url (str): Origin url to load - visit_date (str/datetime): Date of the visit - directory (str/None): The local directory to load - - """ - self.branches = {} - self.tags = [] - self.releases = {} - self.node_2_rev = {} - self.heads = {} - - directory = self.directory - - if not directory: # remote repository - self.working_directory = mkdtemp( - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - suffix="-%s" % os.getpid(), - dir=self.temp_directory, - ) - os.makedirs(self.working_directory, exist_ok=True) - self.hgdir = self.working_directory - - self.log.debug( - "Cloning %s to %s with timeout %s seconds", - self.origin_url, - self.hgdir, - self.clone_timeout, - ) - - self.clone_with_timeout( - self.log, self.origin_url, self.hgdir, self.clone_timeout - ) - - else: # local repository - self.working_directory = None - self.hgdir = directory - - self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) - self.log.debug("Bundling at %s" % self.bundle_path) - with hglib.open(self.hgdir) as repo: - self.heads = self.get_heads(repo) - repo.bundle(bytes(self.bundle_path, "utf-8"), all=True, type=b"none-v2") - - self.cache_filename1 = os.path.join( - self.hgdir, "swh-cache-1-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) - ) - self.cache_filename2 = os.path.join( - self.hgdir, "swh-cache-2-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) - ) - - try: - self.br = Bundle20Reader( - bundlefile=self.bundle_path, - cache_filename=self.cache_filename1, - cache_size=self.cache1_size, - ) - except FileNotFoundError: - # Empty repository! Still a successful visit targeting an - # empty snapshot - self.log.warn("%s is an empty repository!" % self.hgdir) - self.empty_repository = True - else: - self.reduce_effort = set() - if self.reduce_effort_flag: - now = datetime.datetime.now(tz=datetime.timezone.utc) - if (now - self.visit_date).days > 1: - # Assuming that self.visit_date would be today for - # a new visit, treat older visit dates as - # indication of wanting to skip some processing - # effort. - for header, commit in self.br.yield_all_changesets(): - ts = commit["time"].timestamp() - if ts < self.visit_date.timestamp(): - self.reduce_effort.add(header["node"]) - - def has_contents(self): - return not self.empty_repository - - def has_directories(self): - return not self.empty_repository - - def has_revisions(self): - return not self.empty_repository - - def has_releases(self): - return not self.empty_repository - - def fetch_data(self): - """Fetch the data from the data source.""" - pass - - def get_contents(self) -> Iterable[BaseContent]: - """Get the contents that need to be loaded.""" - - # NOTE: This method generates blobs twice to reduce memory usage - # without generating disk writes. 
- self.file_node_to_hash = {} - hash_to_info = {} - self.num_contents = 0 - contents: Dict[bytes, BaseContent] = {} - missing_contents = set() - - for blob, node_info in self.br.yield_all_blobs(): - self.num_contents += 1 - file_name = node_info[0] - header = node_info[2] - - length = len(blob) - if header["linknode"] in self.reduce_effort: - algorithms = set([ALGO]) - else: - algorithms = DEFAULT_ALGORITHMS - h = MultiHash.from_data(blob, hash_names=algorithms) - content = h.digest() - content["length"] = length - blob_hash = content[ALGO] - self.file_node_to_hash[header["node"]] = blob_hash - - if header["linknode"] in self.reduce_effort: - continue - - hash_to_info[blob_hash] = node_info - if self.max_content_size is not None and length >= self.max_content_size: - contents[blob_hash] = SkippedContent( - status="absent", reason="Content too large", **content - ) - else: - contents[blob_hash] = Content(data=blob, status="visible", **content) - - if file_name == b".hgtags": - # https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model - # overwrite until the last one - self.tags = (t for t in blob.split(b"\n") if t != b"") - - if contents: - missing_contents = set( - self.storage.content_missing( - [c.to_dict() for c in contents.values()], key_hash=ALGO - ) - ) - - # Clusters needed blobs by file offset and then only fetches the - # groups at the needed offsets. - focs: Dict[int, Dict[bytes, bytes]] = {} # "file/offset/contents" - for blob_hash in missing_contents: - _, file_offset, header = hash_to_info[blob_hash] - focs.setdefault(file_offset, {}) - focs[file_offset][header["node"]] = blob_hash - - for offset, node_hashes in sorted(focs.items()): - for header, data, *_ in self.br.yield_group_objects(group_offset=offset): - node = header["node"] - if node in node_hashes: - blob, meta = self.br.extract_meta_from_blob(data) - content = contents.pop(node_hashes[node], None) - if content: - if ( - self.max_content_size is not None - and len(blob) >= self.max_content_size - ): - yield SkippedContent.from_data( - blob, reason="Content too large" - ) - else: - yield Content.from_data(blob) - - def load_directories(self): - """This is where the work is done to convert manifest deltas from the - repository bundle into SWH directories. 
- - """ - self.mnode_to_tree_id = {} - cache_hints = self.br.build_manifest_hints() - - def tree_size(t): - return t.size() - - self.trees = SelectiveCache( - cache_hints=cache_hints, - size_function=tree_size, - filename=self.cache_filename2, - max_size=self.cache2_size, - ) - - tree = SimpleTree() - for header, added, removed in self.br.yield_all_manifest_deltas(cache_hints): - node = header["node"] - basenode = header["basenode"] - tree = self.trees.fetch(basenode) or tree # working tree - - for path in removed.keys(): - tree = tree.remove_tree_node_for_path(path) - for path, info in added.items(): - file_node, is_symlink, perms_code = info - tree = tree.add_blob( - path, self.file_node_to_hash[file_node], is_symlink, perms_code - ) - - if header["linknode"] in self.reduce_effort: - self.trees.store(node, tree) - else: - new_dirs = [] - self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs) - self.trees.store(node, tree) - yield header, tree, new_dirs - - def get_directories(self) -> Iterable[Directory]: - """Compute directories to load - - """ - dirs: Dict[Sha1Git, Directory] = {} - self.num_directories = 0 - for _, _, new_dirs in self.load_directories(): - for d in new_dirs: - self.num_directories += 1 - dirs[d["id"]] = Directory.from_dict(d) - - missing_dirs: List[Sha1Git] = list(dirs.keys()) - if missing_dirs: - missing_dirs = list(self.storage.directory_missing(missing_dirs)) - - for _id in missing_dirs: - yield dirs[_id] - - def get_revisions(self) -> Iterable[Revision]: - """Compute revisions to load - - """ - revisions = {} - self.num_revisions = 0 - for header, commit in self.br.yield_all_changesets(): - if header["node"] in self.reduce_effort: - continue - - self.num_revisions += 1 - date_dict = identifiers.normalize_timestamp(int(commit["time"].timestamp())) - author_dict = converters.parse_author(commit["user"]) - if commit["manifest"] == Bundle20Reader.NAUGHT_NODE: - directory_id = SimpleTree().hash_changed() - else: - directory_id = self.mnode_to_tree_id[commit["manifest"]] - - extra_headers = [ - ( - b"time_offset_seconds", - str(commit["time_offset_seconds"]).encode("utf-8"), - ) - ] - extra = commit.get("extra") - if extra: - for e in extra.split(b"\x00"): - k, v = e.split(b":", 1) - # transplant_source stores binary reference to a changeset - # prefer to dump hexadecimal one in the revision metadata - if k == b"transplant_source": - v = hash_to_bytehex(v) - extra_headers.append((k, v)) - - parents = [] - p1 = self.node_2_rev.get(header["p1"]) - p2 = self.node_2_rev.get(header["p2"]) - if p1: - parents.append(p1) - if p2: - parents.append(p2) - - revision = Revision( - author=Person.from_dict(author_dict), - date=TimestampWithTimezone.from_dict(date_dict), - committer=Person.from_dict(author_dict), - committer_date=TimestampWithTimezone.from_dict(date_dict), - type=RevisionType.MERCURIAL, - directory=directory_id, - message=commit["message"], - metadata={"node": hash_to_hex(header["node"]),}, - extra_headers=tuple(extra_headers), - synthetic=False, - parents=tuple(parents), - ) - - self.node_2_rev[header["node"]] = revision.id - revisions[revision.id] = revision - - # Converts heads to use swh ids - self.heads = { - branch_name: (pointer_nature, self.node_2_rev[node_id]) - for branch_name, (pointer_nature, node_id) in self.heads.items() - } - - missing_revs = set(revisions.keys()) - if missing_revs: - missing_revs = set(self.storage.revision_missing(list(missing_revs))) - - for rev in missing_revs: - yield revisions[rev] - self.mnode_to_tree_id = None - - def 
_read_tag(self, tag, split_byte=b" "): - node, *name = tag.split(split_byte) - name = split_byte.join(name) - return node, name - - def get_releases(self) -> Iterable[Release]: - """Get the releases that need to be loaded.""" - self.num_releases = 0 - releases = {} - missing_releases = set() - for t in self.tags: - self.num_releases += 1 - node, name = self._read_tag(t) - node = node.decode() - node_bytes = hash_to_bytes(node) - if not TAG_PATTERN.match(node): - self.log.warn("Wrong pattern (%s) found in tags. Skipping" % (node,)) - continue - if node_bytes not in self.node_2_rev: - self.log.warn( - "No matching revision for tag %s " - "(hg changeset: %s). Skipping" % (name.decode(), node) - ) - continue - tgt_rev = self.node_2_rev[node_bytes] - release = Release( - name=name, - target=tgt_rev, - target_type=ObjectType.REVISION, - message=None, - metadata=None, - synthetic=False, - author=Person(name=None, email=None, fullname=b""), - date=None, - ) - missing_releases.add(release.id) - releases[release.id] = release - self.releases[name] = release.id - - if missing_releases: - missing_releases = set(self.storage.release_missing(list(missing_releases))) - - for _id in missing_releases: - yield releases[_id] - - def get_snapshot(self) -> Snapshot: - """Get the snapshot that need to be loaded.""" - branches: Dict[bytes, Optional[SnapshotBranch]] = {} - for name, (pointer_nature, target) in self.heads.items(): - branches[name] = SnapshotBranch( - target=target, target_type=TargetType.REVISION - ) - if pointer_nature == HEAD_POINTER_NAME: - branches[b"HEAD"] = SnapshotBranch( - target=name, target_type=TargetType.ALIAS - ) - for name, target in self.releases.items(): - branches[name] = SnapshotBranch( - target=target, target_type=TargetType.RELEASE - ) - - self.snapshot = Snapshot(branches=branches) - return self.snapshot - - def get_fetch_history_result(self): - """Return the data to store in fetch_history.""" - return { - "contents": self.num_contents, - "directories": self.num_directories, - "revisions": self.num_revisions, - "releases": self.num_releases, - } - - def load_status(self): - snapshot = self.get_snapshot() - load_status = "eventful" - if self.last_snapshot_id is not None and self.last_snapshot_id == snapshot.id: - load_status = "uneventful" - return { - "status": load_status, - } - - -class HgArchiveBundle20Loader(HgBundle20Loader): - """Mercurial loader for repository wrapped within archives. 
- - """ - - def __init__(self, url, visit_date=None, archive_path=None): - super().__init__( - url, - visit_date=visit_date, - logging_class="swh.loader.mercurial.HgArchiveBundle20Loader", - ) - self.temp_dir = None - self.archive_path = archive_path - - def prepare(self, *args, **kwargs): - self.temp_dir = tmp_extract( - archive=self.archive_path, - dir=self.temp_directory, - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - suffix=".dump-%s" % os.getpid(), - log=self.log, - source=self.origin_url, - ) - - repo_name = os.listdir(self.temp_dir)[0] - self.directory = os.path.join(self.temp_dir, repo_name) - super().prepare(*args, **kwargs) - - def cleanup(self): - if self.temp_dir and os.path.exists(self.temp_dir): - rmtree(self.temp_dir) - super().cleanup() +__all__ = ["HgArchiveLoader", "HgLoader"] diff --git a/swh/loader/mercurial/tasks.py b/swh/loader/mercurial/tasks.py --- a/swh/loader/mercurial/tasks.py +++ b/swh/loader/mercurial/tasks.py @@ -5,7 +5,7 @@ from celery import shared_task -from .loader import HgArchiveBundle20Loader, HgBundle20Loader +from .loader import HgArchiveLoader, HgLoader @shared_task(name=__name__ + ".LoadMercurial") @@ -17,7 +17,7 @@ Args: see :func:`DepositLoader.load`. """ - loader = HgBundle20Loader(url, directory=directory, visit_date=visit_date) + loader = HgLoader(url, directory=directory, visit_date=visit_date) return loader.load() @@ -27,7 +27,5 @@ Args: see :func:`DepositLoader.load`. """ - loader = HgArchiveBundle20Loader( - url, archive_path=archive_path, visit_date=visit_date - ) + loader = HgArchiveLoader(url, archive_path=archive_path, visit_date=visit_date) return loader.load() diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_from_bundle.py rename from swh/loader/mercurial/tests/test_loader.py rename to swh/loader/mercurial/tests/test_from_bundle.py --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_from_bundle.py @@ -21,7 +21,7 @@ from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest -from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader +from ..from_bundle import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader from .loader_checker import ExpectedSwhids, LoaderChecker diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_from_disk.py rename from swh/loader/mercurial/tests/test_loader.py rename to swh/loader/mercurial/tests/test_from_disk.py --- a/swh/loader/mercurial/tests/test_loader.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -1,15 +1,12 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import copy -import logging import os -import time +from urllib.parse import urlsplit -import hglib -import pytest +import dateutil from swh.loader.tests import ( assert_last_visit_matches, @@ -21,28 +18,45 @@ from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_latest -from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader +from ..from_disk import HgLoaderFromDisk from .loader_checker import ExpectedSwhids, LoaderChecker +# Those tests assert expectations on repository 
loading +# by reading expected values from associated json files +# produced by the `swh-hg-identify` command line utility. +# +# It has more granularity than historical tests. +# Assertions will tell if the error comes from the directories +# revisions or release rather than only checking the snapshot. +# +# With more work it should event be possible to know which part +# of an object is faulty. def test_examples(swh_config, datadir, tmp_path): for archive_name in ("hello", "transplant", "the-sandbox", "example"): archive_path = os.path.join(datadir, f"{archive_name}.tgz") json_path = os.path.join(datadir, f"{archive_name}.json") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + directory = urlsplit(repo_url).path LoaderChecker( - loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path), + loader=HgLoaderFromDisk(repo_url, directory=directory), + expected=ExpectedSwhids.load(json_path), ).check() +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + directory = urlsplit(repo_url).path - loader = HgBundle20Loader(repo_url) + loader = HgLoaderFromDisk(url=repo_url, directory=directory) assert loader.load() == {"status": "eventful"} @@ -84,39 +98,21 @@ "snapshot": 1, } - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - +# This test has as been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. 
def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" + archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + directory = urlsplit(repo_url).path + visit_date = dateutil.parser.parse("2016-05-03 15:16:32+00") - loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) + loader = HgLoaderFromDisk(url=repo_url, directory=directory, visit_date=visit_date,) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} @@ -163,64 +159,11 @@ snapshot=expected_snapshot.id, ) - # Ensure archive loader yields the same snapshot - loader2 = HgArchiveBundle20Loader( - url=archive_path, - archive_path=archive_path, - visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader2.load() - assert actual_load_status == {"status": "eventful"} - - stats2 = get_stats(loader2.storage) - expected_stats = copy.deepcopy(stats) - expected_stats["origin"] += 1 - expected_stats["origin_visit"] += 1 - assert stats2 == expected_stats - - # That visit yields the same snapshot - assert_last_visit_matches( - loader2.storage, - archive_path, - status="full", - type="hg", - snapshot=expected_snapshot.id, - ) - - -def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): - """Failure to decompress should fail early, no data is ingested""" - mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") - mock_patoo.side_effect = ValueError - - archive_name = "hello" - archive_path = os.path.join(datadir, f"{archive_name}.tgz") - # Ensure archive loader yields the same snapshot - loader = HgArchiveBundle20Loader( - url=archive_path, visit_date="2016-05-03 15:16:32+00", - ) - - actual_load_status = loader.load() - assert actual_load_status == {"status": "failed"} - - stats = get_stats(loader.storage) - assert stats == { - "content": 0, - "directory": 0, - "origin": 1, - "origin_visit": 1, - "release": 0, - "revision": 0, - "skipped_content": 0, - "snapshot": 0, - } - # That visit yields the same snapshot - assert_last_visit_matches( - loader.storage, archive_path, status="partial", type="hg", snapshot=None - ) - +# This test has been adapted from the historical `HgBundle20Loader` tests +# to ensure compatibility of `HgLoaderFromDisk`. +# Hashes have been produced by copy-pasting the result of the implementation +# to prevent regressions. def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): """Visit a mercurial repository visit transplant operations within should yield a snapshot as well.
@@ -230,7 +173,10 @@ archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) + directory = urlsplit(repo_url).path + visit_date = dateutil.parser.parse("2016-05-03 15:16:32+00") + + loader = HgLoaderFromDisk(url=repo_url, directory=directory, visit_date=visit_date) # load hg repository actual_load_status = loader.load() @@ -261,50 +207,3 @@ assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources.issubset(hg_changesets) - - -def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_timeout(source, dest): - time.sleep(60) - - monkeypatch.setattr(hglib, "clone", clone_timeout) - - with pytest.raises(CloneTimeoutError): - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - - for record in caplog.records: - assert record.levelname == "WARNING" - assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage() - assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1) - - -def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_return(source, dest): - return (source, dest) - - monkeypatch.setattr(hglib, "clone", clone_return) - - assert HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path) - - -def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch): - log = logging.getLogger("test_clone_with_timeout") - - def clone_return(source, dest): - raise ValueError("Test exception") - - monkeypatch.setattr(hglib, "clone", clone_return) - - with pytest.raises(ValueError) as excinfo: - HgBundle20Loader.clone_with_timeout( - log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1 - ) - assert "Test exception" in excinfo.value.args[0] diff --git a/swh/loader/mercurial/tests/test_hgutil.py b/swh/loader/mercurial/tests/test_hgutil.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/test_hgutil.py @@ -0,0 +1,46 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import time +import traceback + +import pytest +from mercurial import hg # type: ignore + +from .. 
import hgutil + + +def test_clone_timeout(monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + timeout = 1 + + def clone(*args): + time.sleep(5) + + monkeypatch.setattr(hg, "clone", clone) + + with pytest.raises(hgutil.CloneTimeout) as e: + hgutil.clone(src, dest, timeout) + assert e.value.args == (src, timeout) + + +def test_clone_error(caplog, tmp_path, monkeypatch): + src = "https://www.mercurial-scm.org/repo/hello" + dest = "/dev/null" + expected_traceback = "Some traceback" + + def clone(*args): + raise ValueError() + + def print_exc(file): + file.write(expected_traceback) + + monkeypatch.setattr(hg, "clone", clone) + monkeypatch.setattr(traceback, "print_exc", print_exc) + + with pytest.raises(hgutil.CloneFailure) as e: + hgutil.clone(src, dest, 1) + assert e.value.args == (src, dest, expected_traceback) diff --git a/swh/loader/mercurial/tests/test_loader.org b/swh/loader/mercurial/tests/test_loader.org deleted file mode 100644 --- a/swh/loader/mercurial/tests/test_loader.org +++ /dev/null @@ -1,121 +0,0 @@ -#+title: Where the loader test data comes from - -Mercurial repositories are archived within the folder -swh/loader/mercurial/tests/resources. They contain mercurial -repository. - -The following demonstrates the commands executed from within the -repository to retrieve information. - -* the-sandbox - -Archive: the-sandbox.tgz - -** branches - -Listing of branches and their tip: -#+BEGIN_SRC sh -$ hg branches -develop 57:76cc0882284d -default 2:2f13849f14f5 (inactive) -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..57}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 e2e117569b086ceabeeedee4acd95f35298d4553 -1 9cd8160c67ac4b0bc97e2e2cd918a580425167d3 -2 180bd57623a7c2c47a8c43514a5f4d903503d0aa -3 180bd57623a7c2c47a8c43514a5f4d903503d0aa -4 180bd57623a7c2c47a8c43514a5f4d903503d0aa -5 180bd57623a7c2c47a8c43514a5f4d903503d0aa -6 180bd57623a7c2c47a8c43514a5f4d903503d0aa -7 180bd57623a7c2c47a8c43514a5f4d903503d0aa -8 180bd57623a7c2c47a8c43514a5f4d903503d0aa -9 180bd57623a7c2c47a8c43514a5f4d903503d0aa -10 180bd57623a7c2c47a8c43514a5f4d903503d0aa -11 180bd57623a7c2c47a8c43514a5f4d903503d0aa -12 180bd57623a7c2c47a8c43514a5f4d903503d0aa -13 180bd57623a7c2c47a8c43514a5f4d903503d0aa -14 180bd57623a7c2c47a8c43514a5f4d903503d0aa -15 180bd57623a7c2c47a8c43514a5f4d903503d0aa -16 180bd57623a7c2c47a8c43514a5f4d903503d0aa -17 180bd57623a7c2c47a8c43514a5f4d903503d0aa -18 180bd57623a7c2c47a8c43514a5f4d903503d0aa -19 180bd57623a7c2c47a8c43514a5f4d903503d0aa -20 180bd57623a7c2c47a8c43514a5f4d903503d0aa -21 180bd57623a7c2c47a8c43514a5f4d903503d0aa -22 180bd57623a7c2c47a8c43514a5f4d903503d0aa -23 180bd57623a7c2c47a8c43514a5f4d903503d0aa -24 180bd57623a7c2c47a8c43514a5f4d903503d0aa -25 180bd57623a7c2c47a8c43514a5f4d903503d0aa -26 180bd57623a7c2c47a8c43514a5f4d903503d0aa -27 180bd57623a7c2c47a8c43514a5f4d903503d0aa -28 180bd57623a7c2c47a8c43514a5f4d903503d0aa -29 180bd57623a7c2c47a8c43514a5f4d903503d0aa -30 180bd57623a7c2c47a8c43514a5f4d903503d0aa -31 180bd57623a7c2c47a8c43514a5f4d903503d0aa -32 180bd57623a7c2c47a8c43514a5f4d903503d0aa -33 180bd57623a7c2c47a8c43514a5f4d903503d0aa -34 180bd57623a7c2c47a8c43514a5f4d903503d0aa -35 180bd57623a7c2c47a8c43514a5f4d903503d0aa -36 180bd57623a7c2c47a8c43514a5f4d903503d0aa -37 180bd57623a7c2c47a8c43514a5f4d903503d0aa -38 180bd57623a7c2c47a8c43514a5f4d903503d0aa -39 180bd57623a7c2c47a8c43514a5f4d903503d0aa -40 180bd57623a7c2c47a8c43514a5f4d903503d0aa -41 
180bd57623a7c2c47a8c43514a5f4d903503d0aa -42 180bd57623a7c2c47a8c43514a5f4d903503d0aa -43 180bd57623a7c2c47a8c43514a5f4d903503d0aa -44 180bd57623a7c2c47a8c43514a5f4d903503d0aa -45 180bd57623a7c2c47a8c43514a5f4d903503d0aa -46 180bd57623a7c2c47a8c43514a5f4d903503d0aa -47 180bd57623a7c2c47a8c43514a5f4d903503d0aa -48 180bd57623a7c2c47a8c43514a5f4d903503d0aa -49 180bd57623a7c2c47a8c43514a5f4d903503d0aa -50 180bd57623a7c2c47a8c43514a5f4d903503d0aa -51 180bd57623a7c2c47a8c43514a5f4d903503d0aa -52 180bd57623a7c2c47a8c43514a5f4d903503d0aa -53 180bd57623a7c2c47a8c43514a5f4d903503d0aa -54 180bd57623a7c2c47a8c43514a5f4d903503d0aa -55 180bd57623a7c2c47a8c43514a5f4d903503d0aa -56 180bd57623a7c2c47a8c43514a5f4d903503d0aa -57 180bd57623a7c2c47a8c43514a5f4d903503d0aa -#+END_SRC - -Note: swh-hashtree is a cli tool defined in swh-model/bin/swh-hashtree - -* hello - -Archive: hello.tgz - -** branches - -#+BEGIN_SRC sh -$ hg branches -default 1:82e55d328c8c -#+END_SRC - -** tags - -I added a tag to have some more data to load (1st repository has no tags): -#+BEGIN_SRC sh -$ hg tags -tip 2:b985ae4a07e1 -0.1 1:82e55d328c8c -#+END_SRC - -#+BEGIN_SRC sh -$ cat .hgtags -82e55d328c8ca4ee16520036c0aaace03a5beb65 0.1 -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..1}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 43d727f2f3f2f7cb3b098ddad1d7038464a4cee2 -1 b3f85f210ff86d334575f64cb01c5bf49895b63e -2 8f2be433c945384c85920a8e60f2a68d2c0f20fb -#+END_SRC diff --git a/swh/loader/mercurial/tests/test_tasks.py b/swh/loader/mercurial/tests/test_tasks.py --- a/swh/loader/mercurial/tests/test_tasks.py +++ b/swh/loader/mercurial/tests/test_tasks.py @@ -7,7 +7,7 @@ def test_loader( mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - mock_loader = mocker.patch("swh.loader.mercurial.loader.HgBundle20Loader.load") + mock_loader = mocker.patch("swh.loader.mercurial.loader.HgLoader.load") mock_loader.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( @@ -26,9 +26,7 @@ def test_archive_loader( mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - mock_loader = mocker.patch( - "swh.loader.mercurial.loader.HgArchiveBundle20Loader.load" - ) + mock_loader = mocker.patch("swh.loader.mercurial.loader.HgArchiveLoader.load") mock_loader.return_value = {"status": "uneventful"} res = swh_scheduler_celery_app.send_task(