diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/from_disk.py @@ -0,0 +1,348 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from datetime import datetime +from typing import Any, Dict, List, NewType, Optional + +import dateutil + +# The internal Mercurial API is not guaranteed to be stable. +import mercurial.ui # type: ignore +from mercurial import hg + +from swh.loader.core.loader import BaseLoader +from swh.model.from_disk import Content, DentryPerms, Directory +from swh.model.hashutil import MultiHash, hash_to_bytehex +from swh.model.model import Content as ModelContent +from swh.model.model import ( + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Sha1Git, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +HgNodeId = NewType("HgNodeId", bytes) +FLAG_PERMS = { + b"l": DentryPerms.symlink, + b"x": DentryPerms.executable_content, + b"": DentryPerms.content, +} # type: Dict[bytes, DentryPerms] + + +class HgLoaderFromDisk(BaseLoader): + """Load a mercurial repository from a local repository.""" + + CONFIG_BASE_FILENAME = "loader/mercurial" + + visit_type = "hg" + + def __init__( + self, + url: str, + directory: str, + logging_class="swh.loader.mercurial.Loader", + visit_date: Optional[datetime] = None, + config: Optional[Dict[str, Any]] = None, + ): + """Initialize the loader. + + Args: + url: url of the repository. + directory: directory of the local repository. + logging_class: class of the loader logger. 
+ visit_date: visit date of the repository + config: loader configuration + """ + super().__init__(logging_class=logging_class, config=config or {}) + self.origin_url = url + self.visit_date = visit_date + self.directory = directory + + self._revision_nodeid_to_swhid: Dict[HgNodeId, Sha1Git] = {} + + def cleanup(self) -> None: + """Last step executed by the loader.""" + pass + + def prepare_origin_visit(self, *args, **kwargs) -> None: + """First step executed by the loader to prepare origin and visit + references. Set/update self.origin, and + optionally self.origin_url, self.visit_date. + + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self, *args, **kwargs) -> None: + """Second step executed by the loader to prepare some state needed by + the loader. + + """ + ui = mercurial.ui.ui.load() + + # The repository will only return visible changesets. + # If needed, use self._repo.unfiltered() to have a new repository + # returning all the changesets. + self._repo = hg.repository(ui, self.directory.encode()) + + def fetch_data(self) -> bool: + """Fetch the data from the source the loader is currently loading + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. 
+ + """ + return False # No data to fetch since we use a local repository + + def store_data(self): + """Store fetched data in the database.""" + for rev in self._repo: + hg_nodeid = self._repo[rev].node() + revision_swhid = self._store_revision(hg_nodeid) + self._revision_nodeid_to_swhid[hg_nodeid] = revision_swhid + + branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in get_hg_branches(self._repo).items() + } + tags_by_name: Dict[bytes, HgNodeId] = self._repo.tags() + tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { + hg_nodeid: name for name, hg_nodeid in tags_by_name.items() + } + + snapshot_branches: Dict[bytes, SnapshotBranch] = {} + + for hg_nodeid, revision_swhid in self._revision_nodeid_to_swhid.items(): + tag_name = tags_by_hg_nodeid.get(hg_nodeid) + + # tip is listed in the tags by the mercurial api + # but its not a tag defined by the user in `.hgtags` + if tag_name and tag_name != b"tip": + snapshot_branches[tag_name] = SnapshotBranch( + target=self._store_release(tag_name, revision_swhid), + target_type=TargetType.RELEASE, + ) + + if hg_nodeid in branch_by_hg_nodeid: + name = branch_by_hg_nodeid[hg_nodeid] + snapshot_branches[name] = SnapshotBranch( + target=revision_swhid, target_type=TargetType.REVISION, + ) + + if hg_nodeid == tags_by_name[b"tip"]: + snapshot_branches[b"HEAD"] = SnapshotBranch( + target=name, target_type=TargetType.ALIAS, + ) + + snapshot = Snapshot(branches=snapshot_branches) + self.storage.snapshot_add([snapshot]) + + self.flush() + self.loaded_snapshot_id = snapshot.id + + def _store_revision(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Store a revision given its hg nodeid. + + If a parent revision of the revision to store is not yet stored, + it is recursively stored. Revisions are stored from oldest to newest, + thus recursion limit will not be reached. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the stored revision. 
+ """ + rev_ctx = self._repo[hg_nodeid] + + root_swhid = self._store_directories(hg_nodeid) + + author = Person.from_fullname(rev_ctx.user()) + (timestamp, offset) = rev_ctx.date() + + # TimestampWithTimezone.from_dict will change name + # as it accept more than just dicts + rev_date = TimestampWithTimezone.from_dict(int(timestamp)) + + extra_headers = [ + (b"time_offset_seconds", str(offset).encode(),), + ] + for key, value in rev_ctx.extra().items(): + # The default branch is skipped to match + # the historical implementation + if key == b"branch" and value == b"default": + continue + + # transplant_source is converted to match + # the historical implementation + if key == b"transplant_source": + value = hash_to_bytehex(value) + extra_headers.append((key, value)) + + parents = [] + for parent_ctx in rev_ctx.parents(): + parent_hg_nodeid = parent_ctx.node() + # nullid is the value of a parent that does not exist + if parent_hg_nodeid == mercurial.node.nullid: + continue + + if parent_hg_nodeid not in self._revision_nodeid_to_swhid: + parent_revision_swhid = self._store_revision(parent_hg_nodeid) + self._revision_nodeid_to_swhid[parent_hg_nodeid] = parent_revision_swhid + + parents.append(self._revision_nodeid_to_swhid[parent_hg_nodeid]) + + revision = Revision( + author=author, + date=rev_date, + committer=author, + committer_date=rev_date, + type=RevisionType.MERCURIAL, + directory=root_swhid, + message=rev_ctx.description(), + metadata={"node": hg_nodeid.hex()}, + extra_headers=tuple(extra_headers), + synthetic=False, + parents=tuple(parents), + ) + + self.storage.revision_add([revision]) + + return revision.id + + def _store_release(self, name: bytes, target=Sha1Git) -> Sha1Git: + """Store a release given its name and its target. + + Args: + name: name of the release. + target: swhid of the target revision. + + Returns: + the swhid of the stored release. 
+ """ + release = Release( + name=name, + target=target, + target_type=ObjectType.REVISION, + message=None, + metadata=None, + synthetic=False, + author=Person(name=None, email=None, fullname=b""), + date=None, + ) + + self.storage.release_add([release]) + + return release.id + + def _store_content(self, hg_nodeid: HgNodeId, file_path: bytes) -> Content: + """Store a revision content hg nodeid and file path. + + Args: + hg_nodeid: the hg nodeid of the revision. + file_path: the hg path of the content. + + Returns: + the swhid of the top level directory. + """ + rev_ctx = self._repo[hg_nodeid] + file_ctx = rev_ctx[file_path] + + data = file_ctx.data() + + content_data = MultiHash.from_data(data).digest() + content_data["length"] = len(data) + content_data["perms"] = FLAG_PERMS[file_ctx.flags()] + content_data["data"] = data + content_data["status"] = "visible" + content = Content(content_data) + + model = content.to_model() + if isinstance(model, ModelContent): + self.storage.content_add([model]) + else: + raise ValueError( + f"{file_path!r} at rev {hg_nodeid.hex()!r} " + "produced {type(model)!r} instead of {ModelContent!r}" + ) + + return content + + def _store_directories(self, hg_nodeid: HgNodeId) -> Sha1Git: + """Store a revision directories given its hg nodeid. + + Args: + hg_nodeid: the hg nodeid of the revision. + + Returns: + the swhid of the top level directory. 
+ """ + rev_ctx = self._repo[hg_nodeid] + + root = Directory() + for file_path in rev_ctx.manifest(): + content = self._store_content(hg_nodeid, file_path) + parts = [part for part in file_path.split(os.path.sep.encode()) if part] + + current_dir = root + while len(parts) > 1: + part = parts.pop(0) + if part not in current_dir: + current_dir[part] = Directory() + current_dir = current_dir[part] + part = parts.pop(0) + current_dir[part] = content + + directories: List[Directory] = [root] + while directories: + directory = directories.pop(0) + self.storage.directory_add([directory.to_model()]) + directories.extend( + [item for item in directory.values() if isinstance(item, Directory)] + ) + + return root.hash + + +def get_hg_branches(repo) -> Dict[bytes, HgNodeId]: + """Equivalent of `hg branches`. + + For some reason, there is no direct way to get the same result of `hg branches` in + the localrepository interface. + """ + result = {} + for tag, heads, tip, isclosed in repo.branchmap().iterbranches(): + if isclosed: + continue + result[tag] = tip + return result + + +if __name__ == "__main__": + import logging + + import click + + logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" + ) + + @click.command() + @click.option("--origin-url", help="origin url") + @click.option("--hg-directory", help="Path to mercurial repository to load") + @click.option("--visit-date", default=None, help="Visit date") + def main(origin_url, hg_directory, visit_date): + visit_date = dateutil.parser.parse(visit_date) if visit_date else None + + return HgLoaderFromDisk().load(origin_url, hg_directory, visit_date) + + main() diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -0,0 +1,209 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution 
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from urllib.parse import urlsplit

import dateutil

from swh.loader.tests import (
    assert_last_visit_matches,
    check_snapshot,
    get_stats,
    prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.snapshot import snapshot_get_latest

from ..from_disk import HgLoaderFromDisk
from .loader_checker import ExpectedSwhids, LoaderChecker


# Those tests assert expectations on repository loading
# by reading expected values from associated json files
# produced by the `swh-hg-identify` command line utility.
#
# It has more granularity than historical tests.
# Assertions will tell if the error comes from the directories,
# revisions or releases rather than only checking the snapshot.
#
# With more work it should even be possible to know which part
# of an object is faulty.
def test_examples(swh_config, datadir, tmp_path):
    for archive_name in ("hello", "transplant", "the-sandbox", "example"):
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        json_path = os.path.join(datadir, f"{archive_name}.json")
        repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
        directory = urlsplit(repo_url).path

        LoaderChecker(
            loader=HgLoaderFromDisk(repo_url, directory=directory),
            expected=ExpectedSwhids.load(json_path),
        ).check()


# This test has been adapted from the historical `HgBundle20Loader` tests
# to ensure compatibility of `HgLoaderFromDisk`.
# Hashes have been produced by copy pasting the result of the implementation
# to prevent regressions.
def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "the-sandbox"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    directory = urlsplit(repo_url).path

    loader = HgLoaderFromDisk(url=repo_url, directory=directory)

    assert loader.load() == {"status": "eventful"}

    tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c"
    tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56"
    expected_snapshot = Snapshot(
        id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"),
        branches={
            b"develop": SnapshotBranch(
                target=hash_to_bytes(tip_revision_develop),
                target_type=TargetType.REVISION,
            ),
            b"default": SnapshotBranch(
                target=hash_to_bytes(tip_revision_default),
                target_type=TargetType.REVISION,
            ),
            b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,),
        },
    )

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )
    check_snapshot(expected_snapshot, loader.storage)

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 2,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 58,
        "skipped_content": 0,
        "snapshot": 1,
    }


# This test has been adapted from the historical `HgBundle20Loader` tests
# to ensure compatibility of `HgLoaderFromDisk`.
# Hashes have been produced by copy pasting the result of the implementation
# to prevent regressions.
def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path):
    """Eventful visit with release should yield 1 snapshot"""

    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    directory = urlsplit(repo_url).path
    visit_date = dateutil.parser.parse("2016-05-03 15:16:32+00")

    loader = HgLoaderFromDisk(url=repo_url, directory=directory, visit_date=visit_date,)

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # then
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # hashes were computed with the historical implementation
    # (cf. the removed test_loader.org for how they were obtained)
    tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140")
    release = loader.storage.release_get([tip_release])[0]
    assert release is not None

    tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27")
    revision = loader.storage.revision_get([tip_revision_default])[0]
    assert revision is not None

    expected_snapshot = Snapshot(
        id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"),
        branches={
            b"default": SnapshotBranch(
                target=tip_revision_default, target_type=TargetType.REVISION,
            ),
            b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,),
            b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,),
        },
    )

    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        type=RevisionType.MERCURIAL.value,
        status="full",
        snapshot=expected_snapshot.id,
    )


# This test has been adapted from the historical `HgBundle20Loader` tests
# to ensure compatibility of `HgLoaderFromDisk`.
+# Hashes as been produced by copy pasting the result of the implementation +# to prevent regressions. +def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): + """Visit a mercurial repository visit transplant operations within should yield a + snapshot as well. + + """ + + archive_name = "transplant" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + directory = urlsplit(repo_url).path + visit_date = dateutil.parser.parse("2016-05-03 15:16:32+00") + + loader = HgLoaderFromDisk(url=repo_url, directory=directory, visit_date=visit_date) + + # load hg repository + actual_load_status = loader.load() + assert actual_load_status == {"status": "eventful"} + + # collect swh revisions + assert_last_visit_matches( + loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" + ) + + revisions = [] + snapshot = snapshot_get_latest(loader.storage, repo_url) + for branch in snapshot.branches.values(): + if branch.target_type.value != "revision": + continue + revisions.append(branch.target) + + # extract original changesets info and the transplant sources + hg_changesets = set() + transplant_sources = set() + for rev in loader.storage.revision_log(revisions): + hg_changesets.add(rev["metadata"]["node"]) + for k, v in rev["extra_headers"]: + if k == b"transplant_source": + transplant_sources.add(v.decode("ascii")) + + # check extracted data are valid + assert len(hg_changesets) > 0 + assert len(transplant_sources) > 0 + assert transplant_sources.issubset(hg_changesets) diff --git a/swh/loader/mercurial/tests/test_loader.org b/swh/loader/mercurial/tests/test_loader.org deleted file mode 100644 --- a/swh/loader/mercurial/tests/test_loader.org +++ /dev/null @@ -1,121 +0,0 @@ -#+title: Where the loader test data comes from - -Mercurial repositories are archived within the folder -swh/loader/mercurial/tests/resources. 
They contain mercurial -repository. - -The following demonstrates the commands executed from within the -repository to retrieve information. - -* the-sandbox - -Archive: the-sandbox.tgz - -** branches - -Listing of branches and their tip: -#+BEGIN_SRC sh -$ hg branches -develop 57:76cc0882284d -default 2:2f13849f14f5 (inactive) -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..57}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 e2e117569b086ceabeeedee4acd95f35298d4553 -1 9cd8160c67ac4b0bc97e2e2cd918a580425167d3 -2 180bd57623a7c2c47a8c43514a5f4d903503d0aa -3 180bd57623a7c2c47a8c43514a5f4d903503d0aa -4 180bd57623a7c2c47a8c43514a5f4d903503d0aa -5 180bd57623a7c2c47a8c43514a5f4d903503d0aa -6 180bd57623a7c2c47a8c43514a5f4d903503d0aa -7 180bd57623a7c2c47a8c43514a5f4d903503d0aa -8 180bd57623a7c2c47a8c43514a5f4d903503d0aa -9 180bd57623a7c2c47a8c43514a5f4d903503d0aa -10 180bd57623a7c2c47a8c43514a5f4d903503d0aa -11 180bd57623a7c2c47a8c43514a5f4d903503d0aa -12 180bd57623a7c2c47a8c43514a5f4d903503d0aa -13 180bd57623a7c2c47a8c43514a5f4d903503d0aa -14 180bd57623a7c2c47a8c43514a5f4d903503d0aa -15 180bd57623a7c2c47a8c43514a5f4d903503d0aa -16 180bd57623a7c2c47a8c43514a5f4d903503d0aa -17 180bd57623a7c2c47a8c43514a5f4d903503d0aa -18 180bd57623a7c2c47a8c43514a5f4d903503d0aa -19 180bd57623a7c2c47a8c43514a5f4d903503d0aa -20 180bd57623a7c2c47a8c43514a5f4d903503d0aa -21 180bd57623a7c2c47a8c43514a5f4d903503d0aa -22 180bd57623a7c2c47a8c43514a5f4d903503d0aa -23 180bd57623a7c2c47a8c43514a5f4d903503d0aa -24 180bd57623a7c2c47a8c43514a5f4d903503d0aa -25 180bd57623a7c2c47a8c43514a5f4d903503d0aa -26 180bd57623a7c2c47a8c43514a5f4d903503d0aa -27 180bd57623a7c2c47a8c43514a5f4d903503d0aa -28 180bd57623a7c2c47a8c43514a5f4d903503d0aa -29 180bd57623a7c2c47a8c43514a5f4d903503d0aa -30 180bd57623a7c2c47a8c43514a5f4d903503d0aa -31 180bd57623a7c2c47a8c43514a5f4d903503d0aa -32 180bd57623a7c2c47a8c43514a5f4d903503d0aa -33 
180bd57623a7c2c47a8c43514a5f4d903503d0aa -34 180bd57623a7c2c47a8c43514a5f4d903503d0aa -35 180bd57623a7c2c47a8c43514a5f4d903503d0aa -36 180bd57623a7c2c47a8c43514a5f4d903503d0aa -37 180bd57623a7c2c47a8c43514a5f4d903503d0aa -38 180bd57623a7c2c47a8c43514a5f4d903503d0aa -39 180bd57623a7c2c47a8c43514a5f4d903503d0aa -40 180bd57623a7c2c47a8c43514a5f4d903503d0aa -41 180bd57623a7c2c47a8c43514a5f4d903503d0aa -42 180bd57623a7c2c47a8c43514a5f4d903503d0aa -43 180bd57623a7c2c47a8c43514a5f4d903503d0aa -44 180bd57623a7c2c47a8c43514a5f4d903503d0aa -45 180bd57623a7c2c47a8c43514a5f4d903503d0aa -46 180bd57623a7c2c47a8c43514a5f4d903503d0aa -47 180bd57623a7c2c47a8c43514a5f4d903503d0aa -48 180bd57623a7c2c47a8c43514a5f4d903503d0aa -49 180bd57623a7c2c47a8c43514a5f4d903503d0aa -50 180bd57623a7c2c47a8c43514a5f4d903503d0aa -51 180bd57623a7c2c47a8c43514a5f4d903503d0aa -52 180bd57623a7c2c47a8c43514a5f4d903503d0aa -53 180bd57623a7c2c47a8c43514a5f4d903503d0aa -54 180bd57623a7c2c47a8c43514a5f4d903503d0aa -55 180bd57623a7c2c47a8c43514a5f4d903503d0aa -56 180bd57623a7c2c47a8c43514a5f4d903503d0aa -57 180bd57623a7c2c47a8c43514a5f4d903503d0aa -#+END_SRC - -Note: swh-hashtree is a cli tool defined in swh-model/bin/swh-hashtree - -* hello - -Archive: hello.tgz - -** branches - -#+BEGIN_SRC sh -$ hg branches -default 1:82e55d328c8c -#+END_SRC - -** tags - -I added a tag to have some more data to load (1st repository has no tags): -#+BEGIN_SRC sh -$ hg tags -tip 2:b985ae4a07e1 -0.1 1:82e55d328c8c -#+END_SRC - -#+BEGIN_SRC sh -$ cat .hgtags -82e55d328c8ca4ee16520036c0aaace03a5beb65 0.1 -#+END_SRC - -** Changesets - -#+BEGIN_SRC sh -$ for i in {0..1}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done -0 43d727f2f3f2f7cb3b098ddad1d7038464a4cee2 -1 b3f85f210ff86d334575f64cb01c5bf49895b63e -2 8f2be433c945384c85920a8e60f2a68d2c0f20fb -#+END_SRC