Index: requirements.txt =================================================================== --- requirements.txt +++ requirements.txt @@ -9,3 +9,4 @@ retrying sqlitedict vcversioner +mercurial Index: swh/loader/mercurial/from_disk.py =================================================================== --- /dev/null +++ swh/loader/mercurial/from_disk.py @@ -0,0 +1,489 @@ +import os +import textwrap +from abc import ABC, abstractmethod +from datetime import datetime +from typing import Any, Dict, Iterable, Iterator, List, Optional + +import mercurial.ui # type: ignore +from mercurial import hg +from mercurial import tags as tagsmod + +from swh.loader.core.loader import DVCSLoader +from swh.model.hashutil import MultiHash, hash_to_bytes +from swh.model.model import ( + BaseContent, + Content, + Directory, + ObjectType, + Origin, + Person, + Release, + Revision, + RevisionType, + Snapshot, + TargetType, + Timestamp, + TimestampWithTimezone, +) + +DIR_PERM = 0o040000 +FLAG_PERMS = { + b"l": 0o120000, # symlink flag + b"x": 0o100755, # exec flag + b"": 0o100644, # no flag +} # type: Dict[bytes, int] + + +class HgLoaderFromDisk(DVCSLoader): + """Load a mercurial repository from a cloned mercurial directory. + + """ + + CONFIG_BASE_FILENAME = "loader/mercurial" + + visit_type = "hg" + + def __init__( + self, + url: str, + directory: str, # Path to a cloned mercurial repository + logging_class="swh.loader.mercurial.Loader", + visit_date: Optional[datetime] = None, + config: Optional[Dict[str, Any]] = None, + ): + super().__init__(logging_class=logging_class, config=config or {}) + self.origin_url = url + self.visit_date = visit_date + self.directory = directory + + def prepare_origin_visit(self, *args, **kwargs) -> None: + """First step executed by the loader to prepare origin and visit + references. Set/update self.origin, and + optionally self.origin_url, self.visit_date. 
+ + """ + self.origin = Origin(url=self.origin_url) + + def prepare(self, *args, **kwargs): + """Second step executed by the loader to prepare some state needed by + the loader. + + """ + ui = mercurial.ui.ui.load() + self.repo = hg.repository(ui, self.directory.encode()).unfiltered() + self.hashes_cache = build_hashes_cache(self.repo) + self.tags_rev = get_tags_rev(self.repo) + + def save_data(self) -> None: + """Save the data associated to the current load""" + # TODO may be implemented with incremental loading + return + + def fetch_data(self) -> bool: + """Fetch the data from the data source.""" + return False # No data to fetch since we use an already cloned repo + + def has_contents(self) -> bool: + """Checks whether we need to load contents""" + # TODO may be implemented with incremental loading + return True + + def get_contents(self) -> Iterable[BaseContent]: + """Get the contents that need to be loaded""" + _hashes = [] + for rev in self.hashes_cache: + ctx = self.repo[rev] + + for filename in self.hashes_cache[rev]: + fctx = ctx[filename] + data = fctx.data() + hashes = self.hashes_cache[rev][filename] + _hashes.append(hashes["sha1_git"]) + + yield Content( + data=data, status="visible", length=len(data), **hashes, + ) + + def has_directories(self) -> bool: + """Checks whether we need to load directories""" + # TODO may be implemented with incremental loading + return True + + def get_directories(self) -> Iterable[Directory]: + """Get the directories that need to be loaded""" + self.rev_dir_hashes: Dict[bytes, bytes] = {} + for rev in self.hashes_cache: + tree = TreeDirectory(None) + + ctx = self.repo[rev] + for filepath in self.hashes_cache[rev]: + fctx = ctx[filepath] + tree.add_file( + path=filepath, + perms=FLAG_PERMS[fctx.flags()], + sha1_git=self.hashes_cache[rev][filepath]["sha1_git"], + ) + + yield from tree.directories() + self.rev_dir_hashes[rev] = tree.sha1_git() + + def has_revisions(self) -> bool: + """Checks whether we need to load 
revisions""" + # TODO may be implemented with incremental loading + return True + + def get_revisions(self) -> Iterable[Revision]: + """Get the revisions that need to be loaded""" + + # Keep track of calculated revisions sha1_git + # for release and snapshot generation + self._node_hashes: Dict[bytes, bytes] = {} + + for rev in self.repo: + ctx = self.repo[rev] + parents = tuple( + [ + self._node_hashes[p.node()] # revisions are in increasing order so + # this won't fail + for p in ctx.parents() + if p.node() != mercurial.node.nullid + ] + ) + + author = author_dict_from_str(ctx.user()) + rev_date = get_ctx_date(ctx).to_dict() + + revision = { + "author": author, + "date": rev_date, + "committer": author, + "committer_date": rev_date, + "type": RevisionType.MERCURIAL.value, + "directory": self.rev_dir_hashes[rev], + "message": ctx.description(), + "metadata": { + f"hg-extra-{key}": value for key, value in ctx.extra().items() + }, + "synthetic": False, + "parents": parents, + } + revision["id"] = hash_to_bytes(Revision.compute_hash(revision)) + self._node_hashes[ctx.node()] = revision["id"] + yield Revision.from_dict(revision) + + def has_releases(self) -> bool: + """Checks whether we need to load releases""" + # TODO may be implemented with incremental loading + return True + + def get_releases(self) -> Iterable[Release]: + """Get the releases that need to be loaded""" + + # Keep track of original tags target for snapshot generation + self.tags_target = {} + + for (name, nodeid) in self.repo.tags().items(): + target_ctx = self.repo[nodeid] + self.tags_target[name] = target_ctx.node() + if name == b"tip": + continue + + if nodeid == self.tags_rev[name]["target"]: + tag_ctx = self.repo[self.tags_rev[name]["rev"]] + + release = { + "author": author_dict_from_str(tag_ctx.user()), + "date": get_ctx_date(tag_ctx).to_dict(), + "name": name, + "target": self._node_hashes[target_ctx.node()], + "target_type": ObjectType.REVISION.value, + "message": None, + "metadata": None, 
+ "synthetic": False, + } + release["id"] = hash_to_bytes(Release.compute_hash(release)) + + yield Release.from_dict(release) + else: + # TODO: should not happen -> warning or exception. + pass + + def get_snapshot(self) -> Snapshot: + """Get the snapshot that needs to be loaded""" + branches: Dict[bytes, Any] = {} + + for name, heads in self.repo.branchmap().items(): + # When a branch has multiple heads, the 1st is stored as "branch/{name}" + # while the next are stored has "wild-branch/{branch}/{target} + used_names = [] # To know if the first head of branch has used the name + for head in heads: + ctx = self.repo[head] + target = self._node_hashes[ctx.node()] + if name in used_names: + ref = b"wild-branch/" + name + b"/" + target.hex().encode() + else: + ref = b"branch/" + name + used_names.append(name) + branches[ref] = { + "target": target, + "target_type": TargetType.REVISION.value, + } + + for name, target in self.tags_target.items(): + # The "tip" tag is a special tag that is more like a head. + # All heads should have been handled before. + # There is no need to process "tip". + if name != b"tip": + ref = b"tag/" + name + branches[ref] = { + "target": self._node_hashes[target], + "target_type": TargetType.REVISION.value, + } + + snapshot = {"branches": branches} + snapshot["id"] = hash_to_bytes(Snapshot.compute_hash({"branches": branches})) + return Snapshot.from_dict(snapshot) + + def load_status(self): + """Detailed loading status. + + Defaults to logging an eventful load. + + Returns: a dictionary that is eventually passed back as the task's + result to the scheduler, allowing tuning of the task recurrence + mechanism. + """ + return { + "status": "eventful", + } + + +def get_ctx_date(ctx) -> TimestampWithTimezone: + """Return a changset timestamp. """ + (timestamp, offset) = ctx.date() + # TODO timestamp seems to always be negative. + # Some investigation is needed but it works. 
+ result = TimestampWithTimezone( + timestamp=Timestamp(seconds=int(timestamp), microseconds=0), + offset=-(offset // 60), + negative_utc=(offset == 0), + ) + return result + + +def build_hashes_cache(repo): + """Build a cache of every file hash in advance. + + TODO: use lru cache at when incremental loading is implemented. + """ + result = {} + + for rev in repo: + result[rev] = {} + ctx = repo[rev] + manifest = ctx.manifest() + + for filepath in manifest: + fctx = ctx[filepath] + data = fctx.data() + hashes = MultiHash.from_data(data).digest() + result[rev][filepath] = hashes + + return result + + +def path_split_all(path): + """Split all parts of a path. """ + parts = [] + head, tail = os.path.split(path) + while tail != b"": + parts.append(tail) + head, tail = os.path.split(head) + return list(reversed(parts)) + + +def get_tags_rev(repo) -> Dict[str, Dict]: + """Map tags to their respective rev and target node """ + result: Dict[str, Dict] = {} + hgtags_log = repo.file(b".hgtags") + + for trev in hgtags_log: + # TODO: handle edge cases for linkrev + rev = hgtags_log.linkrev(trev) + + oldfnodes = [ + hgtags_log.node(p) + for p in hgtags_log.parentrevs(trev) + if p != mercurial.node.nullrev + ] + newfnodes = [hgtags_log.node(trev)] + + changes = tagsmod.difftags(repo.ui, repo, oldfnodes, newfnodes) + for tag, old, new in changes: + if new is None: # The tag has been removed + del result[tag] + else: # The tag has been added of moved + result[tag] = { + "rev": rev, # rev which added the tag + "target": new, # targeted node + } + + return result + + +class TreeElement(ABC): + """Interface that every element of the tree must respect. + """ + + @abstractmethod + def as_entry_dict(self) -> Dict[str, Any]: + """Return the element as a dict suitable for `Directory.from_dict`. + """ + + def directories(self) -> Iterator[Directory]: + """Yield all directories contained in the element. 
+ """ + yield from [] + + +class TreeDirectory(TreeElement): + """Represent a directory structure. + + It is used to compute recursively compute the sha1_git of all the elements of the + tree. + + When incremental loading will be implemented, it will be used to recaclulate only + the changed parts. + """ + + def __init__(self, name: Optional[bytes]) -> None: + self._name = name + self._files: Dict[bytes, TreeFile] = {} + self._dirs: Dict[bytes, TreeDirectory] = {} + self._sha1_git: Optional[bytes] = None + + def add_file(self, path: bytes, perms: int, sha1_git: bytes) -> None: + """Add a file to the tree by its full path. + """ + path, filename = os.path.split(path) + parts = path_split_all(path) + + current_dir = self + for part in parts: + current_dir = current_dir._add_tree(part) + current_dir._add_file(filename, perms, sha1_git) + + def _add_tree(self, name: bytes) -> "TreeDirectory": + """Add a tree to the tree by is name. + """ + if name not in self._dirs: + self._dirs[name] = TreeDirectory(name) + return self._dirs[name] + + def _add_file(self, name: bytes, perms: int, sha1_git: bytes) -> None: + """Add a file to the tree by is name. + """ + if name in self._files: + raise Exception(f"name {name.decode()} already exists") + if name in self._dirs: + raise Exception(f"name {name.decode()} already is an existing directory") + self._files[name] = TreeFile(name, perms, sha1_git) + + def __str__(self): + """Display the tree (for debug purpose only). + """ + name = self._name.decode() if self._name else "/" + files = textwrap.indent("\n".join(map(str, self._files.values())), prefix=" ") + dirs = textwrap.indent("\n".join(map(str, self._dirs.values())), prefix=" ") + return "\n".join([f"{name} ({self.sha1_git().hex()})", files, dirs]) + + def sha1_git(self) -> bytes: + """Compute the hash of the tree. 
+ """ + if self._sha1_git is None: + self._sha1_git = hash_to_bytes( + Directory.compute_hash({"entries": self._entries()}) + ) + return self._sha1_git + + def _entries(self) -> List[Dict[str, Any]]: + """List entries of the tree. + """ + file_entries = [f.as_entry_dict() for f in self._files.values()] + dir_entries = [d.as_entry_dict() for d in self._dirs.values()] + return file_entries + dir_entries + + def as_entry_dict(self) -> Dict[str, Any]: + """Return the tree as a dict suitable for `Directory.from_dict`. + """ + return { + "type": "dir", + "perms": DIR_PERM, + "name": self._name, + "target": self.sha1_git(), + } + + def directories(self) -> Iterator[Directory]: + """Yield all directories contained in the tree. + """ + for item in self._dirs.values(): + yield from item.directories() + + yield Directory.from_dict( + {"id": self.sha1_git(), "entries": self._entries(),} + ) + + +class TreeFile(TreeElement): + """Represent a file in a tree. + """ + + def __init__(self, name: bytes, perms: int, sha1_git: bytes) -> None: + self._name = name + self._perms = perms + self._sha1_git = sha1_git + + def __str__(self): + """Display the file (for debug purpose only). + """ + return f"{self._name.decode()} ({self._sha1_git.hex()})" + + def as_entry_dict(self) -> Dict[str, Any]: + """Return the file as a dict suitable for `Directory.from_dict`. + """ + return { + "type": "file", + "perms": self._perms, + "name": self._name, + "target": self._sha1_git, + } + + +def author_dict_from_str(author: bytes) -> Dict[str, bytes]: + result = Person.from_fullname(author).to_dict() + # git requires a space between name and email. 
Other wise the hash differs + # the fullname is used when present in the hash function + result["fullname"] = result["name"] + b" <" + result["email"] + b">" + return result + + +if __name__ == "__main__": + import logging + + import click + + logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" + ) + + @click.command() + @click.option("--origin-url", help="origin url") + @click.option("--hg-directory", help="Path to mercurial repository to load") + @click.option("--visit-date", default=None, help="Visit date") + def main(origin_url, hg_directory, visit_date): + if not visit_date: + visit_date = datetime.datetime.now(tz=datetime.timezone.utc) + + return HgLoaderFromDisk().load(origin_url, hg_directory, visit_date) + + main() Index: swh/loader/mercurial/tests/conftest.py =================================================================== --- swh/loader/mercurial/tests/conftest.py +++ swh/loader/mercurial/tests/conftest.py @@ -4,16 +4,19 @@ # See top-level LICENSE file for more information import os -import yaml +from typing import Any, Dict import pytest +import yaml -from typing import Any, Dict + +@pytest.fixture +def swh_storage_backend_config(): + yield {"cls": "validate", "storage": {"cls": "memory",}} @pytest.fixture def swh_loader_config(swh_storage_backend_config, tmp_path) -> Dict[str, Any]: - swh_storage_backend_config["journal_writer"] = {} return { "storage": { "cls": "pipeline", Index: swh/loader/mercurial/tests/data/expected-load-from-disk.json =================================================================== --- /dev/null +++ swh/loader/mercurial/tests/data/expected-load-from-disk.json @@ -0,0 +1,562 @@ +{ + "objects": { + "d85316f179e32e1df3488429abd5a05278ad5846": { + "type": "blob", + "perm": "100644", + "id": "d85316f179e32e1df3488429abd5a05278ad5846", + "name": ".hgtags", + "cat-file": "5f037e53460e26b01a71a050a8184f03197108b1 sometag\n" + }, + "750c98e88ac0b62fbc525a4cb347ebe214da9752": { + "type": "blob", + 
"perm": "100644", + "id": "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "name": "README.md", + "cat-file": "# My Project\nShort project description.\n" + }, + "20df98ecc09e98b8b05154ed9b774b757c3f008e": { + "type": "blob", + "perm": "100644", + "id": "20df98ecc09e98b8b05154ed9b774b757c3f008e", + "name": "multihead", + "cat-file": "new head\n" + }, + "f102a9cadfa89ce554b3b26d2b90bfba2e05273c": { + "type": "blob", + "perm": "100644", + "id": "f102a9cadfa89ce554b3b26d2b90bfba2e05273c", + "name": "__init__.py", + "cat-file": "__version__ = \"0.0.1\"\n" + }, + "e06f9134ae2e1c2a4916d15f557401005d1db178": { + "type": "blob", + "perm": "100644", + "id": "e06f9134ae2e1c2a4916d15f557401005d1db178", + "name": "cli.py", + "cat-file": "# This is the CLI module\n" + }, + "477ecdc3720553aa0fa16820fafeeb3da322477e": { + "type": "blob", + "perm": "100644", + "id": "477ecdc3720553aa0fa16820fafeeb3da322477e", + "name": "utils.py", + "cat-file": "# This is the utils module\n" + }, + "4176e3f1bab97df933a8a22adacd05c4ea27f107": { + "type": "tree", + "perm": "040000", + "id": "4176e3f1bab97df933a8a22adacd05c4ea27f107", + "name": "myproject", + "objects": [ + "f102a9cadfa89ce554b3b26d2b90bfba2e05273c", + "e06f9134ae2e1c2a4916d15f557401005d1db178", + "477ecdc3720553aa0fa16820fafeeb3da322477e" + ] + }, + "4304edb0b155452e632ecbf767b7e344693ea7f0": { + "type": "tree", + "perm": null, + "id": "4304edb0b155452e632ecbf767b7e344693ea7f0", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "20df98ecc09e98b8b05154ed9b774b757c3f008e", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "ea3c57189fcbec5f78b04cf7a659a0afed2e562c": { + "type": "commit", + "tree": "4304edb0b155452e632ecbf767b7e344693ea7f0", + "parents": [ + "d1cc8ca629e80cbc7bc19b504138718b806f2146" + ], + "author": "Full Name 1594824716 +0200", + "committer": "Full Name 1594824716 +0200", + "message": "empty commit", + "id": 
"ea3c57189fcbec5f78b04cf7a659a0afed2e562c" + }, + "d1cc8ca629e80cbc7bc19b504138718b806f2146": { + "type": "commit", + "tree": "4304edb0b155452e632ecbf767b7e344693ea7f0", + "parents": [ + "01750e33294f6445386735cd8080dead28408371" + ], + "author": "Full Name 1594824716 +0200", + "committer": "Full Name 1594824716 +0200", + "message": "Multi head", + "id": "d1cc8ca629e80cbc7bc19b504138718b806f2146" + }, + "18e99f815ed2bd3fa497933bd291fe89f05dbd55": { + "type": "blob", + "perm": "100644", + "id": "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "name": "multihead.1", + "cat-file": "mer. 15 juil. 2020 16:51:54 CEST 1\n" + }, + "3fac4de82b28e6f3b65941553c61ba37101025c3": { + "type": "blob", + "perm": "100644", + "id": "3fac4de82b28e6f3b65941553c61ba37101025c3", + "name": "multihead.2", + "cat-file": "mer. 15 juil. 2020 16:51:54 CEST 2\n" + }, + "8c8abb5f75d4616cabe53c68bf1553be632fc80f": { + "type": "blob", + "perm": "100644", + "id": "8c8abb5f75d4616cabe53c68bf1553be632fc80f", + "name": "multihead.3", + "cat-file": "mer. 15 juil. 2020 16:51:54 CEST 3\n" + }, + "10d28109199a65c259c3f36e1b1097c41b46093c": { + "type": "blob", + "perm": "100644", + "id": "10d28109199a65c259c3f36e1b1097c41b46093c", + "name": "multihead.4", + "cat-file": "mer. 15 juil. 2020 16:51:55 CEST 4\n" + }, + "17307a692ef42756e0502db5184b7b83800171b5": { + "type": "blob", + "perm": "100644", + "id": "17307a692ef42756e0502db5184b7b83800171b5", + "name": "multihead.5", + "cat-file": "mer. 15 juil. 
2020 16:51:55 CEST 5\n" + }, + "abd35930eebbd7b1cf5fa178c831a7c6339494a3": { + "type": "tree", + "perm": null, + "id": "abd35930eebbd7b1cf5fa178c831a7c6339494a3", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "3fac4de82b28e6f3b65941553c61ba37101025c3", + "8c8abb5f75d4616cabe53c68bf1553be632fc80f", + "10d28109199a65c259c3f36e1b1097c41b46093c", + "17307a692ef42756e0502db5184b7b83800171b5", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "3601c816399f1aa78e601749663ec129968d2cb8": { + "type": "commit", + "tree": "abd35930eebbd7b1cf5fa178c831a7c6339494a3", + "parents": [ + "a813e9cc8bb8888981cb08708939ee0c65a87c4a" + ], + "author": "Full Name 1594824715 +0200", + "committer": "Full Name 1594824715 +0200", + "message": "useless commit 5", + "id": "3601c816399f1aa78e601749663ec129968d2cb8" + }, + "10f538b90244d9d31f3ba63582662a60a917c2d7": { + "type": "tree", + "perm": null, + "id": "10f538b90244d9d31f3ba63582662a60a917c2d7", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "3fac4de82b28e6f3b65941553c61ba37101025c3", + "8c8abb5f75d4616cabe53c68bf1553be632fc80f", + "10d28109199a65c259c3f36e1b1097c41b46093c", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "a813e9cc8bb8888981cb08708939ee0c65a87c4a": { + "type": "commit", + "tree": "10f538b90244d9d31f3ba63582662a60a917c2d7", + "parents": [ + "84cbdaacde1ffb3194a02c0f46610154b81f8604" + ], + "author": "Full Name 1594824715 +0200", + "committer": "Full Name 1594824715 +0200", + "message": "useless commit 4", + "id": "a813e9cc8bb8888981cb08708939ee0c65a87c4a" + }, + "5cb65f18160addf2c580d5e73b4cf340dd87f9fb": { + "type": "tree", + "perm": null, + "id": "5cb65f18160addf2c580d5e73b4cf340dd87f9fb", + "name": null, + "objects": [ + 
"d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "3fac4de82b28e6f3b65941553c61ba37101025c3", + "8c8abb5f75d4616cabe53c68bf1553be632fc80f", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "84cbdaacde1ffb3194a02c0f46610154b81f8604": { + "type": "commit", + "tree": "5cb65f18160addf2c580d5e73b4cf340dd87f9fb", + "parents": [ + "0551a2e02efdfa9eda0d337cac252573978e83ff" + ], + "author": "Full Name 1594824715 +0200", + "committer": "Full Name 1594824715 +0200", + "message": "useless commit 3", + "id": "84cbdaacde1ffb3194a02c0f46610154b81f8604" + }, + "0a6fbbb82ccb54c09451a746d55a767b26bff89d": { + "type": "tree", + "perm": null, + "id": "0a6fbbb82ccb54c09451a746d55a767b26bff89d", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "01750e33294f6445386735cd8080dead28408371": { + "type": "commit", + "tree": "0a6fbbb82ccb54c09451a746d55a767b26bff89d", + "parents": [ + "985149e735f63d143cfb5c195f4f1d92c24295fd" + ], + "author": "Full Name 1594824714 +0200", + "committer": "Full Name 1594824714 +0200", + "message": "Added tag sometag for changeset 5f037e53460e", + "id": "01750e33294f6445386735cd8080dead28408371" + }, + "b6ea5429cb51b12e0322ce47e7516d75b541737f": { + "type": "tree", + "perm": null, + "id": "b6ea5429cb51b12e0322ce47e7516d75b541737f", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "3fac4de82b28e6f3b65941553c61ba37101025c3", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "0551a2e02efdfa9eda0d337cac252573978e83ff": { + "type": "commit", + "tree": "b6ea5429cb51b12e0322ce47e7516d75b541737f", + "parents": [ + "7c8dfc35c9e082561a7589ee418d3046b7632d56" + ], + "author": "Full Name 1594824714 +0200", + "committer": "Full 
Name 1594824714 +0200", + "message": "useless commit 2", + "id": "0551a2e02efdfa9eda0d337cac252573978e83ff" + }, + "54e137cbf01711d30a3a7d7822a93afd2301459d": { + "type": "tree", + "perm": null, + "id": "54e137cbf01711d30a3a7d7822a93afd2301459d", + "name": null, + "objects": [ + "d85316f179e32e1df3488429abd5a05278ad5846", + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "18e99f815ed2bd3fa497933bd291fe89f05dbd55", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "7c8dfc35c9e082561a7589ee418d3046b7632d56": { + "type": "commit", + "tree": "54e137cbf01711d30a3a7d7822a93afd2301459d", + "parents": [ + "01750e33294f6445386735cd8080dead28408371" + ], + "author": "Full Name 1594824714 +0200", + "committer": "Full Name 1594824714 +0200", + "message": "useless commit 1", + "id": "7c8dfc35c9e082561a7589ee418d3046b7632d56" + }, + "3dc1f76bc69e3f559bee6253b24fc93acee9e1f9": { + "type": "blob", + "perm": "100644", + "id": "3dc1f76bc69e3f559bee6253b24fc93acee9e1f9", + "name": "__init__.py", + "cat-file": "__version__ = \"0.1.0\"\n" + }, + "e4754e4457515eff6d2772b75c659d3b2e05d0e0": { + "type": "tree", + "perm": "040000", + "id": "e4754e4457515eff6d2772b75c659d3b2e05d0e0", + "name": "myproject", + "objects": [ + "3dc1f76bc69e3f559bee6253b24fc93acee9e1f9", + "e06f9134ae2e1c2a4916d15f557401005d1db178", + "477ecdc3720553aa0fa16820fafeeb3da322477e" + ] + }, + "ecf37a29314efe473b399b700c7e5eacc063ba6e": { + "type": "tree", + "perm": null, + "id": "ecf37a29314efe473b399b700c7e5eacc063ba6e", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "e4754e4457515eff6d2772b75c659d3b2e05d0e0" + ] + }, + "e9b86db719a08e81d7fe7972fa91007fba04bdf7": { + "type": "commit", + "tree": "ecf37a29314efe473b399b700c7e5eacc063ba6e", + "parents": [ + "d4a7b5bfb845c038f49eb4ea2b5d0c2591936b72", + "985149e735f63d143cfb5c195f4f1d92c24295fd" + ], + "author": "Full Name 1594824713 +0200", + "committer": "Full Name 1594824713 +0200", + "message": "Merge default", + "id": 
"e9b86db719a08e81d7fe7972fa91007fba04bdf7" + }, + "181a22e7ad8bbad9bb5846f51c377a7597a0c914": { + "type": "tree", + "perm": null, + "id": "181a22e7ad8bbad9bb5846f51c377a7597a0c914", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "4176e3f1bab97df933a8a22adacd05c4ea27f107" + ] + }, + "985149e735f63d143cfb5c195f4f1d92c24295fd": { + "type": "commit", + "tree": "181a22e7ad8bbad9bb5846f51c377a7597a0c914", + "parents": [ + "787aad15e7800d6cfc385e8020719145fd9d4582" + ], + "author": "Full Name 1594824713 +0200", + "committer": "Full Name 1594824713 +0200", + "message": "Create myproject.utils module", + "id": "985149e735f63d143cfb5c195f4f1d92c24295fd" + }, + "3b93d0be0c9ec9943a1076a0152b8c244261a3a1": { + "type": "blob", + "perm": "100644", + "id": "3b93d0be0c9ec9943a1076a0152b8c244261a3a1", + "name": "__init__.py", + "cat-file": "__version__ = \"0.0.2\"\n" + }, + "6146139c16f3ea99ef1300ac040163569c2f3c2a": { + "type": "tree", + "perm": "040000", + "id": "6146139c16f3ea99ef1300ac040163569c2f3c2a", + "name": "myproject", + "objects": [ + "3b93d0be0c9ec9943a1076a0152b8c244261a3a1", + "e06f9134ae2e1c2a4916d15f557401005d1db178" + ] + }, + "09a1bb68db049b4e37540e52ebde76f59126b3a8": { + "type": "tree", + "perm": null, + "id": "09a1bb68db049b4e37540e52ebde76f59126b3a8", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "6146139c16f3ea99ef1300ac040163569c2f3c2a" + ] + }, + "71eb3f80edfc3a9bfa0e61b2960e54fc2d891cbf": { + "type": "commit", + "tree": "09a1bb68db049b4e37540e52ebde76f59126b3a8", + "parents": [ + "0935616bb4f228840859c60be809fb138f103049", + "787aad15e7800d6cfc385e8020719145fd9d4582" + ], + "author": "Full Name 1594824712 +0200", + "committer": "Full Name 1594824712 +0200", + "message": "Close branch v0.0.2", + "id": "71eb3f80edfc3a9bfa0e61b2960e54fc2d891cbf" + }, + "5b676f6c42567c33db8fbb6d9480af1098c99e09": { + "type": "tree", + "perm": "040000", + "id": "5b676f6c42567c33db8fbb6d9480af1098c99e09", + "name": 
"myproject", + "objects": [ + "3dc1f76bc69e3f559bee6253b24fc93acee9e1f9", + "e06f9134ae2e1c2a4916d15f557401005d1db178" + ] + }, + "048960a9eff9a9f22ce2fc2e2bc9b5f73cdfc26a": { + "type": "tree", + "perm": null, + "id": "048960a9eff9a9f22ce2fc2e2bc9b5f73cdfc26a", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "5b676f6c42567c33db8fbb6d9480af1098c99e09" + ] + }, + "d4a7b5bfb845c038f49eb4ea2b5d0c2591936b72": { + "type": "commit", + "tree": "048960a9eff9a9f22ce2fc2e2bc9b5f73cdfc26a", + "parents": [ + "787aad15e7800d6cfc385e8020719145fd9d4582" + ], + "author": "Full Name 1594824712 +0200", + "committer": "Full Name 1594824712 +0200", + "message": "Bump version to 0.1.0", + "id": "d4a7b5bfb845c038f49eb4ea2b5d0c2591936b72" + }, + "900ca0f6985cda749e84f11d94d00301039a6e25": { + "type": "tree", + "perm": "040000", + "id": "900ca0f6985cda749e84f11d94d00301039a6e25", + "name": "myproject", + "objects": [ + "f102a9cadfa89ce554b3b26d2b90bfba2e05273c", + "e06f9134ae2e1c2a4916d15f557401005d1db178" + ] + }, + "d476a11ddfcfce07236a0a03f78e3c1a73bc20ae": { + "type": "tree", + "perm": null, + "id": "d476a11ddfcfce07236a0a03f78e3c1a73bc20ae", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "900ca0f6985cda749e84f11d94d00301039a6e25" + ] + }, + "787aad15e7800d6cfc385e8020719145fd9d4582": { + "type": "commit", + "tree": "d476a11ddfcfce07236a0a03f78e3c1a73bc20ae", + "parents": [ + "38c8e35e88e0b4fecfc5419759d2d1a7f77961ac" + ], + "author": "Full Name 1594824711 +0200", + "committer": "Full Name 1594824711 +0200", + "message": "Create myproject.cli module", + "id": "787aad15e7800d6cfc385e8020719145fd9d4582" + }, + "1af0cb6a4d557fbb7493880b3fe88f19f7b4575b": { + "type": "tree", + "perm": "040000", + "id": "1af0cb6a4d557fbb7493880b3fe88f19f7b4575b", + "name": "myproject", + "objects": [ + "3b93d0be0c9ec9943a1076a0152b8c244261a3a1" + ] + }, + "218ccb1594f7026492c72309974b44aba353d7dc": { + "type": "tree", + "perm": null, + "id": 
"218ccb1594f7026492c72309974b44aba353d7dc", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "1af0cb6a4d557fbb7493880b3fe88f19f7b4575b" + ] + }, + "0935616bb4f228840859c60be809fb138f103049": { + "type": "commit", + "tree": "218ccb1594f7026492c72309974b44aba353d7dc", + "parents": [ + "38c8e35e88e0b4fecfc5419759d2d1a7f77961ac" + ], + "author": "Full Name 1594824710 +0200", + "committer": "Full Name 1594824710 +0200", + "message": "Bump version to 0.0.2", + "id": "0935616bb4f228840859c60be809fb138f103049" + }, + "9754545c661d7fbb00e687c5b74bf7ca4cce9130": { + "type": "tree", + "perm": "040000", + "id": "9754545c661d7fbb00e687c5b74bf7ca4cce9130", + "name": "myproject", + "objects": [ + "f102a9cadfa89ce554b3b26d2b90bfba2e05273c" + ] + }, + "0dad640e1eb9f31cb9d874158318f1f180be9b3a": { + "type": "tree", + "perm": null, + "id": "0dad640e1eb9f31cb9d874158318f1f180be9b3a", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752", + "9754545c661d7fbb00e687c5b74bf7ca4cce9130" + ] + }, + "38c8e35e88e0b4fecfc5419759d2d1a7f77961ac": { + "type": "commit", + "tree": "0dad640e1eb9f31cb9d874158318f1f180be9b3a", + "parents": [ + "700f9c2117d045906cff1472de3d7de33c6e3559" + ], + "author": "Full Name 1594824710 +0200", + "committer": "Full Name 1594824710 +0200", + "message": "Create python package", + "id": "38c8e35e88e0b4fecfc5419759d2d1a7f77961ac" + }, + "93e88b135dc8c3420cd4984e21d8d1eb2781ddce": { + "type": "tree", + "perm": null, + "id": "93e88b135dc8c3420cd4984e21d8d1eb2781ddce", + "name": null, + "objects": [ + "750c98e88ac0b62fbc525a4cb347ebe214da9752" + ] + }, + "700f9c2117d045906cff1472de3d7de33c6e3559": { + "type": "commit", + "tree": "93e88b135dc8c3420cd4984e21d8d1eb2781ddce", + "parents": [ + "c100ba1d499e14e5d1992741a4260c134a6cf1a4" + ], + "author": "Full Name 1594824710 +0200", + "committer": "Full Name 1594824710 +0200", + "message": "Add project description", + "id": "700f9c2117d045906cff1472de3d7de33c6e3559" + }, + 
"a2beefd59223ea16000788d77e62f96bdaf23c7c": { + "type": "blob", + "perm": "100644", + "id": "a2beefd59223ea16000788d77e62f96bdaf23c7c", + "name": "README.md", + "cat-file": "# My Project\n" + }, + "fa5e6af79e30fc26ab4acbd96388fde22b4c2f36": { + "type": "tree", + "perm": null, + "id": "fa5e6af79e30fc26ab4acbd96388fde22b4c2f36", + "name": null, + "objects": [ + "a2beefd59223ea16000788d77e62f96bdaf23c7c" + ] + }, + "c100ba1d499e14e5d1992741a4260c134a6cf1a4": { + "type": "commit", + "tree": "fa5e6af79e30fc26ab4acbd96388fde22b4c2f36", + "parents": [], + "author": "Full Name 1594824709 +0200", + "committer": "Full Name 1594824709 +0200", + "message": "Add README", + "id": "c100ba1d499e14e5d1992741a4260c134a6cf1a4" + } + }, + "branches": { + "tag/sometag": { + "target": "985149e735f63d143cfb5c195f4f1d92c24295fd", + "target_type": "revision" + }, + "branch/default": { + "target": "3601c816399f1aa78e601749663ec129968d2cb8", + "target_type": "revision" + }, + "wild-branch/default/ea3c57189fcbec5f78b04cf7a659a0afed2e562c": { + "target": "ea3c57189fcbec5f78b04cf7a659a0afed2e562c", + "target_type": "revision" + }, + "branch/v0.0.2": { + "target": "71eb3f80edfc3a9bfa0e61b2960e54fc2d891cbf", + "target_type": "revision" + }, + "branch/v0.1.x": { + "target": "e9b86db719a08e81d7fe7972fa91007fba04bdf7", + "target_type": "revision" + } + } +} \ No newline at end of file Index: swh/loader/mercurial/tests/data/make_example.sh =================================================================== --- /dev/null +++ swh/loader/mercurial/tests/data/make_example.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# Create an example repository and convert it to git using hg-git to extract +# expected disk load data to be used in loder tests. + +set -e + +# Create a mercurial venv with the tools required to build the test example. 
+[ -e "hg-git" ] || python -m venv hg-git +[ -e "hg-git/bin/hg" ] || ./hg-git/bin/pip install \ + "hg-git==0.9.0a1" \ + "mercurial==5.4.1" \ + "hg-evolve==10.0.0" \ + "dulwich<0.20" +PATH="$PWD/hg-git/bin:$PATH" + +# Create and use a local mercurial configuration. +HOME="$PWD/hg-git" # Mercurial has no way to specify an alternative config file +[ -e "$HOME/.hgrc" ] || { + echo "[ui]" + echo "username = Full Name" + echo "[extensions]" + echo "hggit=" +} >> "$HOME/.hgrc" + +# Create a repository with all tested cases. +# This is a function to allow muting of repository generation +# to debug other parts of the script like expected disk load +# or archive generation. +create_example_repo() { + [ -e "example" ] && rm -rf example + hg init example/repo + cd example/repo + + # simple file + echo "# My Project" >> README.md + hg add README.md + hg commit -m "Add README" + + # file with modification + echo "Short project description." >> README.md + hg commit -m "Add project description" + + # file in directory + mkdir -p myproject + echo '__version__ = "0.0.1"' >> myproject/__init__.py + hg add myproject/__init__.py + hg commit -m "Create python package" + + # public changesets + hg phase --public -r ::. 
+ + # closed branch + hg branch v0.0.2 + echo '__version__ = "0.0.2"' > myproject/__init__.py + hg commit -m "Bump version to 0.0.2" + hg update default + echo "# This is the CLI module" >> myproject/cli.py + hg add myproject/cli.py + hg commit -m "Create myproject.cli module" + hg update v0.0.2 + hg merge -r default + hg commit --close-branch -m "Close branch v0.0.2" + hg update default + + # living branch + hg branch v0.1.x + echo '__version__ = "0.1.0"' > myproject/__init__.py + hg commit -m "Bump version to 0.1.0" + hg update default + echo "# This is the utils module" >> myproject/utils.py + hg add myproject/utils.py + hg commit -m "Create myproject.utils module" + hg update v0.1.x + hg merge -r default + hg commit -m "Merge default" + hg update default + + # tag + hg tag sometag + + # multihead + for i in $(seq 5); do + echo "$i" + echo "$(date) $i" >> "multihead.$i" + hg add "multihead.$i" + hg commit -m "useless commit $i" + done + hg up .~5 + echo "new head" >> "multihead" + hg add multihead + hg commit -m "Multi head" + + # empty commit + hg commit --config ui.allowemptycommit=yes -m "empty commit" + + # create bookmarks to create git branches + hg heads --closed --template="hg bookmark -r {rev} {rev}-{branch};" | sh + + # Create bare git from mercurial repository + hg gexport + + cd ../.. +} + +# Create a repository with all tested use cases. +create_example_repo + +# Create the expected disk load from git repository. +./make_expected.py example/repo/.hg/git expected-load-from-disk.json + +# Create archive. +tar --directory=./example/ -czf example.tar.gz repo Index: swh/loader/mercurial/tests/data/make_expected.py =================================================================== --- /dev/null +++ swh/loader/mercurial/tests/data/make_expected.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python + +""" Extract expected disk load data to be used in loader tests. 
+ + This script is used as part of make_example.sh +""" + +import hashlib +import json +import subprocess +import sys + + +class Repository: + def __init__(self, path): + self._path = path + + def __call__(self, *args): + return subprocess.check_output(["git", *args], cwd=self._path) + + def commits(self): + return self("log", "--format=%H", "--branches").decode().splitlines() + + def cat_file_content(self, hash): + return self("cat-file", "-p", hash) + + def refs(self): + result = [ + line.split(" ", 2) + for line in self( + "for-each-ref", "--format=%(objectname) %(objecttype) %(refname)" + ) + .decode() + .splitlines() + ] + return result + + def heads(self): + return [ + {"target": objectname, "name": refname[len("refs/heads/") :]} + for objectname, objecttype, refname in self.refs() + if refname.startswith("refs/heads/") + ] + + def tags(self): + result = [ + {"target": objectname, "name": refname[len("refs/tags/") :]} + for objectname, objecttype, refname in self.refs() + if refname.startswith("refs/tags/") + ] + return result + + def head_ref(self): + return self("describe", "--all", "HEAD") + + +def git_obj_from_line(repo, line): + obj_perm, obj_type, obj_hash, obj_name = line.decode().split(None, 3) + type_map = { + "tree": Tree, + "blob": Blob, + } + return type_map[obj_type](repo, obj_hash, obj_perm, obj_name) + + +class GitObj: + def __init__(self, repo, hash): + self._repo = repo + self._hash = hash + + def id(self): + return self._hash + + def cat_file(self): + return self._repo.cat_file_content(self._hash) + + +class Commit(GitObj): + def __init__(self, repo, hash, commits): + super().__init__(repo, hash) + self._commits = commits + self._tree = None + self._object = None + + def id(self): + return self.object()["id"] + + def tree(self): + if self._tree is None: + self._tree = Tree(self._repo, self.object()["tree"]) + return self._tree + + def rm_hg_metadata(self, data): + """ Removes metadata added by hg-git. 
""" + lines = [] + + for line in data.split(b"\n"): + if line.startswith(b"HG:"): + continue + if b"--HG--" in line: + break + lines.append(line) + + return b"\n".join(lines).strip() + + def cat_parts(self): + """ Return commit object parts. + + Used to rehash the commit id with removed hg-git metadata. + """ + data = self.rm_hg_metadata(self.cat_file()) + parts = {"parents": []} + + line, remaining = data.split(b"\n", 1) + parts["tree"] = line.split()[1] + + while remaining.startswith(b"parent"): + line, remaining = remaining.split(b"\n", 1) + parts["parents"].append(line.split()[1]) + + author, committer, _, message = remaining.split(b"\n", 3) + + parts["author"] = author.split(b" ", 1)[1] + parts["committer"] = committer.split(b" ", 1)[1] + parts["message"] = message + + return parts + + def object(self): + """ Return the object as dict for test expected result. """ + if self._object is None: + parts = self.cat_parts() + self._object = { + "type": "commit", + "tree": parts["tree"].decode(), + "parents": [], + "author": parts["author"].decode(), + "committer": parts["committer"].decode(), + "message": parts["message"].decode(), + } + + # replace parents with re-hashed ids (removed hg-git metadata) + for parent in map(bytes.decode, parts["parents"]): + if parent not in self._commits: + self._commits[parent] = Commit(self._repo, parent, self._commits) + self._object["parents"].append(self._commits[parent].id()) + parts["parents"] = [parent.encode() for parent in self._object["parents"]] + + # re-hash id because parents id has changed + self._object["id"] = commit_id_from_parts(parts) + + return self._object + + def objects(self): + """ Return a stream of objects composing the commit. 
""" + yield from self.tree().objects() + yield self.object() + + +def commit_id_from_parts(parts): + data = b"\n".join( + [ + b"tree " + parts["tree"], + *[b"parent " + parent for parent in parts["parents"]], + b"author " + parts["author"], + b"committer " + parts["committer"], + b"\n" + parts["message"], + ] + ) + return commit_id(data) + + +def commit_id(data): + data_len = str(len(data)).encode() + commit = b"commit %b\0%b" % (data_len, data) + hash = hashlib.sha1(commit) + return hash.hexdigest() + + +class Tree(GitObj): + def __init__(self, repo, hash, perm=None, name=None): + super().__init__(repo, hash) + self._perm = perm + self._name = name + self._items = None + + def items(self): + if self._items is None: + self._items = [ + git_obj_from_line(self._repo, line) + for line in self.cat_file().splitlines() + ] + return self._items + + def content(self): + for obj in self.items(): + yield from obj.content() + + def object(self): + """ Return the object as dict for test expected result. """ + return { + "type": "tree", + "perm": self._perm, + "id": self.id(), + "name": self._name, + "objects": [obj.id() for obj in self.items()], + } + + def objects(self): + """ Return a stream of objects composing the tree. """ + for obj in self.items(): + yield from obj.objects() + yield self.object() + + +class Blob(GitObj): + def __init__(self, repo, hash, perm=None, name=None): + super().__init__(repo, hash) + self._perm = perm + self._name = name + + def content(self): + yield self._hash + + def object(self): + """ Return the object as dict for test expected result. """ + return { + "type": "blob", + "perm": self._perm, + "id": self.id(), + "name": self._name, + "cat-file": self.cat_file().decode(), + } + + def objects(self): + """ Return self as a stream. 
""" + yield self.object() + + +def make_expected(path): + repo = Repository(path) + + commits_cache = {} + objects = [] + for hash in repo.commits(): + # calling commit.objects() will add parent commits + # make sure to not redo the work twice + if hash not in commits_cache: + commits_cache[hash] = Commit(repo, hash, commits_cache) + objects.extend(commits_cache[hash].objects()) + + refs = {} + for tag in repo.tags(): + target = commits_cache[tag["target"]].id() + refs[f"tag/{tag['name']}"] = { + "target": target, + "target_type": "revision", + } + + seen_branches = [] + for head in sorted(repo.heads(), key=lambda h: h["name"]): + target = commits_cache[head["target"]].id() + rev, name = head["name"].split("-", 1) + if name in seen_branches: + refs[f"wild-branch/{name}/{target}"] = { + "target": target, + "target_type": "revision", + } + else: + refs[f"branch/{name}"] = { + "target": target, + "target_type": "revision", + } + seen_branches.append(name) + + return {"objects": {obj["id"]: obj for obj in objects}, "branches": refs} + + +if __name__ == "__main__": + expected = make_expected(sys.argv[1]) + with open(sys.argv[2], "w") as output: + json.dump(expected, output, indent=2) Index: swh/loader/mercurial/tests/test_from_disk.py =================================================================== --- /dev/null +++ swh/loader/mercurial/tests/test_from_disk.py @@ -0,0 +1,118 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import os + +import pytest + +from swh.loader.mercurial.from_disk import HgLoaderFromDisk +from swh.loader.tests import ( + assert_last_visit_matches, + check_snapshot, + get_stats, + prepare_repository_from_archive, +) +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import snapshot_identifier +from 
swh.model.model import RevisionType + + 
+@pytest.fixture +def expected_data_to_check(swh_config, datadir): + """Fixture to load expected.json data to check against a loading visit + + @acezar: + + -> swh_config: check conftest/swh_config fixture pytest (that writes to disk + some config the loader is supposed to know how to load). You can actually + override it here by redefining here as you see fit. A priori, you should not have + to. + + -> datadir: target folder from an "absolute" point of view the "./data" folder so it + makes sense for the tests to retrieve file for example + + """ + pathname = "expected-load-from-disk.json" # file lives in datadir ("./data") + path = os.path.join(datadir, pathname) + assert os.path.exists(path), f"No file {pathname} found! Could not test." + with open(path, "r") as f: + return json.load(f) + + +def test_visit_from_disk(expected_data_to_check, datadir, tmp_path): + """Loading a repository from disk should yield 1 snapshot + + """ + archive_path = os.path.join(datadir, "example.tar.gz") + repo_url = prepare_repository_from_archive(archive_path, "repo", tmp_path) + # better yet, use an existing repository already ingested? <- Why ? + + loader = HgLoaderFromDisk(repo_url, repo_url) + + # at what point does this need to be loaded? 
+ expected_data = expected_data_to_check + + expected_contents = [ + obj["id"] for obj in expected_data["objects"].values() if obj["type"] == "blob" + ] + + expected_dir_count = sum( + 1 for obj in expected_data["objects"].values() if obj["type"] == "tree" + ) + expected_person_count = len( + { + obj["author"].rsplit(" ", 2)[0] + for obj in expected_data["objects"].values() + if obj["type"] == "commit" + } + ) + + expected_revisions = { + obj["id"]: obj["tree"] + for obj in expected_data["objects"].values() + if obj["type"] == "commit" + } + + expected_snapshot = {"branches": expected_data["branches"]} + expected_snapshot["id"] = snapshot_identifier( + { + "branches": { + name.encode(): { + "target": branch["target"].encode() + if name == "HEAD" + else hash_to_bytes(branch["target"]), + "target_type": branch["target_type"], + } + for name, branch in expected_snapshot["branches"].items() + } + } + ) + + actual_load_status = loader.load() + assert actual_load_status == {"status": "eventful"} + + stats = get_stats(loader.storage) + assert stats == { + "content": len(expected_contents), + "directory": expected_dir_count, + "origin": 1, # 1 origin + "origin_visit": 1, # with 1 visit + "person": expected_person_count, + "release": 1, + "revision": len(expected_revisions), + "skipped_content": 0, + "snapshot": 1, + } + + assert_last_visit_matches( + loader.storage, + repo_url, + type=RevisionType.MERCURIAL.value, + status="full", + snapshot=hash_to_bytes(expected_snapshot["id"]), + ) + + check_snapshot(expected_snapshot, loader.storage)