"""Loader for mercurial repositories cloned on disk.

Walks a local mercurial clone and converts its file contents, directories,
revisions, releases (tags) and branches into Software Heritage model objects.
"""
import os
import textwrap
from datetime import datetime, timezone
from typing import Any, Dict, Iterable, Iterator, List, Optional

import mercurial.ui  # type: ignore
from mercurial import hg
from mercurial import node as hgnode
from mercurial import tags as tagsmod

from swh.loader.core.loader import DVCSLoader
from swh.model.hashutil import MultiHash, hash_to_bytes
from swh.model.model import (
    BaseContent,
    Content,
    Directory,
    ObjectType,
    Origin,
    Person,
    Release,
    Revision,
    RevisionType,
    Snapshot,
    TargetType,
    Timestamp,
    TimestampWithTimezone,
)

# Git mode of a tree (directory) entry.
DIR_PERM = 0o040000

# Mercurial manifest flag -> git-compatible file mode.
FLAG_PERMS = {
    b"l": 0o120000,  # symlink
    b"x": 0o100755,  # executable
    b"": 0o100644,  # regular file
}  # type: Dict[bytes, int]


class HgLoaderFromDisk(DVCSLoader):
    """Load a mercurial repository from a cloned mercurial directory."""

    CONFIG_BASE_FILENAME = "loader/mercurial"

    visit_type = "hg"

    def __init__(
        self,
        url: str,
        logging_class="swh.loader.mercurial.Loader",
        visit_date: Optional[datetime] = None,
        directory: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Args:
            url: url of the origin to load
            logging_class: name used to instantiate the logger
            visit_date: date of the visit
            directory: path of the local mercurial clone to load
            config: configuration dict overriding the configuration file
        """
        super().__init__(logging_class=logging_class, config=config or {})
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

    def prepare_origin_visit(self, *args, **kwargs) -> None:
        """First step executed by the loader to prepare origin and visit
        references. Set/update self.origin, and
        optionally self.origin_url, self.visit_date.
        """
        self.origin = Origin(url=self.origin_url)

    def prepare(self, *args, **kwargs):
        """Second step executed by the loader to prepare some state needed by
        the loader: open the repository, pre-compute the hashes of every file
        of every revision, and resolve which revision created each tag.
        """
        ui = mercurial.ui.ui.load()
        # unfiltered() exposes changesets that repoview filters would
        # normally hide (e.g. obsolete ones)
        self.repo = hg.repository(ui, self.directory.encode()).unfiltered()
        self.hashes_cache = build_hashes_cache(self.repo)
        self.tags_rev = get_tags_rev(self.repo)

    def save_data(self) -> None:
        """Save the data associated to the current load"""
        # TODO ?
        return

    def fetch_data(self) -> bool:
        """Fetch the data from the data source."""
        # nothing to fetch: everything is already in the local clone
        return False

    def has_contents(self) -> bool:
        """Checks whether we need to load contents"""
        return True

    def get_contents(self) -> Iterable[BaseContent]:
        """Get the contents that need to be loaded"""
        for rev in self.hashes_cache:
            ctx = self.repo[rev]

            for filename in self.hashes_cache[rev]:
                fctx = ctx[filename]
                data = fctx.data()
                hashes = self.hashes_cache[rev][filename]

                yield Content(
                    data=data, status="visible", length=len(data), **hashes,
                )

    def has_directories(self) -> bool:
        """Checks whether we need to load directories"""
        # TODO
        return True

    def get_directories(self) -> Iterable[Directory]:
        """Get the directories that need to be loaded.

        Side effect: records in ``self.rev_dir_hashes`` the sha1_git of the
        root directory of each revision, used later by get_revisions().
        """
        self.rev_dir_hashes: Dict[bytes, bytes] = {}
        for rev in self.hashes_cache:
            tree = Tree()

            ctx = self.repo[rev]
            for filepath in self.hashes_cache[rev]:
                fctx = ctx[filepath]
                tree.add_file(
                    path=filepath,
                    perms=FLAG_PERMS[fctx.flags()],
                    sha1_git=self.hashes_cache[rev][filepath]["sha1_git"],
                )

            yield from tree.directories()
            self.rev_dir_hashes[rev] = tree.sha1_git()

    def has_revisions(self) -> bool:
        """Checks whether we need to load revisions"""
        # TODO
        return True

    def get_revisions(self) -> Iterable[Revision]:
        """Get the revisions that need to be loaded.

        Side effect: records in ``self._node_hashes`` the mapping from
        mercurial node ids to swh revision ids, used later by
        get_releases() and get_snapshot().
        """
        self._node_hashes: Dict[bytes, bytes] = {}
        for rev in self.repo:
            ctx = self.repo[rev]
            parents = tuple(
                [
                    # revisions are iterated in increasing order, so parent
                    # revisions are guaranteed to be in the mapping already
                    self._node_hashes[p.node()]
                    for p in ctx.parents()
                    if p.node() != hgnode.nullid
                ]
            )

            author = author_dict_from_str(ctx.user())
            rev_date = get_ctx_date(ctx).to_dict()
            metadata = {f"hg-extra-{k}": v for k, v in ctx.extra().items()}

            revision = {
                "author": author,
                "date": rev_date,
                "committer": author,
                "committer_date": rev_date,
                "type": RevisionType.MERCURIAL,
                "directory": self.rev_dir_hashes[rev],
                "message": ctx.description(),
                "metadata": metadata,
                "synthetic": False,
                "parents": parents,
            }
            revision["id"] = hash_to_bytes(Revision.compute_hash(revision))
            self._node_hashes[ctx.node()] = revision["id"]
            yield Revision.from_dict(revision)

    def has_releases(self) -> bool:
        """Checks whether we need to load releases"""
        # TODO
        return True

    def get_releases(self) -> Iterable[Release]:
        """Get the releases that need to be loaded.

        Every mercurial tag but the implicit "tip" is converted to a
        release whose author and date come from the changeset that added
        the tag to .hgtags.

        Side effect: records in ``self.tags_target`` the target node of
        every tag, used later by get_snapshot().
        """
        self.tags_target = {}
        for (name, nodeid) in self.repo.tags().items():
            target_ctx = self.repo[nodeid]
            self.tags_target[name] = target_ctx.node()
            if name == b"tip":
                # "tip" is an implicit tag on the latest revision, not a
                # real tag stored in .hgtags
                continue

            if nodeid == self.tags_rev[name]["target"]:
                tag_ctx = self.repo[self.tags_rev[name]["rev"]]

                release = {
                    "author": author_dict_from_str(tag_ctx.user()),
                    "date": get_ctx_date(tag_ctx).to_dict(),
                    "name": name,
                    "target": self._node_hashes[target_ctx.node()],
                    "target_type": ObjectType.REVISION,
                    "message": None,
                    "metadata": None,
                    "synthetic": False,
                }
                release["id"] = Release.compute_hash(release)

                yield Release.from_dict(release)
            else:
                # TODO: should not happen -> warning or exception.
                self.log.warning(
                    "tag %r has inconsistent target in .hgtags; "
                    "skipping release",
                    name,
                )

    def get_snapshot(self) -> Snapshot:
        """Get the snapshot that needs to be loaded"""
        branches: Dict[bytes, Any] = {}

        head_branch_map = {}
        for name, heads in self.repo.branchmap().items():
            for head in heads:
                ctx = self.repo[head]
                # TODO: name the head with the highest revision number
                # branch/{branch}, and the others
                # wild-branch/{branch}/{hash} (see heptapod)
                ref = (
                    b"refs/heads/"
                    + name
                    + b"-"
                    + self._node_hashes[ctx.node()].hex().encode()
                )
                branches[ref] = {
                    "target": self._node_hashes[ctx.node()],
                    "target_type": TargetType.REVISION.value,
                }
                head_branch_map[ctx.node()] = ref
        for name, target in self.tags_target.items():
            if name == b"tip":
                pass
                # TODO: expose "tip" as an aliased HEAD branch:
                # branches[b"HEAD"] = {
                #     "target": head_branch_map[target],
                #     "target_type": TargetType.ALIAS.value,
                # }
            else:
                ref = b"refs/tags/" + name
                branches[ref] = {
                    "target": self._node_hashes[target],
                    "target_type": TargetType.REVISION.value,
                }

        snapshot = {"branches": branches}
        snapshot["id"] = hash_to_bytes(Snapshot.compute_hash({"branches": branches}))
        return Snapshot.from_dict(snapshot)

    def load_status(self):
        """Detailed loading status.

        Defaults to logging an eventful load.

        Returns: a dictionary that is eventually passed back as the task's
        result to the scheduler, allowing tuning of the task recurrence
        mechanism.
        """
        return {
            "status": "eventful",
        }


def get_ctx_date(ctx) -> TimestampWithTimezone:
    """Convert a changectx date to a swh TimestampWithTimezone.

    Mercurial dates are ``(unix_timestamp, offset)`` with the offset in
    seconds *west* of UTC, while the swh model expects minutes *east* of
    UTC, hence the sign flip.
    """
    (timestamp, offset) = ctx.date()
    return TimestampWithTimezone(
        timestamp=Timestamp(seconds=int(timestamp), microseconds=0),
        offset=-(offset // 60),
        # negative_utc marks a literal "-0000" timezone string; mercurial
        # offsets are numeric and can never produce it.  Setting it to True
        # for offset == 0 (as the previous code did) alters the hash of
        # every UTC revision.
        negative_utc=False,
    )


def build_hashes_cache(repo):
    """Compute the swh hashes of every file of every revision.

    Returns:
        dict: ``{revision_number: {file_path: hashes_dict}}``
    """
    result = {}

    for rev in repo:
        result[rev] = {}
        ctx = repo[rev]
        manifest = ctx.manifest()

        for filepath in manifest:
            fctx = ctx[filepath]
            data = fctx.data()
            hashes = MultiHash.from_data(data).digest()
            result[rev][filepath] = hashes

    return result


def path_split_all(path):
    """Split a bytes path into the ordered list of all its components."""
    parts = []
    head, tail = os.path.split(path)
    while tail != b"":
        parts.append(tail)
        head, tail = os.path.split(head)
    return list(reversed(parts))


def get_tags_rev(repo):
    """Map each tag name to the revision that added it and its target node.

    Walks the history of the ``.hgtags`` filelog and diffs consecutive
    versions to find, for every tag, the changeset that introduced it.
    """
    result = {}
    hgtags_log = repo.file(b".hgtags")

    for trev in hgtags_log:
        # TODO: handle edge cases for linkrev
        rev = hgtags_log.linkrev(trev)

        oldfnodes = [
            hgtags_log.node(p)
            for p in hgtags_log.parentrevs(trev)
            if p != hgnode.nullrev
        ]
        newfnodes = [hgtags_log.node(trev)]

        changes = tagsmod.difftags(repo.ui, repo, oldfnodes, newfnodes)
        for tag, old, new in changes:
            if new is None:
                # the tag was removed by this changeset
                del result[tag]
            else:
                result[tag] = {"rev": rev, "target": new}

    return result


class Tree:
    """In-memory directory tree used to build swh Directory objects from
    the flat file list of a mercurial manifest."""

    def __init__(self):
        self._root = TreeDirectory(None)

    def __str__(self):
        return str(self._root)

    def add_file(self, path: bytes, perms: int, sha1_git: bytes):
        """Insert a file at ``path``, creating intermediate directories."""
        path, filename = os.path.split(path)
        parts = path_split_all(path)

        current_dir = self._root
        for part in parts:
            current_dir = current_dir.add_dir(part)
        current_dir.add_file(filename, perms, sha1_git)

    def directories(self):
        """Yield every Directory of the tree, deepest first."""
        yield from self._root.directories()

    def sha1_git(self):
        """sha1_git of the root directory."""
        return self._root.sha1_git()


class TreeDirectory:
    """A single directory node of a Tree."""

    def __init__(self, name: bytes) -> None:
        self._name = name
        self._files: Dict[bytes, TreeFile] = {}
        self._dirs: Dict[bytes, TreeDirectory] = {}
        # memoized hash; directories are never mutated after hashing
        self._sha1_git: Optional[bytes] = None

    def add_dir(self, name: bytes) -> "TreeDirectory":
        """Return the child directory ``name``, creating it if needed."""
        if name not in self._dirs:
            self._dirs[name] = TreeDirectory(name)
        return self._dirs[name]

    def add_file(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        """Add a file entry; a name can only exist once per directory."""
        if name in self._files:
            raise Exception(f"name {name.decode()} already exists")
        if name in self._dirs:
            raise Exception(f"name {name.decode()} already is an existing directory")
        self._files[name] = TreeFile(name, perms, sha1_git)

    def __str__(self):
        name = self._name.decode() if self._name else "/"
        files = textwrap.indent("\n".join(map(str, self._files.values())), prefix=" ")
        dirs = textwrap.indent("\n".join(map(str, self._dirs.values())), prefix=" ")
        return "\n".join([f"{name} ({self.sha1_git().hex()})", files, dirs])

    def sha1_git(self) -> bytes:
        """swh identifier of this directory (computed once, then cached)."""
        if self._sha1_git is None:
            self._sha1_git = hash_to_bytes(
                Directory.compute_hash({"entries": self.entries()})
            )
        return self._sha1_git

    def entries(self) -> List[Dict[str, Any]]:
        """Directory entries (files then sub-directories) as model dicts."""
        file_entries = [f.as_entry_dict() for f in self._files.values()]
        dir_entries = [d.as_entry_dict() for d in self._dirs.values()]
        return file_entries + dir_entries

    def as_entry_dict(self) -> Dict[str, Any]:
        """This directory as an entry of its parent directory."""
        return {
            "type": "dir",
            "perms": DIR_PERM,
            "name": self._name,
            "target": self.sha1_git(),
        }

    def directories(self) -> Iterator[Directory]:
        """Yield the Directory of every sub-directory, then this one."""
        for item in self._dirs.values():
            yield from item.directories()

        yield Directory.from_dict(
            {"id": self.sha1_git(), "entries": self.entries()}
        )


class TreeFile:
    """A single file entry of a TreeDirectory."""

    def __init__(self, name: bytes, perms: int, sha1_git: bytes) -> None:
        self._name = name
        self._perms = perms
        self._sha1_git = sha1_git

    def __str__(self):
        return f"{self._name.decode()} ({self._sha1_git.hex()})"

    def as_entry_dict(self) -> Dict[str, Any]:
        """This file as an entry of its parent directory."""
        return {
            "type": "file",
            "perms": self._perms,
            "name": self._name,
            "target": self._sha1_git,
        }

    def directories(self) -> Iterator[Directory]:
        """Files contain no directories."""
        yield from []


def author_dict_from_str(author: bytes) -> Dict[str, bytes]:
    """Parse a mercurial user string into a swh Person dict.

    git requires a space between name and email, otherwise the hash
    differs; the fullname is what the hash functions use.
    """
    result = Person.from_fullname(author).to_dict()
    # Person.from_fullname may return None for a missing name or email
    # (e.g. an author string without "<email>"); fall back to b"" instead
    # of crashing on bytes concatenation.
    name = result["name"] or b""
    email = result["email"] or b""
    result["fullname"] = name + b" <" + email + b">"
    return result


if __name__ == "__main__":
    import logging

    import click

    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s"
    )

    @click.command()
    @click.option("--origin-url", help="origin url")
    @click.option("--hg-directory", help="Path to mercurial repository to load")
    @click.option("--visit-date", default=None, help="Visit date")
    def main(origin_url, hg_directory, visit_date):
        if not visit_date:
            # `datetime` is the class imported at the top of this module;
            # the previous `datetime.datetime.now(...)` raised AttributeError
            visit_date = datetime.now(tz=timezone.utc)

        # The loader is configured through its constructor; load() takes
        # no positional arguments.
        return HgLoaderFromDisk(
            url=origin_url, directory=hg_directory, visit_date=visit_date
        ).load()

    main()
set -e

# Bootstrap a dedicated virtualenv with pinned hg + hg-git versions.
[ -e "hg-git" ] || python -m venv hg-git
[ -e "hg-git/bin/hg" ] || ./hg-git/bin/pip install \
    "hg-git==0.9.0a1" \
    "mercurial==5.4.1" \
    "hg-evolve==10.0.0" \
    "dulwich<0.20"

# Isolate mercurial configuration inside the virtualenv directory.
PATH="$PWD/hg-git/bin:$PATH"
HOME="$PWD/hg-git"
HGRC="$HOME/.hgrc"

# Write a minimal hgrc (username + hggit extension) on first run only.
[ -e "$HGRC" ] || {
    echo "[ui]"
    echo "username = Full Name"
    echo "[extensions]"
    echo "hggit="
} >> "$HGRC"

# Start from a fresh example repository.
[ -e "example" ] && rm -rf example
hg init example/repo
cd example/repo

# --- simple file ---
echo "# My Project" >> README.md
hg add README.md
hg commit -m "Add README"

# --- file with modification ---
echo "Short project description." >> README.md
hg commit -m "Add project description"

# --- file in directory ---
mkdir -p myproject
echo '__version__ = "0.0.1"' >> myproject/__init__.py
hg add myproject/__init__.py
hg commit -m "Create python package"

# Mark everything so far as public changesets.
hg phase --public -r ::.

# --- closed branch ---
hg branch v0.0.2
echo '__version__ = "0.0.2"' > myproject/__init__.py
hg commit -m "Bump version to 0.0.2"
hg update default
echo "# This is the CLI module" >> myproject/cli.py
hg add myproject/cli.py
hg commit -m "Create myproject.cli module"
hg update v0.0.2
hg merge -r default
hg commit --close-branch -m "Close branch v0.0.2"
hg update default

# --- living branch ---
hg branch v0.1.x
echo '__version__ = "0.1.0"' > myproject/__init__.py
hg commit -m "Bump version to 0.1.0"
hg update default
echo "# This is the utils module" >> myproject/utils.py
hg add myproject/utils.py
hg commit -m "Create myproject.utils module"
hg update v0.1.x
hg merge -r default
hg commit -m "Merge default"
hg update default

# TODO tag
# hg tag sometag

# TODO multihead
# for i in `seq 5`; do
#     echo "`date` $i" >> head
#     hg add head
#     hg ci -m "useless commit $i"
# done
# hg up .~5
# echo "new head" >> head
# hg add head
# hg ci -m "Multi head"

# TODO empty commit
# hg commit --config ui.allowemptycommit=yes -m "empty commit"

# create bookmarks to create git
#!/usr/bin/env python

"""Extract expected test results from the repository generated with
make_example.sh (reads the hg-git mirror under example/repo/.hg/git)."""

import hashlib
import json
import subprocess


class Repository:
    """Thin wrapper running git commands inside the mirror repository."""

    def __init__(self, path):
        self._path = path

    def __call__(self, *args):
        """Run ``git *args`` in the repository and return its raw output."""
        return subprocess.check_output(["git", *args], cwd=self._path)

    def commits(self):
        """Hashes of all commits reachable from any branch."""
        return self("log", "--format=%H", "--branches").decode().splitlines()

    def cat_file_content(self, hash):
        """Pretty-printed content of the object ``hash``."""
        return self("cat-file", "-p", hash)

    def refs(self):
        """One ``<objectname> <objecttype> <refname>`` line per ref."""
        return self("for-each-ref", "--format=%(objectname) %(objecttype) %(refname)")

    def head_ref(self):
        """Symbolic description of HEAD."""
        return self("describe", "--all", "HEAD")


def git_obj_from_line(repo, line):
    """Instantiate a Tree or Blob from one ``git cat-file -p <tree>`` line."""
    obj_perm, obj_type, obj_hash, obj_name = line.decode().split()
    type_map = {
        "tree": Tree,
        "blob": Blob,
    }
    try:
        cls = type_map[obj_type]
    except KeyError:
        # fail loudly instead of a bare KeyError if git grows new types
        raise ValueError(f"unhandled git object type: {obj_type!r}")
    return cls(repo, obj_hash, obj_perm, obj_name)


class GitObj:
    """Base class for git objects addressed by hash inside a Repository."""

    def __init__(self, repo, hash):
        self._repo = repo
        self._hash = hash

    def id(self):
        return self._hash

    def cat_file(self):
        """Raw pretty-printed content of this object."""
        return self._repo.cat_file_content(self._hash)


class Commit(GitObj):
    """A git commit whose id is re-hashed after stripping hg-git metadata.

    ``commits`` is a shared cache mapping original git hashes to Commit
    instances, so parent commits are only materialized once.
    """

    def __init__(self, repo, hash, commits):
        super().__init__(repo, hash)
        self._commits = commits
        self._tree = None
        self._object = None

    def id(self):
        # the re-hashed id (hg-git metadata removed), not the on-disk hash
        return self.object()["id"]

    def tree(self):
        """Root Tree of the commit (lazily resolved)."""
        if self._tree is None:
            self._tree = Tree(self._repo, self.object()["tree"])
        return self._tree

    def rm_hg_metadata(self, data):
        """Remove the metadata lines added by hg-git.

        Drops "HG:" header lines and everything from the "--HG--" trailer
        on.
        """
        lines = []

        for line in data.split(b"\n"):
            if line.startswith(b"HG:"):
                continue
            if b"--HG--" in line:
                break
            lines.append(line)

        return b"\n".join(lines).strip()

    def cat_parts(self):
        """Return commit object parts.

        Used to rehash the commit id with removed hg-git metadata.
        """
        data = self.rm_hg_metadata(self.cat_file())
        parts = {"parents": []}

        line, remaining = data.split(b"\n", 1)
        parts["tree"] = line.split()[1]

        while remaining.startswith(b"parent"):
            line, remaining = remaining.split(b"\n", 1)
            parts["parents"].append(line.split()[1])

        # header ends with author, committer, then a blank line
        author, committer, _, message = remaining.split(b"\n", 3)

        parts["author"] = author.split(b" ", 1)[1]
        parts["committer"] = committer.split(b" ", 1)[1]
        parts["message"] = message

        return parts

    def object(self):
        """Return the object as dict for test expected result."""
        if self._object is None:
            parts = self.cat_parts()
            self._object = {
                "type": "commit",
                "tree": parts["tree"].decode(),
                "parents": [],
                "author": parts["author"].decode(),
                "committer": parts["committer"].decode(),
                "message": parts["message"].decode(),
            }

            # replace parents with re-hashed ids (removed hg-git metadata)
            for parent in map(bytes.decode, parts["parents"]):
                if parent not in self._commits:
                    self._commits[parent] = Commit(self._repo, parent, self._commits)
                self._object["parents"].append(self._commits[parent].id())
            parts["parents"] = [parent.encode() for parent in self._object["parents"]]

            # re-hash id because parents id has changed
            self._object["id"] = commit_id_from_parts(parts)

        return self._object

    def objects(self):
        """Return a stream of the objects composing the commit."""
        yield from self.tree().objects()
        yield self.object()


def commit_id_from_parts(parts):
    """Re-serialize commit parts and compute the resulting git commit id."""
    data = b"\n".join(
        [
            b"tree " + parts["tree"],
            *[b"parent " + parent for parent in parts["parents"]],
            b"author " + parts["author"],
            b"committer " + parts["committer"],
            b"\n" + parts["message"],
        ]
    )
    return commit_id(data)


def commit_id(data):
    """git object id of a commit with payload ``data``.

    A git id is the sha1 of ``"<type> <size>\\0<payload>"``.
    """
    commit = b"commit %d\x00%b" % (len(data), data)
    return hashlib.sha1(commit).hexdigest()


class Tree(GitObj):
    """A git tree object."""

    def __init__(self, repo, hash, perm=None, name=None):
        super().__init__(repo, hash)
        self._perm = perm
        self._name = name
        self._items = None

    def items(self):
        """Child Tree/Blob objects (lazily parsed from cat-file)."""
        if self._items is None:
            self._items = [
                git_obj_from_line(self._repo, line)
                for line in self.cat_file().splitlines()
            ]
        return self._items

    def content(self):
        """Stream the blob hashes reachable from this tree."""
        for obj in self.items():
            yield from obj.content()

    def object(self):
        """Return the object as dict for test expected result."""
        return {
            "type": "tree",
            "perm": self._perm,
            "id": self.id(),
            "name": self._name,
            "objects": [obj.id() for obj in self.items()],
        }

    def objects(self):
        """Return a stream of the objects composing the tree."""
        for obj in self.items():
            yield from obj.objects()
        yield self.object()


class Blob(GitObj):
    """A git blob (file content) object."""

    def __init__(self, repo, hash, perm=None, name=None):
        super().__init__(repo, hash)
        self._perm = perm
        self._name = name

    def content(self):
        yield self._hash

    def object(self):
        """Return the object as dict for test expected result."""
        return {
            "type": "blob",
            "perm": self._perm,
            "id": self.id(),
            "name": self._name,
            "cat-file": self.cat_file().decode(),
        }

    def objects(self):
        """Return self as a stream."""
        yield self.object()
""" + yield self.object() + + +def make_expected(path): + commits_cache = {} + objects = [] + repo = Repository(path) + for hash in repo.commits(): + # calling commit.objects() will add parent commits + # make sure to not redo the work twice + if hash not in commits_cache: + commits_cache[hash] = Commit(repo, hash, commits_cache) + objects.extend(commits_cache[hash].objects()) + + branches = {} + refs_target = {} + for line in repo.refs().decode().splitlines(): + target, ref_type, ref = line.split(" ", 2) + _, ref_kind, ref_name = ref.split("/", 2) + if ref_type in ("tag", "commit"): + ref_name = ref_name.lstrip("hg-") + target = commits_cache[target].id() + refs_target[f"refs/{ref_kind}/{ref_name}"] = target + branches[f"refs/{ref_kind}/{ref_name}-{target}"] = { + "target": target, + "target_type": "revision", + } + else: + raise Exception(f"unhandled ref type: {ref_type}") + + return {"objects": {obj["id"]: obj for obj in objects}, "branches": branches} + + +if __name__ == "__main__": + json.dump( + make_expected("example/repo/.hg/git"), + open("example/expected.json", "w"), + indent=2, + ) diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -0,0 +1,118 @@ +import json +import os + +from swh.loader.core.tests import BaseLoaderTest +from swh.loader.mercurial.from_disk import HgLoaderFromDisk +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import snapshot_identifier + + +class TestLoader(HgLoaderFromDisk): + def parse_config_file(self, *args, **kwargs): + return { + "storage": { + "cls": "pipeline", + "steps": [{"cls": "filter"}, {"cls": "memory"},], + }, + "max_content_size": 100 * 1024 * 1024, + "save_data": False, + } + + +class BaseHgLoaderFromDiskTest(BaseLoaderTest): + def setUp(self, archive_name, filename=None): + super().setUp( + archive_name=archive_name, + filename=filename, + 
class BaseHgLoaderFromDiskTest(BaseLoaderTest):
    """Set up an uncompressed copy of the example repository and a
    TestLoader (in-memory storage) pointing at it."""

    def setUp(self, archive_name, filename=None):
        super().setUp(
            archive_name=archive_name,
            filename=filename,
            prefix_tmp_folder_name="swh.loader.mercurial.",
            start_path=os.path.dirname(__file__),
            uncompress_archive=False,
        )
        self.loader = TestLoader(url=self.repo_url, directory=self.destination_path)
        self.storage = self.loader.storage
        self.repo = self.destination_path

    def load(self):
        """Run the loader and return its result dict."""
        return self.loader.load()


class HgLoaderFromDiskTests(BaseHgLoaderFromDiskTest):
    """End-to-end check of HgLoaderFromDisk against the expected.json
    produced by make_expected.py for the example repository."""

    def setUp(self):
        super().setUp(archive_name="example/repo")
        # lazily-loaded content of expected.json
        self._expected = None

    def expected(self):
        """Parsed expected.json, loaded once and cached."""
        if self._expected is None:
            path = os.path.join(os.path.dirname(self.destination_path), "expected.json")
            # context manager so the fixture file handle is not leaked
            with open(path, "r") as fp:
                self._expected = json.load(fp)
        return self._expected

    def expected_content(self):
        """sha1_git of every expected blob."""
        return [
            obj["id"]
            for obj in self.expected()["objects"].values()
            if obj["type"] == "blob"
        ]

    def expected_dir_count(self):
        """Number of expected tree objects."""
        return sum(
            1 for obj in self.expected()["objects"].values() if obj["type"] == "tree"
        )

    def expected_revisions(self):
        """Mapping: expected revision id -> id of its root directory."""
        return {
            obj["id"]: obj["tree"]
            for obj in self.expected()["objects"].values()
            if obj["type"] == "commit"
        }

    def expected_snapshot(self):
        """Expected snapshot dict, with its identifier recomputed from the
        branches (HEAD being an alias, its target stays a branch name)."""
        result = {"branches": self.expected()["branches"]}
        result["id"] = snapshot_identifier(
            {
                "branches": {
                    name.encode(): {
                        "target": branch["target"].encode()
                        if name == "HEAD"
                        else hash_to_bytes(branch["target"]),
                        "target_type": branch["target_type"],
                    }
                    for name, branch in result["branches"].items()
                }
            }
        )
        return result

    def assertContentsContainSha1Git(self, expected):
        """Assert that none of the given sha1_git is missing from storage."""
        self.assertEqual(
            list(
                self.storage.content_missing(
                    [{"sha1_git": hash_to_bytes(sha1_git)} for sha1_git in expected]
                )
            ),
            [],
        )

    def test_load(self):
        result = self.load()
        self.assertEqual(result["status"], "eventful", result)

        self.assertContentsContainSha1Git(self.expected_content())
        self.assertCountDirectories(self.expected_dir_count())

        revisions = self.expected_revisions()
        self.assertCountRevisions(len(revisions))
        self.assertRevisionsContain(revisions)

        snapshot = self.expected_snapshot()
        self.assertCountSnapshots(1)
        self.assertSnapshotEqual(snapshot)

        # no tags in the example repository yet
        self.assertCountReleases(0)

        self.assertEqual(self.loader.load_status(), {"status": "eventful"})
        self.assertEqual(self.loader.visit_status(), "full")

        visit = self.storage.origin_visit_get_latest(self.repo_url)
        self.assertEqual(visit["snapshot"], hash_to_bytes(snapshot["id"]))
        self.assertEqual(visit["status"], "full")