# One `.hgtags` entry: "<40 hex digits> <tag name>"
TAG_PATTERN = re.compile(b"([0-9A-Fa-f]{40}) +(.+)")


class HgAuthor(NamedTuple):
    """Represent a Mercurial revision author."""

    fullname: bytes
    """full name of the author"""

    name: Optional[bytes]
    """name of the author"""

    email: Optional[bytes]
    """email of the author"""

    @staticmethod
    def from_bytes(data: bytes) -> "HgAuthor":
        """Convert bytes to an HgAuthor named tuple.

        Expected format: "name <email>"
        """
        # Function-level import; presumably to keep the CLI start-up time low
        # (see the warning next to the module imports).
        from swh.loader.mercurial.converters import parse_author

        result = parse_author(data)
        return HgAuthor(
            fullname=result["fullname"], name=result["name"], email=result["email"]
        )

    def to_dict(self) -> Dict[str, Optional[bytes]]:
        """Return the author as a plain dict keyed by field name."""
        return {"fullname": self.fullname, "name": self.name, "email": self.email}


HG_REVISION_TEMPLATE = "\n".join(
    [
        "node_id:{node}",
        "author:{author}",
        "timestamp_offset:{date|json}",
        "p1:{p1.node}",
        "p2:{p2.node}",
        "extras:{join(extras, '\nextras:')}",
    ]
)  # Log template for HgRevision.from_bytes

NULL_NODE_ID = b"0" * 40  # Value used when no parent


class HgRevision(NamedTuple):
    """Represent a Mercurial revision."""

    node_id: bytes
    """hex bytes of the revision hash (as printed by the `{node}` template)"""

    author: HgAuthor
    """author of the revision"""

    timestamp: Union[int, float]
    """timestamp of the revision (number decoded from `{date|json}`)"""

    offset: int
    """offset of the revision (number decoded from `{date|json}`)"""

    parents: List[bytes]
    """hex bytes of the revision's parents"""

    extras: Dict[bytes, bytes]
    """metadata of the revision"""

    description: bytes
    """description of the revision"""

    @staticmethod
    def from_bytes(data: bytes, description: bytes) -> "HgRevision":
        """Convert bytes to an HgRevision named tuple.

        Expected data format:
        '''
        node_id:{node}
        author:{author}
        timestamp_offset:[{timestamp}, {offset}]
        p1:{p1}
        p2:{p2}
        extras:{key1}={value1}
        ...
        extras:{keyn}={value}
        '''

        Args:
            data: output of `hg log` rendered with HG_REVISION_TEMPLATE
            description: raw description of the revision

        Raises:
            ValueError: if a line has no `:` separator or an extra has no `=`
        """
        fields: Dict[str, Any] = {
            "parents": [],
            "extras": {},
            "description": description,
        }
        for line in data.split(b"\n"):
            key, value = line.split(b":", 1)
            if key == b"timestamp_offset":
                fields["timestamp"], fields["offset"] = json.loads(value)
            elif key in (b"p1", b"p2"):
                # The null node stands for "no parent" and is not recorded.
                if value != NULL_NODE_ID:
                    fields["parents"].append(value)
            elif key == b"extras":
                extra_key, extra_value = value.split(b"=", 1)
                fields["extras"][extra_key] = extra_value
            elif key == b"author":
                fields["author"] = HgAuthor.from_bytes(value)
            else:
                fields[key.decode()] = value

        return HgRevision(**fields)

    def branch(self) -> bytes:
        """Return the branch named in the extras, `default` when absent."""
        return self.extras.get(b"branch", b"default")

    def to_dict(self) -> Dict:
        """Convert a HgRevision to a dict for SWHID computation"""
        date = normalize_timestamp(int(self.timestamp))

        extra_headers = [
            (b"time_offset_seconds", str(self.offset).encode("utf-8")),
        ]

        for key, value in self.extras.items():
            if key == b"branch" and value == b"default":
                # branch default is skipped to match actual implementation
                continue
            if key == b"transplant_source":
                # transplant_source is converted to hex to match actual implementation
                value = hash_to_bytehex(escape_decode(value)[0])
            extra_headers.append((key, value))

        author = self.author.to_dict()

        return {
            "author": author,
            "date": date,
            "committer": author,
            "committer_date": date,
            "type": RevisionType.MERCURIAL.value,
            "message": self.description,
            "metadata": {"node": self.node_id},
            "extra_headers": tuple(extra_headers),
            "synthetic": False,
            "parents": self.parents,
        }


class HgBranch(NamedTuple):
    """Represent a Mercurial branch."""

    name: bytes
    """name of the branch"""

    node_id: bytes
    """hex bytes of the target revision hash"""


class HgTag(NamedTuple):
    """Represent a Mercurial tag."""

    name: bytes
    """name of the tag"""

    node_id: bytes
    """hex bytes of the target revision"""


class Hg:
    """Provide methods to extract data from a Mercurial repository."""

    def __init__(self, repository_root: Path) -> None:
        self._root = repository_root

    def _output(self, *args) -> bytes:
        """Return the output of a `hg` call."""
        return subprocess.check_output(["hg", *args], cwd=self._root)

    def _call(self, *args) -> None:
        """Perform a `hg` call, discarding its output.

        Raises:
            subprocess.CalledProcessError: if `hg` exits with a non-zero status
        """
        # The output is never read, so it goes to DEVNULL rather than to a
        # pipe: an unread pipe can fill up and block the child process.
        subprocess.check_call(
            ["hg", *args],
            cwd=self._root,
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )

    def root(self) -> Path:
        """Return the root of the Mercurial repository."""
        return self._root

    def log(self, rev: Optional[Union[bytes, str]] = None) -> List[HgRevision]:
        """Return the specified revisions of the Mercurial repository.

        Mercurial revsets are supported. (See `hg help revsets`)

        If no revision range is specified, return all revisions.

        The revisions are returned in the reverse of the order printed by
        `hg log`.
        """
        command: List[Union[bytes, str]] = ["log"]
        if rev:
            command += ["-r", rev]
        command += ["-T", "{node}\n"]

        node_ids = self._output(*command).splitlines()

        return [self._revision(node_id) for node_id in reversed(node_ids)]

    def _revision(self, revision: bytes) -> HgRevision:
        """Build the HgRevision of a single revision."""
        data = self._output("log", "-r", revision, "-T", HG_REVISION_TEMPLATE)

        # hg log strips the description so the raw description has to be taken
        # from debugdata
        _, desc = self._output("debugdata", "-c", revision).split(b"\n\n", 1)

        return HgRevision.from_bytes(data, desc)

    def up(self, rev: bytes) -> None:
        """Update the repository working directory to the specified revision."""
        self._call("up", rev)

    def branches(self) -> List[HgBranch]:
        """List the repository named branches."""
        output = self._output("branches", "-T", "{branch}\n{node}\n\n").strip()

        branches = []

        for block in output.split(b"\n\n"):
            if not block:
                # `hg branches` printed nothing
                continue
            name, node_id = block.splitlines()
            branches.append(HgBranch(name=name, node_id=node_id))

        return branches

    def tip(self) -> HgRevision:
        """Return the `tip` revision."""
        return self.log("tip")[0]

    def tags(self) -> List[HgTag]:
        """Return the repository tags at the current revision.

        The tags are read from the `.hgtags` file of the working directory.
        Entries are keyed by tag name: a later line overrides an earlier one
        for the same name, and a line pointing to the null node removes the
        tag. Lines that do not match TAG_PATTERN are ignored.
        """
        hgtags = self._root / ".hgtags"

        if not hgtags.is_file():
            return []

        tags: Dict[bytes, bytes] = {}

        for line in hgtags.read_bytes().splitlines():
            match = TAG_PATTERN.match(line)
            if match is None:
                continue
            node_id, name = match.groups()
            if node_id == NULL_NODE_ID:
                tags.pop(name, None)
            else:
                tags[name] = node_id

        return [HgTag(name=name, node_id=node_id) for name, node_id in tags.items()]
@click.group()
@click.option(
    "--directory",
    "-d",
    help=("Path to the Mercurial repository. If unset, the current directory is used"),
)
@click.pass_context
def main(ctx, directory=None):
    """Compute the Software Heritage persistent identifier (SWHID) for the given
    source code object(s).

    For more details about SWHIDs see:

    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    """
    # Sub-commands read the repository root from ctx.obj, so make sure it is
    # a dict even when the group is invoked without one.
    ctx.ensure_object(dict)

    if directory:
        root = Path(directory)
    else:
        root = Path()

    if not root.exists():
        raise IOError(f"{root!r} does not exists")

    ctx.obj["HG_ROOT"] = root


def identify_directory(path: Path) -> str:
    """Return the SWHID of the given path.

    Runs `swh-identify` in `path`, excluding `.hg`, and keeps what follows
    the last `:` of the first token it prints.
    """
    command = ["swh-identify", "--exclude", ".hg", "."]
    first_token = subprocess.check_output(command, cwd=path).decode().split()[0]
    return first_token.rpartition(":")[2]


class RevisionIdentity(NamedTuple):
    """Represent a swh revision identity."""

    swhid: bytes
    """SWHID raw bytes"""

    node_id: bytes
    """node_id hex bytes"""

    directory_swhid: bytes

    def dir_uri(self) -> str:
        """Return the SWHID uri of the revision's directory.

        The uri is followed by a tab and the node id of the revision.
        """
        directory_hex = self.directory_swhid.hex()
        node = self.node_id.decode()
        return f"swh:1:dir:{directory_hex}\t{node}"

    def __str__(self) -> str:
        """Return the string representation of a RevisionIdentity.

        That is the revision SWHID uri, a tab, then the node id.
        """
        node = self.node_id.decode()
        uri = swhid("revision", self.swhid.hex())
        return f"{uri}\t{node}"
def identify_revision(
    hg: Hg,
    rev: Optional[bytes] = None,
    node_id_2_swhid: Optional[Dict[bytes, bytes]] = None,
) -> Iterator[RevisionIdentity]:
    """Return the repository revision identities.

    hg: A `Hg` repository instance
    rev: An optional revision or Mercurial revsets (See `hg help revsets`)
        If not provided all the repository revisions will be computed.
    node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
        It will be updated in place with new mappings.

    Only the revisions selected by `rev` are yielded. Ancestors missing from
    the cache are computed first, with an explicit stack rather than
    recursion, so a long history cannot exceed the interpreter recursion
    limit.

    Computing an identity updates the working directory to that revision.
    """
    from swh.model.hashutil import hash_to_bytes
    from swh.model.model import Revision

    if node_id_2_swhid is None:
        node_id_2_swhid = {}

    cache = node_id_2_swhid

    def compute(revision: HgRevision) -> RevisionIdentity:
        """Compute one identity; every parent must already be in the cache."""
        data = revision.to_dict()

        hg.up(revision.node_id)
        directory_swhid = hash_to_bytes(identify_directory(hg.root()))
        data["directory"] = directory_swhid
        data["parents"] = [cache[parent] for parent in revision.parents]

        revision_swhid = hash_to_bytes(Revision.from_dict(data).id)
        cache[revision.node_id] = revision_swhid

        return RevisionIdentity(
            swhid=revision_swhid,
            node_id=revision.node_id,
            directory_swhid=directory_swhid,
        )

    for requested in hg.log(rev):
        pending = [requested]
        while pending:
            current = pending[-1]

            if current is not requested and current.node_id in cache:
                # An ancestor reached through several paths is computed once.
                pending.pop()
                continue

            missing = [parent for parent in current.parents if parent not in cache]
            if missing:
                # Parents first; `current` stays on the stack for later.
                pending.extend(hg.log(parent)[0] for parent in missing)
                continue

            pending.pop()
            identity = compute(current)
            if current is requested:
                yield identity


class ReleaseIdentity(NamedTuple):
    """Represent a swh release identity."""

    swhid: str
    """SWHID hex string"""

    node_id: bytes
    """node_id hex bytes"""

    name: bytes
    """name of the release"""

    def __str__(self) -> str:
        """Return the string representation of a ReleaseIdentity."""
        uri = swhid("release", self.swhid)
        return f"{uri}\t{self.name.decode()}"


def identify_release(
    hg: Hg, node_id_2_swhid: Optional[Dict[bytes, bytes]] = None,
) -> Iterator[ReleaseIdentity]:
    """Return the repository's release identities.

    hg: A `Hg` repository instance
    node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
        If not provided it will be computed using `identify_revision`.

    One release is yielded per tag returned by `hg.tags()`.

    Raises:
        KeyError: if a tag targets a node missing from `node_id_2_swhid`
    """
    from swh.model.model import ObjectType, Release

    if node_id_2_swhid is None:
        node_id_2_swhid = {
            revision.node_id: revision.swhid for revision in identify_revision(hg)
        }

    for tag in hg.tags():
        data = {
            "name": tag.name,
            "target": node_id_2_swhid[tag.node_id],
            "target_type": ObjectType.REVISION.value,
            "message": None,
            "metadata": None,
            "synthetic": False,
            "author": {"name": None, "email": None, "fullname": b""},
            "date": None,
        }

        release_swhid = Release.from_dict(data).id

        yield ReleaseIdentity(
            swhid=release_swhid, node_id=tag.node_id, name=tag.name,
        )
def identify_snapshot(
    hg: Hg,
    node_id_2_swhid: Optional[Dict[bytes, bytes]] = None,
    releases: Optional[List[ReleaseIdentity]] = None,
) -> str:
    """Return the repository snapshot identity.

    hg: A `Hg` repository instance
    node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
        If not provided it will be computed using `identify_revision`.
    releases: an optional list of `ReleaseIdentity`.
        If not provided it will be computed using `identify_release`.

    The snapshot holds a `HEAD` alias to the branch of the tip revision, one
    entry per named branch and one entry per release.
    """
    from swh.model.model import Snapshot, TargetType

    if node_id_2_swhid is None:
        node_id_2_swhid = {}
        for identity in identify_revision(hg):
            node_id_2_swhid[identity.node_id] = identity.swhid

    if releases is None:
        releases = list(identify_release(hg, node_id_2_swhid))

    head_target = hg.tip().branch()
    branches = {
        b"HEAD": {"target": head_target, "target_type": TargetType.ALIAS.value},
    }

    for named_branch in hg.branches():
        branches[named_branch.name] = {
            "target": node_id_2_swhid[named_branch.node_id],
            "target_type": TargetType.REVISION.value,
        }

    # Releases are added last, as in the original insertion order.
    for tagged in releases:
        branches[tagged.name] = {
            "target": tagged.swhid,
            "target_type": TargetType.RELEASE.value,
        }

    return Snapshot.from_dict({"branches": branches}).id
@main.command()
@click.argument("rev", required=False)
@click.pass_context
def revision(ctx, rev):
    """Compute the SWHID of a given revision.

    If specified REV allow to select a single or multiple revisions
    (using the Mercurial revsets language: `hg help revsets`)
    """
    repository = Hg(ctx.obj["HG_ROOT"])

    # click.echo prints the string form of each RevisionIdentity.
    for identity in identify_revision(repository, rev):
        click.echo(identity)


@main.command()
@click.pass_context
def snapshot(ctx):
    """Compute the SWHID of the snapshot."""
    root = ctx.obj["HG_ROOT"]

    snapshot_uri = swhid("snapshot", identify_snapshot(Hg(root)))
    click.echo(f"{snapshot_uri}\t{root}")


@main.command()
@click.pass_context
def all(ctx):
    """Compute the SWHID of all the repository objects."""
    root = ctx.obj["HG_ROOT"]
    hg = Hg(root)

    directory_lines = []
    revision_lines = []
    node_id_2_swhid = {}
    for identity in identify_revision(hg):
        directory_lines.append(identity.dir_uri())
        revision_lines.append(str(identity))
        node_id_2_swhid[identity.node_id] = identity.swhid

    # The revision cache is reused so releases and the snapshot do not
    # recompute the revisions.
    releases = list(identify_release(hg, node_id_2_swhid))
    release_lines = [str(release) for release in releases]

    snapshot_swhid = identify_snapshot(hg, node_id_2_swhid, releases)

    # Output order: directories, revisions, releases, then the snapshot.
    for line in [*directory_lines, *revision_lines, *release_lines]:
        click.echo(line)

    snapshot_uri = swhid("snapshot", snapshot_swhid)
    click.echo(f"{snapshot_uri}\t{root}")


if __name__ == "__main__":
    main()
def abort(message):
    """Abort the script with a message.

    The message is written to stderr, then the current click context is
    aborted.
    """
    click.echo(message, err=True)
    click.get_current_context().abort()


def backup(path: Path) -> None:
    """Rename an existing path.

    The new name is the original one followed by `.bak.<YYYYmmddHHMMSS>`.
    """
    click.echo(f"Creating backup of {path}")
    now = datetime.now()
    backup_path = path.with_suffix(f"{path.suffix}.bak.{now:%Y%m%d%H%M%S}")
    path.rename(backup_path)
    click.echo(f"Backup created: {str(backup_path)!r}")


def _build_repository(script: str) -> Path:
    """Build a repository from a bash script.

    The script receives the target repository path in the `HG_REPO`
    environment variable. The repository path is the script path without its
    `.sh` suffix; an existing path with that name is backed up first.

    Aborts if the script does not exist, does not end in `.sh`, or exits
    with a non-zero status.
    """
    script_path = Path(script).absolute()

    if not script_path.exists():
        abort(f"Path {script_path!r} does not exists.")

    if script_path.suffix != ".sh":
        abort(f"Wrong suffix: {script_path.suffix!r}. Expected: '.sh'")

    repository_path = script_path.with_suffix("")

    if repository_path.exists():
        backup(repository_path)

    click.echo(f"Running build script: {str(script_path)!r}")
    # NOTE(review): `env` replaces the whole environment of the script;
    # presumably intentional to get reproducible builds - confirm.
    returncode = subprocess.call(
        ["bash", "-euo", "pipefail", str(script_path)],
        env={"HG_REPO": str(repository_path)},
    )
    if returncode != 0:
        # Do not hand a half-built repository to the caller.
        abort(
            f"Build script {str(script_path)!r} failed with exit code {returncode}"
        )

    return repository_path
def _build_json(source: str) -> Path:
    """Build the json file of object identities of a repository.

    `source` is a `.tgz` archive (extracted next to itself), a `.sh` build
    script (run through `_build_repository`), or a repository path.

    The identities are read from the output of `swh-hg-identify all` and
    written next to the repository, with a `.json` suffix. Existing
    repository or json paths are backed up before being replaced.

    Aborts if the archive cannot be extracted, if an identity has an unknown
    type, or if no snapshot identity is printed.

    Returns:
        the path of the json file
    """
    if source.endswith(".tgz"):
        archive_path = Path(source).absolute()
        repository_path = archive_path.with_suffix("")

        if repository_path.exists():
            backup(repository_path)

        returncode = subprocess.call(
            ["tar", "-xf", str(archive_path)], cwd=archive_path.parent
        )
        if returncode != 0:
            abort(f"Extraction of {str(archive_path)!r} failed ({returncode})")
    elif source.endswith(".sh"):
        repository_path = _build_repository(source)
    else:
        repository_path = Path(source).absolute()

    click.echo(f"Extracting object identities: {str(repository_path)!r}")
    output = subprocess.check_output(["swh-hg-identify", "all"], cwd=repository_path)

    directory_swhids = []
    revision_swhids = []
    release_swhids = []
    snapshot_swhid = None

    for line in output.decode().splitlines():
        # Each line is "<swhid uri>\t<label>"; only the uri is needed, and
        # the label may itself contain tabs.
        uri, _, _ = line.partition("\t")
        _, _, swhid_type, swhid = uri.split(":")
        if swhid_type == "dir":
            directory_swhids.append(swhid)
        elif swhid_type == "rev":
            revision_swhids.append(swhid)
        elif swhid_type == "rel":
            release_swhids.append(swhid)
        elif swhid_type == "snp":
            snapshot_swhid = swhid
        else:
            abort(f"{line!r} unknown type {swhid_type!r}")

    if snapshot_swhid is None:
        abort(f"No snapshot identity found for {str(repository_path)!r}")

    json_path = repository_path.with_suffix(".json")

    if json_path.exists():
        backup(json_path)

    click.echo(f"Creating object identities file: {str(json_path)!r}")
    json_path.write_text(
        json.dumps(
            {
                "directories": directory_swhids,
                "revisions": revision_swhids,
                "releases": release_swhids,
                "snapshot": snapshot_swhid,
            }
        )
    )

    return json_path


@click.group()
def main():
    """Build example repositories archive from bash scripts."""


@main.command("repository")
@click.argument("script")
def build_repository(script: str):
    """Build a repository.

    SCRIPT must be a bash script with a `.sh` suffix

    The generated repository will have the same path minus the `.sh` suffix.

    The script will be executed in an already initialized repository.
    So it only needs to execute commands to populate the repository.
    """
    _build_repository(script)
@main.command("json")
@click.argument("source")
def build_json(source: str):
    """Build a json file of object identities.

    SOURCE can be a script as required by the `repository` command
    (see repository --help), a repository archive, or an existing repository.

    The produced file will have the source path with the `.json` suffix.
    """
    _build_json(source)


@main.command("archive")
@click.option(
    "--clean", "-c", default=False, is_flag=True, help="Remove created artifacts",
)
@click.argument("source")
def build_archive(source: str, clean: bool = False):
    """Build a repository archive.

    SOURCE can be a script as required by the `repository` command
    (see repository --help), or an existing repository.

    The produced archive will have the source path with the `.tgz` suffix.
    It will contain the repository along with the json file of object identities.
    """
    if source.endswith(".sh"):
        repository_path = _build_repository(source)
    else:
        repository_path = Path(source).absolute()
        if not (repository_path / ".hg").exists():
            abort(f"{str(repository_path)!r} is not a Mercurial repository")

    json_path = _build_json(str(repository_path))

    archive_path = repository_path.with_suffix(".tgz")
    if archive_path.exists():
        backup(archive_path)

    # Members are stored relative to the archive's directory so the archive
    # extracts in place, next to itself.
    base = archive_path.parent
    returncode = subprocess.call(
        [
            "tar",
            "-czf",  # gzip-compressed, as the `.tgz` suffix announces
            str(archive_path.relative_to(base)),
            str(repository_path.relative_to(base)),
            str(json_path.relative_to(base)),
        ],
        cwd=base,
    )
    if returncode != 0:
        # Abort before `--clean` removes the only copy of the artifacts.
        abort(f"Creation of {str(archive_path)!r} failed ({returncode})")

    if clean:
        shutil.rmtree(repository_path)
        json_path.unlink()


if __name__ == "__main__":
    main()
"180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", 
"180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "9cd8160c67ac4b0bc97e2e2cd918a580425167d3", "e2e117569b086ceabeeedee4acd95f35298d4553"], "revisions": ["17a62618eb6e91a1d5d8e1246ccedae020d3b222", "18012a93d5aadc331c468dac84b524430f4abc19", "1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0", "24f45e41637240b7f9e16d2791b5eacb4a406d0f", "25f5b27dfa5ed15d336188ef46bef743d88327d4", "2652147529269778757d96e09aaf081695548218", "2973e5dc9568ac491b198f6b7f10c44ddc04e0a3", "2d4a801c9a9645fcd3a9f4c06418d8393206b1f3", "31cd7c5f669868651c57e3a2ba25ac45f76fa5cf", "32eb0354a660128e205bf7c3a84b46040ef70d92", "34192ceef239b8b72141efcc58b1d7f1676a18c9", "3565e7d385af0745ec208d719e469c2f58be8e94", "3ed4b85d30401fe32ae3b1d650f215a588293a9e", "40def747398c76ceec1bd248e3a6cb2a52e22dc5", "4d640e8064fe69b4c851dfd43915c431e80c7497", "4e2dc6d6073f0b6d348f84ded52f9143b10344b9", "4ef794980f820d44be94b2f0d53eb34d4241638c", "5017ce0b285351da09a2029ea2cf544f79b593c7", "553b09724bd30d9691b290e157b27a73e2d3e537", "5ee9ea92ed8cc1737b7670e39dab6081c64f2598", "5f4eba626c3f826820c4475d2d81410759ec911b", "61d762d65afb3150e2653d6735068241779c1fcf", "62ff4741eac1821190f6c2cdab7c8a9d7db64ad0", "6910964416438ca8d1698f6295871d727c4d4851", "70e750bb046101fdced06f428e73fee471509c56", "74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b", "769db00b34b9e085dc699c8f1550c95793d0e904", "88b80615ed8561be74a700b92883ec0374ddacb0", "94be9abcf9558213ff301af0ecd8223451ce991d", "9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf", "9e912851eb64e3a1e08fbb587de7a4c897ce5a0a", "9f82d95bd3edfb7f18b1a21d6171170395ea44ce", "a1f000fb8216838aa2a120738cc6c7fef2d1b4d8", "a41e2a548ba51ee47f22baad8e88994853d3e2f5", "a701d39a17a9f48c61a06eee08bd9ac0b8e3838b", "a9c4534552df370f43f0ef97146f393ef2f2a08c", "aafb69fd7496ca617f741d38c40808ff2382aabe", "b6932cb7f59e746899e4804f3d496126d1343615", "be34b8c7857a6c04e41cc06b26338d8e59cb2601", "be44d5e6cc66580f59c108f8bff5911ee91a22e4", 
"bec4c0a31b0b2502f44f34aeb9827cd090cca621", "c313df50bfcaa773dcbe038d00f8bd770ba997f8", "c346f6ff7f42f2a8ff867f92ab83a6721057d86c", "c4a95d5097519dedac437fddf0ef775136081241", "c77e776d22548d47a8d96463a3556172776cd59b", "c875bad563a73a25c5f3379828b161b1441a7c5d", "caef0cb155eb6c55215aa59aabe04a9c702bbe6a", "cb36b894129ca7910bb81c457c72d69d5ff111bc", "d2164061453ecb03d4347a05a77db83f706b8e15", "dafa445964230e808148db043c126063ea1dc9b6", "db9e625ba90056304897a94c92e5d27bc60f112d", "dc3e3ab7fe257d04769528e5e17ad9f1acb44659", "dcba06661c607fe55ec67b1712d153b69f65e38c", "dcddcc32740d2de0e1403e21a5c4ed837b352992", "ddecbc16f4c916c39eacfcb2302e15a9e70a231e", "e326a7bbb5bc00f1d8cacd6108869dedef15569c", "e874cd5967efb1f45282e9f5ce87cc68a898a6d0", "f2afbb94b319ef5d60823859875284afb95dcc18"], "releases": [], "snapshot": "3b8fe58e467deb7597b12a5fd3b2c096b8c02028"} \ No newline at end of file diff --git a/swh/loader/mercurial/tests/data/transplant.json b/swh/loader/mercurial/tests/data/transplant.json new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/data/transplant.json @@ -0,0 +1 @@ +{"directories": ["42a6ed8f073f00bb18114bc6228360283d26aef2", "615b36018bc3f8cc5a94ba34cb6fcd06b8a0cce7", "657ac477edb7ff761b33d1ebe53df29c12e21ca7", "6c3b565c034591f9e1ba9e7197ec6af62ed1ac8c", "96ee448816b927c395aa87a48734a41ab9a801b9", "c321c30480f216b818c32bbf7f0a5c728faa42cd"], "revisions": ["07589281b64120558940e2e38729b0decf16a88a", "2153ae1c0ac7825aa4fbf82647fa6548cb886546", "2e10f90a4e30ce3a07f1a11cc41e007b1def0bc1", "a95327fed1b5d6db6c7d5ad83621cb61f0f5f7d8", "c75e5af8f4b49ebc622d815459c88eda35ab050d", "ee83768fd9aadc306f835fc8f7caadc1cdc0e3df"], "releases": [], "snapshot": "42e9007138d3834723ffddaedea9139edb576036"} \ No newline at end of file diff --git a/swh/loader/mercurial/tests/dvcs_loader_checker.py b/swh/loader/mercurial/tests/dvcs_loader_checker.py new file mode 100644 --- /dev/null +++ b/swh/loader/mercurial/tests/dvcs_loader_checker.py @@ -0,0 +1,132 @@ 
class ExpectedSwhids(NamedTuple):
    """List of the swhids expected from the loader."""

    directories: Set[str]  # hex swhid of the root directory of each revision
    revisions: Set[str]  # hex swhid of each revision
    releases: Set[str]  # hex swhid of each release
    snapshot: str  # hex swhid of the snapshot

    @staticmethod
    def load(path: Path) -> "ExpectedSwhids":
        """Load expected swhids from a json file.

        See `build.py` in the data directory on how to extract that json file
        from an existing repository or archive.

        The file must hold an object with the `directories`, `revisions`,
        `releases` and `snapshot` keys; duplicated identifiers are collapsed.
        """
        # Context manager so the file handle is closed deterministically.
        with open(path) as json_file:
            data = json.load(json_file)

        return ExpectedSwhids(
            directories=set(data["directories"]),
            revisions=set(data["revisions"]),
            releases=set(data["releases"]),
            snapshot=data["snapshot"],
        )
class DVCSLoaderChecker:
    """Check the swhids produced by a DVCSLoader."""

    def __init__(self, loader: DVCSLoader, expected: ExpectedSwhids) -> None:
        self._loader = loader
        self._expected = expected

        # Objects emitted by the loader, recorded by the wrappers below.
        self._directories: List[Directory] = []
        self._revisions: List[Revision] = []
        self._releases: List[Release] = []
        self._snapshot: Snapshot

        self._wrap_get_directories()
        self._wrap_get_revisions()
        self._wrap_get_releases()
        self._wrap_get_snapshot()

    def _wrap_generator(self, method_name, emitted, check):
        """Replace a generator method of the loader with a recording wrapper.

        The wrapper stores every yielded object in `emitted`, yields it
        unchanged, and calls `check` once the original generator is exhausted.
        """
        original = getattr(self._loader, method_name)

        def wrapper(*args, **kwargs):
            for item in original(*args, **kwargs):
                emitted.append(item)
                yield item
            check()

        setattr(self._loader, method_name, wrapper)

    def _wrap_get_directories(self):
        """Replace the original loader's get_directories method to inspect its
        output.
        """
        self._wrap_generator(
            "get_directories", self._directories, self._check_directories
        )

    def _check_directories(self):
        """Check the emitted root directories.

        Only check that the expected directories are present as only
        root directories are expected.
        """
        emitted = {directory.id.hex() for directory in self._directories}
        missing = self._expected.directories - emitted
        assert missing == set()

    def _wrap_get_revisions(self):
        """Replace the original loader's get_revisions method to inspect its
        output.
        """
        self._wrap_generator("get_revisions", self._revisions, self._check_revisions)

    def _check_revisions(self):
        """Check the emitted revisions."""
        emitted = {revision.id.hex() for revision in self._revisions}
        assert self._expected.revisions == emitted

    def _wrap_get_releases(self):
        """Replace the original loader's get_releases method to inspect its
        output.
        """
        self._wrap_generator("get_releases", self._releases, self._check_releases)

    def _check_releases(self):
        """Check the emitted releases."""
        emitted = {release.id.hex() for release in self._releases}
        assert self._expected.releases == emitted

    def _wrap_get_snapshot(self):
        """Replace the original loader's get_snapshot method to inspect its
        output.
        """
        original = self._loader.get_snapshot

        def wrapper(*args, **kwargs):
            # The snapshot is a single value, checked as soon as it is built.
            self._snapshot = original(*args, **kwargs)
            self._check_snapshot()
            return self._snapshot

        self._loader.get_snapshot = wrapper

    def _check_snapshot(self):
        """Check the emitted snapshot."""
        assert self._expected.snapshot == self._snapshot.id.hex()

    def check(self):
        """Check loader's outputs."""
        status = self._loader.load()
        assert status == {"status": "eventful"}
def _identify_hello(datadir: str, tmp_path: str, *command: str):
    """Run `main` on the extracted `hello` archive.

    Returns the command output and the path of the repository.
    """
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    directory = urlsplit(repo_url).path

    result = CliRunner().invoke(main, ["-d", directory, *command])
    return result.output, directory


def test_all_revisions(datadir: str, tmp_path: str):
    output, _ = _identify_hello(datadir, tmp_path, "revision")

    expected_lines = (
        "swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940"
        "\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9",
        "swh:1:rev:8dd3db5d5519e4947f035d141581d304565372d2"
        "\t82e55d328c8ca4ee16520036c0aaace03a5beb65",
        "swh:1:rev:c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27"
        "\tb985ae4a07e12ac662f45a171e2d42b13be5b50c",
    )
    assert output == "".join(f"{line}\n" for line in expected_lines)


def test_single_revision(datadir: str, tmp_path: str):
    output, _ = _identify_hello(
        datadir, tmp_path, "revision", "0a04b987be5ae354b710cefeba0e2d9de7ad41a9"
    )

    expected_lines = (
        "swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940"
        "\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9",
    )
    assert output == "".join(f"{line}\n" for line in expected_lines)


def test_all(datadir: str, tmp_path: str):
    output, directory = _identify_hello(datadir, tmp_path, "all")

    # Directories first, then revisions, the release and the snapshot.
    expected_lines = (
        "swh:1:dir:43d727f2f3f2f7cb3b098ddad1d7038464a4cee2"
        "\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9",
        "swh:1:dir:b3f85f210ff86d334575f64cb01c5bf49895b63e"
        "\t82e55d328c8ca4ee16520036c0aaace03a5beb65",
        "swh:1:dir:8f2be433c945384c85920a8e60f2a68d2c0f20fb"
        "\tb985ae4a07e12ac662f45a171e2d42b13be5b50c",
        "swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940"
        "\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9",
        "swh:1:rev:8dd3db5d5519e4947f035d141581d304565372d2"
        "\t82e55d328c8ca4ee16520036c0aaace03a5beb65",
        "swh:1:rev:c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27"
        "\tb985ae4a07e12ac662f45a171e2d42b13be5b50c",
        "swh:1:rel:515c4d72e089404356d0f4b39d60f948b8999140\t0.1",
        f"swh:1:snp:d35668e02e2ba4321dc951cd308cf883786f918a\t{directory}",
    )
    assert output == "".join(f"{line}\n" for line in expected_lines)


def test_examples(swh_config, datadir, tmp_path):
    # Each example archive has a json file of expected identities next to it.
    for archive_name in ("hello", "transplant", "the-sandbox"):
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        json_path = os.path.join(datadir, f"{archive_name}.json")
        repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

        checker = DVCSLoaderChecker(
            loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path),
        )
        checker.check()