diff --git a/swh/loader/mercurial/tests/data/hello.json b/swh/loader/mercurial/tests/data/hello.json new file mode 100644 index 0000000..065f442 --- /dev/null +++ b/swh/loader/mercurial/tests/data/hello.json @@ -0,0 +1 @@ +{"directories": ["43d727f2f3f2f7cb3b098ddad1d7038464a4cee2", "8f2be433c945384c85920a8e60f2a68d2c0f20fb", "b3f85f210ff86d334575f64cb01c5bf49895b63e"], "revisions": ["8dd3db5d5519e4947f035d141581d304565372d2", "93b48d515580522a05f389bec93227fc8e43d940", "c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27"], "releases": ["515c4d72e089404356d0f4b39d60f948b8999140"], "snapshot": "d35668e02e2ba4321dc951cd308cf883786f918a"} \ No newline at end of file diff --git a/swh/loader/mercurial/tests/data/the-sandbox.json b/swh/loader/mercurial/tests/data/the-sandbox.json new file mode 100644 index 0000000..0e9dbf6 --- /dev/null +++ b/swh/loader/mercurial/tests/data/the-sandbox.json @@ -0,0 +1 @@ +{"directories": ["180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", 
"180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "180bd57623a7c2c47a8c43514a5f4d903503d0aa", "9cd8160c67ac4b0bc97e2e2cd918a580425167d3", "e2e117569b086ceabeeedee4acd95f35298d4553"], "revisions": ["17a62618eb6e91a1d5d8e1246ccedae020d3b222", "18012a93d5aadc331c468dac84b524430f4abc19", "1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0", "24f45e41637240b7f9e16d2791b5eacb4a406d0f", "25f5b27dfa5ed15d336188ef46bef743d88327d4", "2652147529269778757d96e09aaf081695548218", "2973e5dc9568ac491b198f6b7f10c44ddc04e0a3", "2d4a801c9a9645fcd3a9f4c06418d8393206b1f3", "31cd7c5f669868651c57e3a2ba25ac45f76fa5cf", "32eb0354a660128e205bf7c3a84b46040ef70d92", "34192ceef239b8b72141efcc58b1d7f1676a18c9", 
"3565e7d385af0745ec208d719e469c2f58be8e94", "3ed4b85d30401fe32ae3b1d650f215a588293a9e", "40def747398c76ceec1bd248e3a6cb2a52e22dc5", "4d640e8064fe69b4c851dfd43915c431e80c7497", "4e2dc6d6073f0b6d348f84ded52f9143b10344b9", "4ef794980f820d44be94b2f0d53eb34d4241638c", "5017ce0b285351da09a2029ea2cf544f79b593c7", "553b09724bd30d9691b290e157b27a73e2d3e537", "5ee9ea92ed8cc1737b7670e39dab6081c64f2598", "5f4eba626c3f826820c4475d2d81410759ec911b", "61d762d65afb3150e2653d6735068241779c1fcf", "62ff4741eac1821190f6c2cdab7c8a9d7db64ad0", "6910964416438ca8d1698f6295871d727c4d4851", "70e750bb046101fdced06f428e73fee471509c56", "74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b", "769db00b34b9e085dc699c8f1550c95793d0e904", "88b80615ed8561be74a700b92883ec0374ddacb0", "94be9abcf9558213ff301af0ecd8223451ce991d", "9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf", "9e912851eb64e3a1e08fbb587de7a4c897ce5a0a", "9f82d95bd3edfb7f18b1a21d6171170395ea44ce", "a1f000fb8216838aa2a120738cc6c7fef2d1b4d8", "a41e2a548ba51ee47f22baad8e88994853d3e2f5", "a701d39a17a9f48c61a06eee08bd9ac0b8e3838b", "a9c4534552df370f43f0ef97146f393ef2f2a08c", "aafb69fd7496ca617f741d38c40808ff2382aabe", "b6932cb7f59e746899e4804f3d496126d1343615", "be34b8c7857a6c04e41cc06b26338d8e59cb2601", "be44d5e6cc66580f59c108f8bff5911ee91a22e4", "bec4c0a31b0b2502f44f34aeb9827cd090cca621", "c313df50bfcaa773dcbe038d00f8bd770ba997f8", "c346f6ff7f42f2a8ff867f92ab83a6721057d86c", "c4a95d5097519dedac437fddf0ef775136081241", "c77e776d22548d47a8d96463a3556172776cd59b", "c875bad563a73a25c5f3379828b161b1441a7c5d", "caef0cb155eb6c55215aa59aabe04a9c702bbe6a", "cb36b894129ca7910bb81c457c72d69d5ff111bc", "d2164061453ecb03d4347a05a77db83f706b8e15", "dafa445964230e808148db043c126063ea1dc9b6", "db9e625ba90056304897a94c92e5d27bc60f112d", "dc3e3ab7fe257d04769528e5e17ad9f1acb44659", "dcba06661c607fe55ec67b1712d153b69f65e38c", "dcddcc32740d2de0e1403e21a5c4ed837b352992", "ddecbc16f4c916c39eacfcb2302e15a9e70a231e", "e326a7bbb5bc00f1d8cacd6108869dedef15569c", 
"e874cd5967efb1f45282e9f5ce87cc68a898a6d0", "f2afbb94b319ef5d60823859875284afb95dcc18"], "releases": [], "snapshot": "3b8fe58e467deb7597b12a5fd3b2c096b8c02028"}
\ No newline at end of file
diff --git a/swh/loader/mercurial/tests/data/transplant.json b/swh/loader/mercurial/tests/data/transplant.json
new file mode 100644
index 0000000..6bb7651
--- /dev/null
+++ b/swh/loader/mercurial/tests/data/transplant.json
@@ -0,0 +1 @@
+{"directories": ["42a6ed8f073f00bb18114bc6228360283d26aef2", "615b36018bc3f8cc5a94ba34cb6fcd06b8a0cce7", "657ac477edb7ff761b33d1ebe53df29c12e21ca7", "6c3b565c034591f9e1ba9e7197ec6af62ed1ac8c", "96ee448816b927c395aa87a48734a41ab9a801b9", "c321c30480f216b818c32bbf7f0a5c728faa42cd"], "revisions": ["07589281b64120558940e2e38729b0decf16a88a", "2153ae1c0ac7825aa4fbf82647fa6548cb886546", "2e10f90a4e30ce3a07f1a11cc41e007b1def0bc1", "a95327fed1b5d6db6c7d5ad83621cb61f0f5f7d8", "c75e5af8f4b49ebc622d815459c88eda35ab050d", "ee83768fd9aadc306f835fc8f7caadc1cdc0e3df"], "releases": [], "snapshot": "42e9007138d3834723ffddaedea9139edb576036"}
\ No newline at end of file
diff --git a/swh/loader/mercurial/tests/loader_checker.py b/swh/loader/mercurial/tests/loader_checker.py
new file mode 100644
index 0000000..de69707
--- /dev/null
+++ b/swh/loader/mercurial/tests/loader_checker.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+from pathlib import Path
+from typing import NamedTuple, Set
+
+from swh.loader.core.loader import BaseLoader
+from swh.model.hashutil import hash_to_bytes
+
+
+class ExpectedSwhids(NamedTuple):
+    """Lists of the swhids expected from the loader."""
+
+    directories: Set[str]
+    """Hex swhid of the root directory of each revision."""
+
+    revisions: Set[str]
+    """Hex swhid of each revision."""
+
+    releases: Set[str]
+    """Hex swhid of each release."""
+
+    snapshot: str
+    """Hex swhid of the snapshot."""
+
+    @staticmethod
+    def load(path: Path) -> "ExpectedSwhids":
+        """Load expected swhids from a json file.
+
+        The file must hold an object with the keys ``directories``,
+        ``revisions``, ``releases`` (lists of hex ids, deduplicated on load)
+        and ``snapshot`` (a single hex id).
+
+        See `build.py` in the data directory on how to extract that json file
+        from an existing repository or archive.
+        """
+        # Close the file deterministically instead of leaking the handle.
+        with open(path) as json_file:
+            data = json.load(json_file)
+        return ExpectedSwhids(
+            directories=set(data["directories"]),
+            revisions=set(data["revisions"]),
+            releases=set(data["releases"]),
+            snapshot=data["snapshot"],
+        )
+
+
+class LoaderChecker:
+    """Check the swhids produced by a BaseLoader."""
+
+    def __init__(self, loader: BaseLoader, expected: ExpectedSwhids) -> None:
+        """Keep the loader to run and the swhids it is expected to produce."""
+        self._loader = loader
+        self._expected = expected
+
+    def check(self) -> None:
+        """Run the loader and check its outputs.
+
+        Asserts that the load is eventful, that no expected directory,
+        revision or release is reported missing by the loader's storage and
+        that ``snapshot_get`` returns something for the expected snapshot.
+        """
+        assert self._loader.load() == {"status": "eventful"}
+
+        missing_directories = self._loader.storage.directory_missing(
+            [hash_to_bytes(swhid) for swhid in self._expected.directories]
+        )
+        assert list(missing_directories) == []
+
+        missing_revisions = self._loader.storage.revision_missing(
+            [hash_to_bytes(swhid) for swhid in self._expected.revisions]
+        )
+        assert list(missing_revisions) == []
+
+        missing_releases = self._loader.storage.release_missing(
+            [hash_to_bytes(swhid) for swhid in self._expected.releases]
+        )
+        assert list(missing_releases) == []
+
+        snapshot = self._loader.storage.snapshot_get(
+            hash_to_bytes(self._expected.snapshot)
+        )
+        assert snapshot is not None
diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py
index b36fc34..9ca6e7c 100644
--- a/swh/loader/mercurial/tests/test_loader.py
+++ b/swh/loader/mercurial/tests/test_loader.py
@@ -1,298 +1,310 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import copy
import logging
import os
import time
import hglib
import pytest

from swh.loader.tests import (
    assert_last_visit_matches,
    check_snapshot,
    get_stats,
    prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.snapshot import snapshot_get_latest

from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader
+from .loader_checker import ExpectedSwhids, LoaderChecker
+
+
+def test_examples(swh_config, datadir, tmp_path):
+    for archive_name in ("hello", "transplant", "the-sandbox"):
+        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+        json_path = os.path.join(datadir, f"{archive_name}.json")
+        repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+        LoaderChecker(
+            loader=HgBundle20Loader(repo_url), expected=ExpectedSwhids.load(json_path),
+        ).check()


def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "the-sandbox"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    loader = HgBundle20Loader(repo_url)

    assert loader.load() == {"status": "eventful"}

    tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c"
    tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56"
    expected_snapshot = Snapshot(
        id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"),
        branches={
            b"develop": SnapshotBranch(
                target=hash_to_bytes(tip_revision_develop),
                target_type=TargetType.REVISION,
            ),
            b"default": SnapshotBranch(
                target=hash_to_bytes(tip_revision_default),
                target_type=TargetType.REVISION,
            ),
            b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,),
        },
    )

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )
    check_snapshot(expected_snapshot, loader.storage)

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 2,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 58,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # Ensure archive loader yields the same snapshot
    loader2 = HgArchiveBundle20Loader(
        url=archive_path,
        archive_path=archive_path,
        visit_date="2016-05-03 15:16:32+00",
    )

    actual_load_status = loader2.load()
    assert actual_load_status == {"status": "eventful"}

    stats2 = get_stats(loader2.storage)
    expected_stats = copy.deepcopy(stats)
    expected_stats["origin"] += 1
    expected_stats["origin_visit"] += 1
    assert stats2 == expected_stats

    # That visit yields the same snapshot
    assert_last_visit_matches(
        loader2.storage,
        archive_path,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )


def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path):
    """Eventful visit with release should yield 1 snapshot"""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",)

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # then
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # cf. test_loader.org for an explanation of where those hashes come from
    tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140")
    release = loader.storage.release_get([tip_release])[0]
    assert release is not None

    tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27")
    revision = loader.storage.revision_get([tip_revision_default])[0]
    assert revision is not None

    expected_snapshot = Snapshot(
        id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"),
        branches={
            b"default": SnapshotBranch(
                target=tip_revision_default, target_type=TargetType.REVISION,
            ),
            b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,),
            b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,),
        },
    )

    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        type=RevisionType.MERCURIAL.value,
        status="full",
        snapshot=expected_snapshot.id,
    )

    # Ensure archive loader yields the same snapshot
    loader2 = HgArchiveBundle20Loader(
        url=archive_path,
        archive_path=archive_path,
        visit_date="2016-05-03 15:16:32+00",
    )

    actual_load_status = loader2.load()
    assert actual_load_status == {"status": "eventful"}

    stats2 = get_stats(loader2.storage)
    expected_stats = copy.deepcopy(stats)
    expected_stats["origin"] += 1
    expected_stats["origin_visit"] += 1
    assert stats2 == expected_stats

    # That visit yields the same snapshot
    assert_last_visit_matches(
        loader2.storage,
        archive_path,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )


def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir):
    """Failure to decompress should fail early, no data is ingested"""
    mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib")
    mock_patoo.side_effect = ValueError

    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    # Archive loader whose load is expected to fail (see assertions below)
    loader = HgArchiveBundle20Loader(
        url=archive_path, visit_date="2016-05-03 15:16:32+00",
    )

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "failed"}

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 0,
    }
    # The visit is recorded as partial, without any snapshot
    assert_last_visit_matches(
        loader.storage, archive_path, status="partial", type="hg", snapshot=None
    )


def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path):
    """Visit a mercurial repository with transplant operations within should yield a
    snapshot as well.

    """
    archive_name = "transplant"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",)

    # load hg repository
    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # collect swh revisions
    assert_last_visit_matches(
        loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full"
    )

    revisions = []
    snapshot = snapshot_get_latest(loader.storage, repo_url)
    for branch in snapshot.branches.values():
        if branch.target_type.value != "revision":
            continue
        revisions.append(branch.target)

    # extract original changesets info and the transplant sources
    hg_changesets = set()
    transplant_sources = set()
    for rev in loader.storage.revision_log(revisions):
        hg_changesets.add(rev["metadata"]["node"])
        for k, v in rev["extra_headers"]:
            if k == b"transplant_source":
                transplant_sources.add(v.decode("ascii"))

    # check extracted data are valid
    assert len(hg_changesets) > 0
    assert len(transplant_sources) > 0
    assert transplant_sources.issubset(hg_changesets)


def test_clone_with_timeout_timeout(caplog, tmp_path, monkeypatch):
    log = logging.getLogger("test_clone_with_timeout")

    def clone_timeout(source, dest):
        time.sleep(60)

    monkeypatch.setattr(hglib, "clone", clone_timeout)

    with pytest.raises(CloneTimeoutError):
        HgBundle20Loader.clone_with_timeout(
            log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1
        )

    for record in caplog.records:
        assert record.levelname == "WARNING"
        assert "https://www.mercurial-scm.org/repo/hello" in record.getMessage()
        assert record.args == ("https://www.mercurial-scm.org/repo/hello", 1)


def test_clone_with_timeout_returns(caplog, tmp_path, monkeypatch):
    log = logging.getLogger("test_clone_with_timeout")

    def clone_return(source, dest):
        return (source, dest)

    monkeypatch.setattr(hglib, "clone", clone_return)

    assert HgBundle20Loader.clone_with_timeout(
        log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1
    ) == ("https://www.mercurial-scm.org/repo/hello", tmp_path)


def test_clone_with_timeout_exception(caplog, tmp_path, monkeypatch):
    log = logging.getLogger("test_clone_with_timeout")

    def clone_return(source, dest):
        raise ValueError("Test exception")

    monkeypatch.setattr(hglib, "clone", clone_return)

    with pytest.raises(ValueError) as excinfo:
        HgBundle20Loader.clone_with_timeout(
            log, "https://www.mercurial-scm.org/repo/hello", tmp_path, 1
        )
    assert "Test exception" in excinfo.value.args[0]