diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -199,11 +199,11 @@
     fdates: Dict[Sha1Git, datetime] = {}  # map {file_id: date}
     while stack:
         current = stack.pop()
-        if current.dbdate is None or current.dbdate > revision.date:
+        if current.dbdate is None or current.dbdate >= revision.date:
             # If current directory has an associated date in the isochrone frontier that
             # is greater or equal to the current revision's one, it should be ignored as
             # the revision is being processed out of order.
-            if current.dbdate is not None and current.dbdate > revision.date:
+            if current.dbdate is not None and current.dbdate >= revision.date:
                 current.invalidate()
 
             # Pre-query all known dates for directories in the current directory
diff --git a/swh/provenance/tests/test_consistency.py b/swh/provenance/tests/test_consistency.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_consistency.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.hashutil import hash_to_bytes
+from swh.provenance.archive import ArchiveInterface
+from swh.provenance.interface import (
+    DirectoryData,
+    ProvenanceInterface,
+    RelationData,
+    RelationType,
+)
+from swh.provenance.model import RevisionEntry
+from swh.provenance.revision import revision_add
+from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt
+
+
+def test_consistency(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    """Check provenance consistency when a revision is processed out of order.
+
+    Scenario, using the "cmdbts2" dataset:
+
+    1. R00 is processed normally.
+    2. Content A/B/C/b is registered directly in the provenance storage with
+       R01's date, without processing R01 itself (simulating a crash).
+    3. R02 is processed; directory C is then expected to be stored with
+       R01's date.
+    4. R01 is processed last; the CNT_EARLY_IN_REV relation for A/B/C/b must
+       then contain exactly R01 at path A/B/C/b.
+    """
+    # Load the test dataset and populate the archive storage with it
+    data = load_repo_data("cmdbts2")
+    fill_storage(archive.storage, data)
+
+    # Index the dataset's revisions by id so each one can be looked up by hash
+    revisions = {rev["id"]: rev for rev in data["revision"]}
+
+    # Process R00 first as expected
+    rev_00 = revisions[hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4")]
+    r00 = RevisionEntry(
+        id=rev_00["id"],
+        date=ts2dt(rev_00["date"]),
+        root=rev_00["directory"],
+    )
+    revision_add(provenance, archive, [r00])
+
+    # Register contents A/B/C/b from R01 in the storage to simulate a crash
+    rev_01 = revisions[hash_to_bytes("1444db96cbd8cd791abe83527becee73d3c64e86")]
+    r01 = RevisionEntry(
+        id=rev_01["id"],
+        date=ts2dt(rev_01["date"]),
+        root=rev_01["directory"],
+    )
+    assert r01.date is not None  # for mypy
+    cnt_b_sha1 = hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3")
+    provenance.storage.content_add({cnt_b_sha1: r01.date})
+
+    # Process R02 (this should set a frontier in directory C)
+    rev_02 = revisions[hash_to_bytes("0d45f1ee524db8f6f0b5a267afac4e733b4b2cee")]
+    r02 = RevisionEntry(
+        id=rev_02["id"],
+        date=ts2dt(rev_02["date"]),
+        root=rev_02["directory"],
+    )
+    revision_add(provenance, archive, [r02])
+
+    # Directory C must now be stored with the date registered for A/B/C/b above
+    dir_C_sha1 = hash_to_bytes("c9cabe7f49012e3fdef6ac6b929efb5654f583cf")
+    assert provenance.storage.directory_get([dir_C_sha1]) == {
+        dir_C_sha1: DirectoryData(r01.date, True)
+    }
+
+    # Process R01 out of order (frontier in C should not be reused to guarantee that the
+    # first occurrence of A/B/C/b is in the CNT_EARLY_IN_REV relation)
+    revision_add(provenance, archive, [r01])
+
+    assert provenance.storage.relation_get(
+        RelationType.CNT_EARLY_IN_REV, [cnt_b_sha1]
+    ) == {cnt_b_sha1: {RelationData(r01.id, b"A/B/C/b")}}