# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""
Unit tests for the mongo backend
"""

from datetime import datetime

import pymongo.database
import pytest

from swh.provenance.mongo.backend import ProvenanceStorageMongoDb


class TestDBIndex:
    """Placeholders for the mongo index tests; none is implemented yet."""

    # FIXME, test datamodel and data types

    def test_content_sha1_unique_index(self, mongodb):
        # These tests are specific to mongo indexes
        pass

    def test_content_sha1_ts_combination_index(self, mongodb):
        # These tests are specific to mongo indexes
        pass

    def test_directory_sha1_unique_index(self, mongodb):
        # These tests are specific to mongo indexes
        pass

    def test_directory_sha1_ts_combination_index(self, mongodb):
        # These tests are specific to mongo indexes
        pass

    def test_origin_sha1_unique_index(self, mongodb):
        # These tests are specific to mongo indexes
        pass


class TestBackend:
    """Tests of ProvenanceStorageMongoDb against the ``mongodb`` fixture.

    Documents are seeded and inspected directly through the fixture's
    collections (``content``, ``directory``, ``origin``), so every
    expectation is expressed in terms of the stored documents.

    The hard-coded ``ts`` values are epoch seconds: 1286661600 falls in
    October 2010, 1631881748 and 1631889655 in September 2021. They are only
    ever compared with dates years apart, so the local timezone used by
    ``datetime(...).timestamp()`` does not affect the outcome.
    """

    @pytest.fixture
    def backend(self, mongodb: pymongo.database.Database):
        """Return a backend instance operating on the test database."""
        return ProvenanceStorageMongoDb(mongodb)

    # add content tests

    def test_add_content_empty(self, backend, mongodb):
        """Adding nothing succeeds and stores nothing (dict and list input)."""
        assert backend.content_add({}) is True
        assert mongodb.content.count_documents({}) == 0

        assert backend.content_add([]) is True
        assert mongodb.content.count_documents({}) == 0

    def test_add_content_with_insert(self, backend, mongodb):
        """A new sha1 is inserted with empty relation fields."""
        # add data using content_add
        # get data from mongo and compare
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        data = {sha1: None}
        assert backend.content_add(data) is True

        assert mongodb.content.count_documents({}) == 1
        cnt = mongodb.content.find_one({"sha1": sha1})
        assert cnt["ts"] is None
        assert cnt["revision"] == {}
        assert cnt["directory"] == {}

    def test_add_content_with_update_later_date(self, backend, mongodb):
        """A stored date later than the given one is replaced."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.content.insert_one(
            {"sha1": sha1, "ts": 1631881748, "revision": revision, "directory": {}}
        )

        new_date = datetime(2010, 10, 10)
        data = {sha1: new_date}
        # data has a date earlier than the one in the db
        assert backend.content_add(data) is True
        cnt = mongodb.content.find_one({"sha1": sha1})
        assert cnt["ts"] == new_date.timestamp()
        # the other fields of the document must be left untouched
        assert cnt["revision"] == revision
        assert cnt["directory"] == {}

    def test_add_content_with_update_none_date(self, backend, mongodb):
        """A stored ``None`` date is replaced by the given date."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.content.insert_one(
            {"sha1": sha1, "ts": None, "revision": revision, "directory": {}}
        )

        new_date = datetime(2010, 10, 10)
        data = {sha1: new_date}
        # the db has no date yet, so the given one must be stored
        assert backend.content_add(data) is True
        cnt = mongodb.content.find_one({"sha1": sha1})
        assert cnt["ts"] == new_date.timestamp()
        assert cnt["revision"] == revision
        assert cnt["directory"] == {}

    def test_add_content_do_not_update_older_date(self, backend, mongodb):
        """A stored date older than the given one is kept."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.content.insert_one(
            {"sha1": sha1, "ts": 1286661600, "revision": revision, "directory": {}}
        )

        new_date = datetime(2020, 10, 10)
        data = {sha1: new_date}
        # data has a date later than the one in the db
        assert backend.content_add(data) is True
        cnt = mongodb.content.find_one({"sha1": sha1})
        assert cnt["ts"] == 1286661600

    def test_add_content_multiple(self, backend, mongodb):
        """One call mixing the no-update, update and insert cases."""
        sha1_1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        sha1_2 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        sha1_3 = "109f4b3c50d7b0df729d299bc6f8e9ef9066971f"
        sha1_4 = "3ebfa301dc59196f18593c45e519287a23297589"
        revision = {"test": "test"}
        mongodb.content.insert_one(
            {"sha1": sha1_1, "ts": 1286661600, "revision": revision, "directory": {}}
        )
        mongodb.content.insert_one(
            {"sha1": sha1_2, "ts": None, "revision": revision, "directory": {}}
        )
        mongodb.content.insert_one(
            {"sha1": sha1_3, "ts": 1631889655, "revision": revision, "directory": {}}
        )

        data = {
            sha1_1: datetime(2020, 10, 10),  # given date is in future, no update
            sha1_2: datetime(2020, 10, 10),  # will update None date
            sha1_3: datetime(2010, 10, 10),  # date in the past, will update
            sha1_4: datetime(2010, 10, 10),  # new rcd, will insert
        }

        assert backend.content_add(data) is True
        # three seeded documents plus the newly inserted sha1_4
        assert mongodb.content.count_documents({}) == 4
        cnt = mongodb.content.find_one({"sha1": sha1_1})
        assert cnt["ts"] == 1286661600
        assert cnt["revision"] == revision
        assert cnt["directory"] == {}

        cnt = mongodb.content.find_one({"sha1": sha1_2})
        assert cnt["ts"] == datetime(2020, 10, 10).timestamp()
        assert cnt["revision"] == revision
        assert cnt["directory"] == {}

        cnt = mongodb.content.find_one({"sha1": sha1_3})
        assert cnt["ts"] == datetime(2010, 10, 10).timestamp()
        assert cnt["revision"] == revision
        assert cnt["directory"] == {}

        cnt = mongodb.content.find_one({"sha1": sha1_4})
        assert cnt["ts"] == datetime(2010, 10, 10).timestamp()
        assert cnt["revision"] == {}
        assert cnt["directory"] == {}

    # add directory tests

    def test_add_directory_empty(self, backend, mongodb):
        """Adding nothing succeeds and stores nothing (dict and list input)."""
        assert backend.directory_add({}) is True
        assert mongodb.directory.count_documents({}) == 0

        assert backend.directory_add([]) is True
        assert mongodb.directory.count_documents({}) == 0

    def test_add_directory_with_insert(self, backend, mongodb):
        """A new sha1 is inserted with an empty revision field."""
        # add data using directory_add
        # get data from mongo and compare
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        data = {sha1: None}
        assert backend.directory_add(data) is True

        assert mongodb.directory.count_documents({}) == 1
        directory = mongodb.directory.find_one({"sha1": sha1})
        assert directory["ts"] is None
        assert directory["revision"] == {}

    def test_add_directory_with_update_later_date(self, backend, mongodb):
        """A stored date later than the given one is replaced."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.directory.insert_one(
            {"sha1": sha1, "ts": 1631881748, "revision": revision}
        )

        new_date = datetime(2010, 10, 10)
        data = {sha1: new_date}
        # data has a date earlier than the one in the db
        assert backend.directory_add(data) is True
        directory = mongodb.directory.find_one({"sha1": sha1})
        assert directory["ts"] == new_date.timestamp()
        assert directory["revision"] == revision

    def test_add_directory_with_update_none_date(self, backend, mongodb):
        """A stored ``None`` date is replaced by the given date."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.directory.insert_one({"sha1": sha1, "ts": None, "revision": revision})

        new_date = datetime(2010, 10, 10)
        data = {sha1: new_date}
        # the db has no date yet, so the given one must be stored
        assert backend.directory_add(data) is True
        directory = mongodb.directory.find_one({"sha1": sha1})
        assert directory["ts"] == new_date.timestamp()
        assert directory["revision"] == revision

    def test_add_directory_do_not_update_older_date(self, backend, mongodb):
        """A stored date older than the given one is kept."""
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        revision = {"test": "test"}
        mongodb.directory.insert_one(
            {"sha1": sha1, "ts": 1286661600, "revision": revision}
        )

        new_date = datetime(2020, 10, 10)
        data = {sha1: new_date}
        # data has a date later than the one in the db
        assert backend.directory_add(data) is True
        directory = mongodb.directory.find_one({"sha1": sha1})
        assert directory["ts"] == 1286661600

    def test_add_directory_multiple(self, backend, mongodb):
        """One call mixing the no-update, update and insert cases."""
        sha1_1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        sha1_2 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        sha1_3 = "109f4b3c50d7b0df729d299bc6f8e9ef9066971f"
        sha1_4 = "3ebfa301dc59196f18593c45e519287a23297589"
        revision = {"test": "test"}
        mongodb.directory.insert_one(
            {"sha1": sha1_1, "ts": 1286661600, "revision": revision}
        )
        mongodb.directory.insert_one({"sha1": sha1_2, "ts": None, "revision": revision})
        mongodb.directory.insert_one(
            {"sha1": sha1_3, "ts": 1631889655, "revision": revision}
        )

        data = {
            sha1_1: datetime(2020, 10, 10),  # given date is in future, no update
            sha1_2: datetime(2020, 10, 10),  # will update None date
            sha1_3: datetime(2010, 10, 10),  # date in the past, will update
            sha1_4: datetime(2010, 10, 10),  # new rcd, will insert
        }

        assert backend.directory_add(data) is True
        # three seeded documents plus the newly inserted sha1_4
        assert mongodb.directory.count_documents({}) == 4
        directory = mongodb.directory.find_one({"sha1": sha1_1})
        assert directory["ts"] == 1286661600
        assert directory["revision"] == revision

        directory = mongodb.directory.find_one({"sha1": sha1_2})
        assert directory["ts"] == datetime(2020, 10, 10).timestamp()
        assert directory["revision"] == revision

        directory = mongodb.directory.find_one({"sha1": sha1_3})
        assert directory["ts"] == datetime(2010, 10, 10).timestamp()
        assert directory["revision"] == revision

        directory = mongodb.directory.find_one({"sha1": sha1_4})
        assert directory["ts"] == datetime(2010, 10, 10).timestamp()
        assert directory["revision"] == {}

    # content_get tests

    def test_content_get_empty_ids(self, backend, mongodb):
        """Requesting no ids yields an empty mapping."""
        results = backend.content_get([])
        assert results == {}

    def test_content_get(self, backend, mongodb):
        """Only the contents that have a date are returned."""
        # FIXME, add a entity_add method in the interface to
        # make this backend agnostic (now using mongo insert)
        sha1_1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        sha1_2 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        sha1_3 = "109f4b3c50d7b0df729d299bc6f8e9ef9066971f"
        data1 = {"sha1": sha1_1, "ts": datetime(2010, 10, 8).timestamp()}
        data2 = {"sha1": sha1_2, "ts": datetime(2020, 8, 20).timestamp()}

        # This has None for ts, will not be returned from content_get
        data3 = {"sha1": sha1_3, "ts": None}
        mongodb.content.insert_one(data1)
        mongodb.content.insert_one(data2)
        mongodb.content.insert_one(data3)
        results = backend.content_get([sha1_1, sha1_2, sha1_3])
        assert len(results) == 2
        assert set(results) == {sha1_1, sha1_2}
        # NOTE(review): the returned values are not asserted because their
        # type (raw timestamp or datetime) is not visible from this file;
        # add value checks once confirmed against the backend.

    # directory_get tests

    def test_directory_get_empty_ids(self, backend, mongodb):
        """Requesting no ids yields an empty mapping."""
        results = backend.directory_get([])
        assert results == {}

    def test_directory_get(self, backend, mongodb):
        """Only the directories that have a date are returned."""
        # FIXME, add a entity_add method in the interface to
        # make this backend agnostic (now using mongo insert)
        sha1_1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        sha1_2 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        sha1_3 = "109f4b3c50d7b0df729d299bc6f8e9ef9066971f"
        data1 = {"sha1": sha1_1, "ts": datetime(2010, 10, 8).timestamp()}
        data2 = {"sha1": sha1_2, "ts": datetime(2020, 8, 20).timestamp()}

        # This has None for ts, will not be returned from directory_get
        data3 = {"sha1": sha1_3, "ts": None}
        mongodb.directory.insert_one(data1)
        mongodb.directory.insert_one(data2)
        mongodb.directory.insert_one(data3)
        results = backend.directory_get([sha1_1, sha1_2, sha1_3])
        assert len(results) == 2
        assert set(results) == {sha1_1, sha1_2}
        # NOTE(review): the returned values are not asserted because their
        # type (raw timestamp or datetime) is not visible from this file;
        # add value checks once confirmed against the backend.

    # location_add tests

    def test_location_add(self, backend):
        """Adding no location succeeds."""
        # FIXME, this will change with model change
        assert backend.location_add([]) is True

    # origin_add tests

    def test_origin_add_empty(self, backend, mongodb):
        """Adding no origin succeeds and stores nothing."""
        # NOTE(review): the sibling origin tests pass a dict; confirm that
        # origin_add also accepts an empty list.
        assert backend.origin_add([]) is True
        assert mongodb.origin.count_documents({}) == 0

    def test_origin_add_new(self, backend, mongodb):
        """Every unknown sha1 results in one new origin document."""
        sha1_1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        sha1_2 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        data = {sha1_1: "1.example.com", sha1_2: "2.example.com"}
        assert backend.origin_add(data) is True
        assert mongodb.origin.count_documents({}) == 2

    def test_origin_add_skip_existing(self, backend, mongodb):
        """An already known sha1 is neither duplicated nor updated."""
        # sending an existing hash will not add or update any record
        sha1 = "cf23df2207d99a74fbe169e3eba035e633b65d94"
        mongodb.origin.insert_one({"sha1": sha1, "url": "example.com"})

        sha1_1 = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        data = {sha1: "1.example.com", sha1_1: "2.example.com"}
        assert backend.origin_add(data) is True
        # the seeded document plus the single new one
        assert mongodb.origin.count_documents({}) == 2
        # find_one returns the document itself; find() would return a cursor
        origin = mongodb.origin.find_one({"sha1": sha1})
        assert origin["url"] == "example.com"  # not 1.example.com