Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/tests/storage/test_storage.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import inspect | import inspect | ||||
import math | import math | ||||
import threading | import threading | ||||
from typing import Any, Dict, List, Tuple, Union | from typing import Any, Dict, List, Tuple, Union | ||||
import pytest | import pytest | ||||
from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException | from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException | ||||
from swh.indexer.storage.interface import IndexerStorageInterface | from swh.indexer.storage.interface import IndexerStorageInterface | ||||
from swh.indexer.storage.model import BaseRow, ContentMimetypeRow | from swh.indexer.storage.model import BaseRow, ContentLicenseRow, ContentMimetypeRow | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
def prepare_mimetypes_from(fossology_licenses: List[Dict]) -> List[ContentMimetypeRow]: | def prepare_mimetypes_from_licenses( | ||||
fossology_licenses: List[ContentLicenseRow], | |||||
) -> List[ContentMimetypeRow]: | |||||
"""Fossology license needs some consistent data in db to run. | """Fossology license needs some consistent data in db to run. | ||||
""" | """ | ||||
mimetypes = [] | mimetypes = [] | ||||
for c in fossology_licenses: | for c in fossology_licenses: | ||||
mimetypes.append( | mimetypes.append( | ||||
ContentMimetypeRow( | ContentMimetypeRow( | ||||
id=c["id"], | id=c.id, | ||||
mimetype="text/plain", # for filtering on textual data to work | mimetype="text/plain", # for filtering on textual data to work | ||||
encoding="utf-8", | encoding="utf-8", | ||||
indexer_configuration_id=c["indexer_configuration_id"], | indexer_configuration_id=c.indexer_configuration_id, | ||||
) | ) | ||||
) | ) | ||||
return mimetypes | return mimetypes | ||||
def endpoint_name(etype: str, ename: str) -> str: | def endpoint_name(etype: str, ename: str) -> str: | ||||
"""Compute the storage's endpoint's name | """Compute the storage's endpoint's name | ||||
▲ Show 20 Lines • Show All 960 Lines • ▼ Show 20 Lines | ) -> None: | ||||
[{"id": data.sha1_2, "indexer_configuration_id": tool["id"],}] | [{"id": data.sha1_2, "indexer_configuration_id": tool["id"],}] | ||||
) | ) | ||||
class TestIndexerStorageContentFossologyLicense: | class TestIndexerStorageContentFossologyLicense: | ||||
endpoint_type = "content_fossology_license" | endpoint_type = "content_fossology_license" | ||||
tool_name = "nomos" | tool_name = "nomos" | ||||
row_from_dict = ContentLicenseRow.from_dict | |||||
dict_from_row = staticmethod(lambda x: x.to_dict()) | |||||
def test_content_fossology_license_add__new_license_added( | def test_content_fossology_license_add__new_license_added( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# given | # given | ||||
tool = data.tools["nomos"] | tool = data.tools["nomos"] | ||||
tool_id = tool["id"] | tool_id = tool["id"] | ||||
license_v1 = { | license1 = ContentLicenseRow( | ||||
"id": data.sha1_1, | id=data.sha1_1, license="Apache-2.0", indexer_configuration_id=tool_id, | ||||
"licenses": ["Apache-2.0"], | ) | ||||
"indexer_configuration_id": tool_id, | |||||
} | |||||
# given | # given | ||||
storage.content_fossology_license_add([license_v1]) | storage.content_fossology_license_add([license1]) | ||||
# conflict does nothing | # conflict does nothing | ||||
storage.content_fossology_license_add([license_v1]) | storage.content_fossology_license_add([license1]) | ||||
# when | # when | ||||
actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) | actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) | ||||
# then | # then | ||||
expected_license = {data.sha1_1: [{"licenses": ["Apache-2.0"], "tool": tool,}]} | expected_licenses = [ | ||||
assert actual_licenses == [expected_license] | ContentLicenseRow(id=data.sha1_1, license="Apache-2.0", tool=tool,) | ||||
] | |||||
assert actual_licenses == expected_licenses | |||||
# given | # given | ||||
license_v2 = license_v1.copy() | license2 = ContentLicenseRow( | ||||
license_v2.update( | id=data.sha1_1, license="BSD-2-Clause", indexer_configuration_id=tool_id, | ||||
{"licenses": ["BSD-2-Clause"],} | |||||
) | ) | ||||
storage.content_fossology_license_add([license_v2]) | storage.content_fossology_license_add([license2]) | ||||
actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) | actual_licenses = list(storage.content_fossology_license_get([data.sha1_1])) | ||||
expected_license = { | expected_licenses.append( | ||||
data.sha1_1: [{"licenses": ["Apache-2.0", "BSD-2-Clause"], "tool": tool}] | ContentLicenseRow(id=data.sha1_1, license="BSD-2-Clause", tool=tool,) | ||||
} | ) | ||||
# license did not change as the v2 was dropped. | # first license was not removed when the second one was added | ||||
assert actual_licenses == [expected_license] | assert sorted(actual_licenses) == sorted(expected_licenses) | ||||
def test_generate_content_fossology_license_get_partition_failure( | def test_generate_content_fossology_license_get_partition_failure( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
"""get_partition call with wrong limit input should fail""" | """get_partition call with wrong limit input should fail""" | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
indexer_configuration_id = 42 | indexer_configuration_id = 42 | ||||
with pytest.raises( | with pytest.raises( | ||||
IndexerStorageArgumentException, match="limit should not be None" | IndexerStorageArgumentException, match="limit should not be None" | ||||
): | ): | ||||
storage.content_fossology_license_get_partition( | storage.content_fossology_license_get_partition( | ||||
indexer_configuration_id, 0, 3, limit=None, # type: ignore | indexer_configuration_id, 0, 3, limit=None, # type: ignore | ||||
) | ) | ||||
def test_generate_content_fossology_license_get_partition_no_limit( | def test_generate_content_fossology_license_get_partition_no_limit( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
"""get_partition should return results""" | """get_partition should return results""" | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0]["indexer_configuration_id"] | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes, conflict_update=True) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = set([c["id"] for c in fossology_licenses]) | expected_ids = set([c.id for c in fossology_licenses]) | ||||
assert len(fossology_licenses) == 10 | assert len(fossology_licenses) == 10 | ||||
assert len(mimetypes) == 10 | assert len(mimetypes) == 10 | ||||
nb_partitions = 4 | nb_partitions = 4 | ||||
actual_ids = [] | actual_ids = [] | ||||
for partition_id in range(nb_partitions): | for partition_id in range(nb_partitions): | ||||
Show All 11 Lines | def test_generate_content_fossology_license_get_partition_full( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
"""get_partition for a single partition should return available ids | """get_partition for a single partition should return available ids | ||||
""" | """ | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0]["indexer_configuration_id"] | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes, conflict_update=True) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = set([c["id"] for c in fossology_licenses]) | expected_ids = set([c.id for c in fossology_licenses]) | ||||
actual_result = storage.content_fossology_license_get_partition( | actual_result = storage.content_fossology_license_get_partition( | ||||
indexer_configuration_id, 0, 1 | indexer_configuration_id, 0, 1 | ||||
) | ) | ||||
assert actual_result.next_page_token is None | assert actual_result.next_page_token is None | ||||
actual_ids = actual_result.results | actual_ids = actual_result.results | ||||
assert len(set(actual_ids)) == len(expected_ids) | assert len(set(actual_ids)) == len(expected_ids) | ||||
for actual_id in actual_ids: | for actual_id in actual_ids: | ||||
assert actual_id in expected_ids | assert actual_id in expected_ids | ||||
def test_generate_content_fossology_license_get_partition_empty( | def test_generate_content_fossology_license_get_partition_empty( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
"""get_partition when at least one of the partitions is empty""" | """get_partition when at least one of the partitions is empty""" | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0]["indexer_configuration_id"] | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes, conflict_update=True) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = set([c["id"] for c in fossology_licenses]) | expected_ids = set([c.id for c in fossology_licenses]) | ||||
# nb_partitions = smallest power of 2 such that at least one of | # nb_partitions = smallest power of 2 such that at least one of | ||||
# the partitions is empty | # the partitions is empty | ||||
nb_licenses = len(fossology_licenses) | nb_licenses = len(fossology_licenses) | ||||
nb_partitions = 1 << math.floor(math.log2(nb_licenses) + 1) | nb_partitions = 1 << math.floor(math.log2(nb_licenses) + 1) | ||||
seen_ids = [] | seen_ids = [] | ||||
Show All 17 Lines | def test_generate_content_fossology_license_get_partition_with_pagination( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
"""get_partition should return ids provided with pagination | """get_partition should return ids provided with pagination | ||||
""" | """ | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0]["indexer_configuration_id"] | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes, conflict_update=True) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = [c["id"] for c in fossology_licenses] | expected_ids = [c.id for c in fossology_licenses] | ||||
nb_partitions = 4 | nb_partitions = 4 | ||||
actual_ids = [] | actual_ids = [] | ||||
for partition_id in range(nb_partitions): | for partition_id in range(nb_partitions): | ||||
next_page_token = None | next_page_token = None | ||||
while True: | while True: | ||||
actual_result = storage.content_fossology_license_get_partition( | actual_result = storage.content_fossology_license_get_partition( | ||||
Show All 12 Lines | ) -> None: | ||||
for actual_id in actual_ids: | for actual_id in actual_ids: | ||||
assert actual_id in expected_ids | assert actual_id in expected_ids | ||||
def test_add_empty( | def test_add_empty( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
) -> None: | ) -> None: | ||||
(storage, data) = swh_indexer_storage_with_data | (storage, data) = swh_indexer_storage_with_data | ||||
etype = self.endpoint_type | etype = self.endpoint_type | ||||
tool = data.tools[self.tool_name] | |||||
summary = endpoint(storage, etype, "add")( | summary = endpoint(storage, etype, "add")([]) | ||||
[ | |||||
{ | |||||
"id": data.sha1_2, | |||||
"indexer_configuration_id": tool["id"], | |||||
"licenses": [], | |||||
} | |||||
] | |||||
) | |||||
assert summary == {"content_fossology_license:add": 0} | assert summary == {"content_fossology_license:add": 0} | ||||
actual_license = list(endpoint(storage, etype, "get")([data.sha1_2])) | actual_license = list(endpoint(storage, etype, "get")([data.sha1_2])) | ||||
assert actual_license == [] | assert actual_license == [] | ||||
def test_get_unknown( | def test_get_unknown( | ||||
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] | ||||
▲ Show 20 Lines • Show All 818 Lines • Show Last 20 Lines |