swh/indexer/tests/storage/test_storage.py
@@ (131 lines not shown) @@ ) -> None:
         )
         assert summary == expected_summary(1, etype)
         # we expect only the other one returned
         actual_missing = endpoint(storage, etype, "missing")(query)
         assert list(actual_missing) == [data.sha1_1]
-    def test_add__drop_duplicate(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        etype = self.endpoint_type
-        tool_id = data.tools[self.tool_name]["id"]
-        # add the first object
-        data_v1 = {
-            "id": data.sha1_2,
-            **self.example_data[0],
-            "indexer_configuration_id": tool_id,
-        }
-        summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)])
-        assert summary == expected_summary(1, etype)
-        # should be able to retrieve it
-        actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
-        expected_data_v1 = [
-            self.row_class.from_dict(
-                {
-                    "id": data.sha1_2,
-                    **self.example_data[0],
-                    "tool": data.tools[self.tool_name],
-                }
-            )
-        ]
-        assert actual_data == expected_data_v1
-        # now if we add a modified version of the same object (same id)
-        data_v2 = data_v1.copy()
-        data_v2.update(self.example_data[1])
-        summary2 = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
-        assert summary2 == expected_summary(0, etype)  # not added
-        # we expect to retrieve the original data, not the modified one
-        actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
-        assert actual_data == expected_data_v1
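[Note] This test is deleted because add() no longer takes a conflict_update flag: duplicates are no longer dropped, they are always overwritten, so the "v2 was dropped" assertions above have nothing left to check. A minimal sketch of the old versus new semantics, using a plain dict as a toy stand-in for the storage backend (names and types here are illustrative, not the swh API):

    from typing import Dict, Iterable, List, Tuple

    Row = Tuple[bytes, str]  # toy (id, payload) pair standing in for a row class

    def add_old(store: Dict[bytes, str], rows: Iterable[Row], conflict_update: bool) -> int:
        """Pre-change: duplicates are silently dropped unless conflict_update=True."""
        count = 0
        for id_, payload in rows:
            if id_ in store and not conflict_update:
                continue  # dropped; the returned summary does not count it
            store[id_] = payload
            count += 1
        return count

    def add_new(store: Dict[bytes, str], rows: Iterable[Row]) -> int:
        """Post-change: an unconditional upsert; the last write always wins."""
        rows = list(rows)
        for id_, payload in rows:
            store[id_] = payload
        return len(rows)

    legacy: Dict[bytes, str] = {}
    add_old(legacy, [(b"\x01", "v1")], conflict_update=False)
    add_old(legacy, [(b"\x01", "v2")], conflict_update=False)
    assert legacy[b"\x01"] == "v1"  # v2 was dropped under the old default

    store: Dict[bytes, str] = {}
    add_new(store, [(b"\x01", "v1")])
    add_new(store, [(b"\x01", "v2")])
    assert store[b"\x01"] == "v2"  # v2 overwrote v1 instead of being dropped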
     def test_add__update_in_place_duplicate(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
         data_v1 = {
@@ (17 lines not shown) @@ ) -> None:
         # then
         assert actual_data == expected_data_v1
         # given
         data_v2 = data_v1.copy()
         data_v2.update(self.example_data[1])
-        endpoint(storage, etype, "add")(
-            [self.row_class.from_dict(data_v2)], conflict_update=True
-        )
+        endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
         assert summary == expected_summary(1, etype)  # modified so counted
         actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
         expected_data_v2 = [
             self.row_class.from_dict(
                 {"id": data.sha1_2, **self.example_data[1], "tool": tool,}
             )
         ]
         # data did change as the v2 was used to overwrite v1
         assert actual_data == expected_data_v2
-    def test_add__update_in_place_deadlock(
+    def test_add_deadlock(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
         hashes = [
             hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}".format(i))
@@ (25 lines not shown) @@ ) -> None:
         # all items to be in the DB.
         data_v2a = data_v2[1:]
         data_v2b = list(reversed(data_v2[0:-1]))
         # given
         endpoint(storage, etype, "add")(data_v1)
         # when
-        actual_data = list(endpoint(storage, etype, "get")(hashes))
+        actual_data = sorted(
+            endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
+        )
         expected_data_v1 = [
             self.row_class.from_dict(
                 {"id": hash_, **self.example_data[0], "tool": tool}
             )
             for hash_ in hashes
         ]
         # then
         assert actual_data == expected_data_v1
         # given
         def f1() -> None:
-            endpoint(storage, etype, "add")(data_v2a, conflict_update=True)
+            endpoint(storage, etype, "add")(data_v2a)
         def f2() -> None:
-            endpoint(storage, etype, "add")(data_v2b, conflict_update=True)
+            endpoint(storage, etype, "add")(data_v2b)
         t1 = threading.Thread(target=f1)
         t2 = threading.Thread(target=f2)
         t2.start()
         t1.start()
         t1.join()
         t2.join()
-        actual_data = sorted(
-            (row.to_dict() for row in endpoint(storage, etype, "get")(hashes)),
-            key=lambda x: x["id"],
-        )
-        expected_data_v2 = [
-            {"id": hash_, **self.example_data[1], "tool": tool} for hash_ in hashes
-        ]
-        assert actual_data == expected_data_v2
+        actual_data = sorted(
+            endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
+        )
+        expected_data_v2 = [
+            self.row_class.from_dict(
+                {"id": hash_, **self.example_data[1], "tool": tool}
+            )
+            for hash_ in hashes
+        ]
+        assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+        for (item, expected_item_v1, expected_item_v2) in zip(
+            actual_data, expected_data_v1, expected_data_v2
+        ):
+            assert item in (expected_item_v1, expected_item_v2)
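[Note] The rename drops "update_in_place" from the test name, but the scenario is unchanged: two threads upsert overlapping id ranges in opposite orders, the classic way to make PostgreSQL take row locks in conflicting order and deadlock. The assertion is also relaxed: since one transaction may be rolled back by the deadlock detector, each row is only required to match one of the two known versions. A single-process toy of the same write pattern (a sketch; the real contention happens on database row locks, not this Python lock):

    import threading

    store = {i: "v1" for i in range(10)}  # every row starts at version v1
    lock = threading.Lock()

    def upsert_all(rows):
        for id_, payload in rows:
            with lock:
                store[id_] = payload

    ids = list(range(10))
    v2a = [(i, "v2") for i in ids[1:]]             # ascending, skips the first id
    v2b = [(i, "v2") for i in reversed(ids[:-1])]  # descending, skips the last id

    t1 = threading.Thread(target=upsert_all, args=(v2a,))
    t2 = threading.Thread(target=upsert_all, args=(v2b,))
    t2.start(); t1.start(); t1.join(); t2.join()

    # In-process the outcome is deterministic; the database test must instead
    # accept "v1 or v2" per row, because a deadlocked transaction can roll back.
    assert all(store[i] == "v2" for i in ids)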
     def test_add__duplicate_twice(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         etype = self.endpoint_type
         tool = data.tools[self.tool_name]
@@ (13 lines not shown) @@ ) -> None:
             }
         )
         # when
         summary = endpoint(storage, etype, "add")([data_rev1])
         assert summary == expected_summary(1, etype)
         with pytest.raises(DuplicateId):
-            endpoint(storage, etype, "add")(
-                [data_rev2, data_rev2], conflict_update=True
-            )
+            endpoint(storage, etype, "add")([data_rev2, data_rev2])
         # then
         actual_data = list(
             endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1])
         )
         expected_data = [
             self.row_class.from_dict(
@@ (194 lines not shown) @@ example_data = [
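[Note] The DuplicateId check above is independent of the removed flag: one add() batch may never contain the same id twice. A sketch of the kind of batch validation this exercises (a guess at its shape, not the actual implementation in swh.indexer.storage):

    from collections import Counter

    class DuplicateId(Exception):
        """Toy stand-in for swh's DuplicateId error."""

    def check_id_duplicates(rows):
        dupes = [id_ for id_, n in Counter(r["id"] for r in rows).items() if n > 1]
        if dupes:
            raise DuplicateId(dupes)

    check_id_duplicates([{"id": b"\x02"}, {"id": b"\x03"}])  # passes
    try:
        check_id_duplicates([{"id": b"\x02"}, {"id": b"\x02"}])
    except DuplicateId as exc:
        print("rejected batch:", exc)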
{"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",}, | {"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",}, | ||||
{"name": "done", "kind": "variable", "line": 100, "lang": "Python",}, | {"name": "done", "kind": "variable", "line": 100, "lang": "Python",}, | ||||
{"name": "main", "kind": "function", "line": 119, "lang": "Python",}, | {"name": "main", "kind": "function", "line": 119, "lang": "Python",}, | ||||
] | ] | ||||
row_class = ContentCtagsRow | row_class = ContentCtagsRow | ||||
# the following tests are disabled because CTAGS behaves differently | # the following tests are disabled because CTAGS behaves differently | ||||
@pytest.mark.skip | @pytest.mark.skip | ||||
def test_add__drop_duplicate(self): | |||||
pass | |||||
@pytest.mark.skip | |||||
     def test_add__update_in_place_duplicate(self):
         pass
     @pytest.mark.skip
-    def test_add__update_in_place_deadlock(self):
+    def test_add_deadlock(self):
         pass
     @pytest.mark.skip
     def test_add__duplicate_twice(self):
         pass
     @pytest.mark.skip
     def test_get(self):
@@ (165 lines not shown) @@ ) -> None:
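[Note] The upsert-semantics tests above stay skipped for ctags because this endpoint is not one-row-per-id: content_ctags_add stores one row per symbol, so several rows can share a content id and "overwrite the duplicate" has no single-row meaning. A toy contrast using plain containers (not the swh schema):

    from collections import defaultdict

    # One-row-per-id endpoints (mimetype, language, ...): a dict keyed by id,
    # so an upsert replaces the previous value.
    mimetype_store = {}
    mimetype_store[b"\x01"] = {"mimetype": "text/plain", "encoding": "utf-8"}

    # Ctags: many rows per id, one per symbol; adding appends rather than replaces.
    ctags_store = defaultdict(list)
    ctags_store[b"\x01"].append({"name": "done", "kind": "variable", "line": 119})
    ctags_store[b"\x01"].append({"name": "main", "kind": "function", "line": 119})
    assert len(ctags_store[b"\x01"]) == 2  # both symbols coexist under one id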
             indexer_configuration_id=tool_id,
             name="defn",
             kind="function",
             line=120,
             lang="Scheme",
         )
         ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
-        storage.content_ctags_add([ctag1, ctag2], conflict_update=True)
+        storage.content_ctags_add([ctag1, ctag2])
         actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
         assert actual_ctags == [ctag1_with_tool, ctag2_with_tool]
     def test_add_empty(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
@@ (136 lines not shown) @@ class TestIndexerStorageContentFossologyLicense:
     ) -> None:
         """get_partition should return results"""
         storage, data = swh_indexer_storage_with_data
         # craft some consistent mimetypes
         fossology_licenses = data.fossology_licenses
         mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
         indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
-        storage.content_mimetype_add(mimetypes, conflict_update=True)
+        storage.content_mimetype_add(mimetypes)
         # add fossology_licenses to storage
         storage.content_fossology_license_add(fossology_licenses)
         # All ids from the db
         expected_ids = set([c.id for c in fossology_licenses])
         assert len(fossology_licenses) == 10
         assert len(mimetypes) == 10
@@ (19 lines not shown) @@ ) -> None:
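[Note] get_partition splits the id space into nb_partitions contiguous ranges and yields the ids of one range. One plausible bucketing by the integer prefix of the sha1, for intuition only (the actual range computation is elided here and lives in the swh code base):

    def partition_of(id_: bytes, nb_partitions: int, width: int = 8) -> int:
        """Map a hash to its partition index by its big-endian integer prefix."""
        prefix = int.from_bytes(id_[:width], byteorder="big")
        # Split the [0, 2**(8 * width)) prefix space into equal contiguous ranges.
        return (prefix * nb_partitions) >> (8 * width)

    sha1 = bytes.fromhex("34973274ccef6ab4dfaaf86599792fa9c3fe4001")
    assert 0 <= partition_of(sha1, 16) < 16
    assert partition_of(sha1, 1) == 0  # a single partition covers every id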
""" | """ | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = set([c.id for c in fossology_licenses]) | expected_ids = set([c.id for c in fossology_licenses]) | ||||
actual_result = storage.content_fossology_license_get_partition( | actual_result = storage.content_fossology_license_get_partition( | ||||
indexer_configuration_id, 0, 1 | indexer_configuration_id, 0, 1 | ||||
@@ (9 lines not shown) @@ class TestIndexerStorageContentFossologyLicense:
     ) -> None:
         """get_partition when at least one of the partitions is empty"""
         storage, data = swh_indexer_storage_with_data
         # craft some consistent mimetypes
         fossology_licenses = data.fossology_licenses
         mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
         indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
-        storage.content_mimetype_add(mimetypes, conflict_update=True)
+        storage.content_mimetype_add(mimetypes)
         # add fossology_licenses to storage
         storage.content_fossology_license_add(fossology_licenses)
         # All ids from the db
         expected_ids = set([c.id for c in fossology_licenses])
         # nb_partitions = smallest power of 2 such that at least one of
         # the partitions is empty
@@ (25 lines not shown) @@ ) -> None:
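[Note] The comment above pins down nb_partitions by pigeonhole: any power of two strictly greater than the number of ids leaves at least one partition empty. A one-liner for that computation (a sketch of what the elided code presumably does with its 10 ids):

    import math

    n_ids = 10  # the test stores 10 fossology license rows
    nb_partitions = 1 << math.ceil(math.log2(n_ids + 1))
    assert nb_partitions == 16
    assert nb_partitions > n_ids  # pigeonhole: at least one partition is empty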
""" | """ | ||||
storage, data = swh_indexer_storage_with_data | storage, data = swh_indexer_storage_with_data | ||||
# craft some consistent mimetypes | # craft some consistent mimetypes | ||||
fossology_licenses = data.fossology_licenses | fossology_licenses = data.fossology_licenses | ||||
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) | ||||
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | indexer_configuration_id = fossology_licenses[0].indexer_configuration_id | ||||
storage.content_mimetype_add(mimetypes, conflict_update=True) | storage.content_mimetype_add(mimetypes) | ||||
# add fossology_licenses to storage | # add fossology_licenses to storage | ||||
storage.content_fossology_license_add(fossology_licenses) | storage.content_fossology_license_add(fossology_licenses) | ||||
# All ids from the db | # All ids from the db | ||||
expected_ids = [c.id for c in fossology_licenses] | expected_ids = [c.id for c in fossology_licenses] | ||||
nb_partitions = 4 | nb_partitions = 4 | ||||
@@ (83 lines not shown) @@ ) -> None:
                 tool=data.tools["swh-metadata-detector"],
                 from_revision=data.revision_id_2,
                 mappings=["mapping1"],
             )
         ]
         assert actual_metadata == expected_metadata
-    def test_origin_intrinsic_metadata_add_drop_duplicate(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        # given
-        tool_id = data.tools["swh-metadata-detector"]["id"]
-        metadata_v1: Dict[str, Any] = {
-            "version": None,
-            "name": None,
-        }
-        metadata_rev_v1 = RevisionIntrinsicMetadataRow(
-            id=data.revision_id_1,
-            metadata=metadata_v1.copy(),
-            mappings=[],
-            indexer_configuration_id=tool_id,
-        )
-        metadata_origin_v1 = OriginIntrinsicMetadataRow(
-            id=data.origin_url_1,
-            metadata=metadata_v1.copy(),
-            indexer_configuration_id=tool_id,
-            mappings=[],
-            from_revision=data.revision_id_1,
-        )
-        # given
-        storage.revision_intrinsic_metadata_add([metadata_rev_v1])
-        storage.origin_intrinsic_metadata_add([metadata_origin_v1])
-        # when
-        actual_metadata = list(
-            storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"])
-        )
-        expected_metadata_v1 = [
-            OriginIntrinsicMetadataRow(
-                id=data.origin_url_1,
-                metadata=metadata_v1,
-                tool=data.tools["swh-metadata-detector"],
-                from_revision=data.revision_id_1,
-                mappings=[],
-            )
-        ]
-        assert actual_metadata == expected_metadata_v1
-        # given
-        metadata_v2 = metadata_v1.copy()
-        metadata_v2.update(
-            {"name": "test_metadata", "author": "MG",}
-        )
-        metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
-        metadata_origin_v2 = attr.evolve(metadata_origin_v1, metadata=metadata_v2)
-        storage.revision_intrinsic_metadata_add([metadata_rev_v2])
-        storage.origin_intrinsic_metadata_add([metadata_origin_v2])
-        # then
-        actual_metadata = list(
-            storage.origin_intrinsic_metadata_get([data.origin_url_1])
-        )
-        # metadata did not change as the v2 was dropped.
-        assert actual_metadata == expected_metadata_v1
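[Note] Deleted for the same reason as test_add__drop_duplicate above: origin_intrinsic_metadata_add now overwrites existing rows unconditionally, so "the v2 was dropped" can no longer happen. The attr.evolve calls in this removed body show the pattern used to build the v2 rows; a minimal attrs sketch (toy class, not the swh row types):

    import attr

    @attr.s
    class ToyMetadataRow:
        id = attr.ib()
        metadata = attr.ib()

    row_v1 = ToyMetadataRow(id="https://example.org/repo", metadata={"name": None})
    # attr.evolve returns a copy with only the named fields replaced.
    row_v2 = attr.evolve(row_v1, metadata={"name": "test_metadata", "author": "MG"})
    assert row_v1.metadata == {"name": None}           # original is untouched
    assert row_v2.metadata["name"] == "test_metadata"  # copy carries the update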
     def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools["swh-metadata-detector"]["id"]
         metadata_v1: Dict[str, Any] = {
@@ (44 lines not shown) @@ ) -> None:
         metadata_origin_v2 = OriginIntrinsicMetadataRow(
             id=data.origin_url_1,
             metadata=metadata_v2.copy(),
             indexer_configuration_id=tool_id,
             mappings=["npm"],
             from_revision=data.revision_id_1,
         )
-        storage.revision_intrinsic_metadata_add([metadata_rev_v2], conflict_update=True)
-        storage.origin_intrinsic_metadata_add(
-            [metadata_origin_v2], conflict_update=True
-        )
+        storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+        storage.origin_intrinsic_metadata_add([metadata_origin_v2])
         actual_metadata = list(
             storage.origin_intrinsic_metadata_get([data.origin_url_1])
         )
         expected_metadata_v2 = [
             OriginIntrinsicMetadataRow(
                 id=data.origin_url_1,
                 metadata=metadata_v2,
                 tool=data.tools["swh-metadata-detector"],
                 from_revision=data.revision_id_1,
                 mappings=["npm"],
             )
         ]
         # metadata did change as the v2 was used to overwrite v1
         assert actual_metadata == expected_metadata_v2
-    def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
+    def test_origin_intrinsic_metadata_add__deadlock(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools["swh-metadata-detector"]["id"]
         ids = list(range(10))
@@ (55 lines not shown) @@ ) -> None:
             for id_ in ids
         ]
         # then
         assert actual_data == expected_data_v1
         # given
         def f1() -> None:
-            storage.origin_intrinsic_metadata_add(data_v2a, conflict_update=True)
+            storage.origin_intrinsic_metadata_add(data_v2a)
         def f2() -> None:
-            storage.origin_intrinsic_metadata_add(data_v2b, conflict_update=True)
+            storage.origin_intrinsic_metadata_add(data_v2b)
         t1 = threading.Thread(target=f1)
         t2 = threading.Thread(target=f2)
         t2.start()
         t1.start()
         t1.join()
         t2.join()
         actual_data = list(storage.origin_intrinsic_metadata_get(origins))
         expected_data_v2 = [
             OriginIntrinsicMetadataRow(
                 id="file:///tmp/origin%d" % id_,
                 from_revision=data.revision_id_2,
                 tool=data.tools["swh-metadata-detector"],
                 **example_data2,
             )
             for id_ in ids
         ]
-        assert len(actual_data) == len(expected_data_v2)
-        assert sorted(actual_data, key=lambda x: x.id) == expected_data_v2
+        assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+        for (item, expected_item_v1, expected_item_v2) in zip(
+            actual_data, expected_data_v1, expected_data_v2
+        ):
+            assert item in (expected_item_v1, expected_item_v2)
     def test_origin_intrinsic_metadata_add__duplicate_twice(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
         storage, data = swh_indexer_storage_with_data
         # given
         tool_id = data.tools["swh-metadata-detector"]["id"]
@@ (449 lines not shown) @@