diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -371,33 +371,11 @@
         # Deduplicate revisions
         rev_metadata: List[RevisionIntrinsicMetadataRow] = []
         orig_metadata: List[OriginIntrinsicMetadataRow] = []
-        revs_to_delete: List[Dict] = []
-        origs_to_delete: List[Dict] = []
         summary: Dict = {}
         for (orig_item, rev_item) in results:
             assert rev_item.metadata == orig_item.metadata
-            if not rev_item.metadata or rev_item.metadata.keys() <= {"@context"}:
-                # If we didn't find any metadata, don't store a DB record
-                # (and delete existing ones, if any)
-                if rev_item not in revs_to_delete:
-                    revs_to_delete.append(
-                        {
-                            "id": rev_item.id,
-                            "indexer_configuration_id": (
-                                rev_item.indexer_configuration_id
-                            ),
-                        }
-                    )
-                if orig_item not in origs_to_delete:
-                    origs_to_delete.append(
-                        {
-                            "id": orig_item.id,
-                            "indexer_configuration_id": (
-                                orig_item.indexer_configuration_id
-                            ),
-                        }
-                    )
-            else:
+            if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
+                # Only store non-empty metadata sets
                 if rev_item not in rev_metadata:
                     rev_metadata.append(rev_item)
                 if orig_item not in orig_metadata:
@@ -414,19 +392,4 @@
             )
             summary.update(summary_ori)
 
-        # revs_to_delete should always be empty unless we changed a mapping
-        # to detect less files or less content.
-        # However, origs_to_delete may be empty whenever an upstream deletes
-        # a metadata file.
-        if origs_to_delete:
-            summary_ori = self.idx_storage.origin_intrinsic_metadata_delete(
-                origs_to_delete
-            )
-            summary.update(summary_ori)
-        if revs_to_delete:
-            summary_rev = self.idx_storage.revision_intrinsic_metadata_delete(
-                revs_to_delete
-            )
-            summary.update(summary_rev)
-
         return summary
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -562,15 +562,6 @@
             "revision_intrinsic_metadata:add": count,
         }
 
-    @timed
-    @process_metrics
-    @db_transaction()
-    def revision_intrinsic_metadata_delete(
-        self, entries: List[Dict], db=None, cur=None
-    ) -> Dict:
-        count = db.revision_intrinsic_metadata_delete(entries, cur)
-        return {"revision_intrinsic_metadata:del": count}
-
     @timed
     @db_transaction()
     def origin_intrinsic_metadata_get(
@@ -611,17 +602,6 @@
             "origin_intrinsic_metadata:add": count,
         }
 
-    @timed
-    @process_metrics
-    @db_transaction()
-    def origin_intrinsic_metadata_delete(
-        self, entries: List[Dict], db=None, cur=None
-    ) -> Dict:
-        count = db.origin_intrinsic_metadata_delete(entries, cur)
-        return {
-            "origin_intrinsic_metadata:del": count,
-        }
-
     @timed
     @db_transaction()
     def origin_intrinsic_metadata_search_fulltext(
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -399,17 +399,6 @@
         )
         return cur.fetchone()[0]
 
-    def revision_intrinsic_metadata_delete(self, entries, cur=None):
-        cur = self._cursor(cur)
-        cur.execute(
-            "DELETE from revision_intrinsic_metadata "
-            "WHERE (id, indexer_configuration_id) IN "
-            " (VALUES %s) "
-            "RETURNING id" % (", ".join("%s" for _ in entries)),
-            tuple((e["id"], e["indexer_configuration_id"]) for e in entries),
-        )
-        return len(cur.fetchall())
-
     def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
         yield from self._get_from_list(
             "revision_intrinsic_metadata",
@@ -447,17 +436,6 @@
         )
         return cur.fetchone()[0]
 
-    def origin_intrinsic_metadata_delete(self, entries, cur=None):
-        cur = self._cursor(cur)
-        cur.execute(
-            "DELETE from origin_intrinsic_metadata "
-            "WHERE (id, indexer_configuration_id) IN"
-            " (VALUES %s) "
-            "RETURNING id" % (", ".join("%s" for _ in entries)),
-            tuple((e["id"], e["indexer_configuration_id"]) for e in entries),
-        )
-        return len(cur.fetchall())
-
     def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
         yield from self._get_from_list(
             "origin_intrinsic_metadata",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -226,22 +226,6 @@
                 self._sorted_ids.add(id_)
         return count
 
-    def delete(self, entries: List[Dict]) -> int:
-        """Delete entries and return the number of entries deleted.
-
-        """
-        deleted = 0
-        for entry in entries:
-            (id_, tool_id) = (entry["id"], entry["indexer_configuration_id"])
-            if tool_id in self._tools_per_id[id_]:
-                self._tools_per_id[id_].remove(tool_id)
-            if id_ in self._data:
-                key = self._key_from_dict(entry)
-                if key in self._data[id_]:
-                    deleted += 1
-                    del self._data[id_][key]
-        return deleted
-
 
 class IndexerStorage:
     """In-memory SWH indexer storage."""
@@ -394,10 +378,6 @@
         added = self._revision_intrinsic_metadata.add(metadata, conflict_update)
         return {"revision_intrinsic_metadata:add": added}
 
-    def revision_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict:
-        deleted = self._revision_intrinsic_metadata.delete(entries)
-        return {"revision_intrinsic_metadata:del": deleted}
-
     def origin_intrinsic_metadata_get(
         self, urls: Iterable[str]
     ) -> List[OriginIntrinsicMetadataRow]:
@@ -409,10 +389,6 @@
         added = self._origin_intrinsic_metadata.add(metadata, conflict_update)
         return {"origin_intrinsic_metadata:add": added}
 
-    def origin_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict:
-        deleted = self._origin_intrinsic_metadata.delete(entries)
-        return {"origin_intrinsic_metadata:del": deleted}
-
     def origin_intrinsic_metadata_search_fulltext(
         self, conjunction: List[str], limit: int = 100
     ) -> List[OriginIntrinsicMetadataRow]:
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -408,22 +408,6 @@
         """
         ...
 
-    @remote_api_endpoint("revision_intrinsic_metadata/delete")
-    def revision_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict:
-        """Remove revision metadata from the storage.
-
-        Args:
-            entries (dict): dictionaries with the following keys:
-
-                - **id** (bytes): revision identifier
-                - **indexer_configuration_id** (int): tool used to compute
-                  metadata
-
-        Returns:
-            Summary of number of rows deleted
-        """
-        ...
-
     @remote_api_endpoint("origin_intrinsic_metadata")
     def origin_intrinsic_metadata_get(
         self, urls: Iterable[str]
@@ -455,22 +439,6 @@
         """
         ...
 
-    @remote_api_endpoint("origin_intrinsic_metadata/delete")
-    def origin_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict:
-        """Remove origin metadata from the storage.
-
-        Args:
-            entries (dict): dictionaries with the following keys:
-
-                - **id** (str): origin urls
-                - **indexer_configuration_id** (int): tool used to compute
-                  metadata
-
-        Returns:
-            Summary of number of rows deleted
-        """
-        ...
-
     @remote_api_endpoint("origin_intrinsic_metadata/search/fulltext")
     def origin_intrinsic_metadata_search_fulltext(
         self, conjunction: List[str], limit: int = 100
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -5,7 +5,7 @@
 
 import math
 import threading
-from typing import Any, Dict, List, Tuple, Type, cast
+from typing import Any, Dict, List, Tuple, Type
 
 import attr
 import pytest
@@ -817,45 +817,6 @@
     ]
     row_class = RevisionIntrinsicMetadataRow
 
-    def test_revision_intrinsic_metadata_delete(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        etype = self.endpoint_type
-        tool = data.tools[self.tool_name]
-
-        query = [data.sha1_2, data.sha1_1]
-        data1 = RevisionIntrinsicMetadataRow(
-            id=data.sha1_2,
-            indexer_configuration_id=tool["id"],
-            **self.example_data[0],  # type: ignore
-        )
-
-        # when
-        summary = endpoint(storage, etype, "add")([data1])
-        assert summary == expected_summary(1, etype)
-
-        summary2 = endpoint(storage, etype, "delete")(
-            [{"id": data.sha1_2, "indexer_configuration_id": tool["id"],}]
-        )
-        assert summary2 == expected_summary(1, etype, "del")
-
-        # then
-        actual_data = list(endpoint(storage, etype, "get")(query))
-
-        # then
-        assert not actual_data
-
-    def test_revision_intrinsic_metadata_delete_nonexisting(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        etype = self.endpoint_type
-        tool = data.tools[self.tool_name]
-        endpoint(storage, etype, "delete")(
-            [{"id": data.sha1_2, "indexer_configuration_id": tool["id"],}]
-        )
-
 
 class TestIndexerStorageContentFossologyLicense:
     endpoint_type = "content_fossology_license"
@@ -1132,60 +1093,6 @@
 
         assert actual_metadata == expected_metadata
 
-    def test_origin_intrinsic_metadata_delete(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        # given
-        tool_id = data.tools["swh-metadata-detector"]["id"]
-
-        metadata = {
-            "version": None,
-            "name": None,
-        }
-        metadata_rev = RevisionIntrinsicMetadataRow(
-            id=data.revision_id_2,
-            indexer_configuration_id=tool_id,
-            metadata=metadata,
-            mappings=["mapping1"],
-        )
-        metadata_origin = OriginIntrinsicMetadataRow(
-            id=data.origin_url_1,
-            metadata=metadata,
-            indexer_configuration_id=tool_id,
-            mappings=["mapping1"],
-            from_revision=data.revision_id_2,
-        )
-        metadata_origin2 = attr.evolve(metadata_origin, id=data.origin_url_2)
-
-        # when
-        storage.revision_intrinsic_metadata_add([metadata_rev])
-        storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin2])
-
-        storage.origin_intrinsic_metadata_delete(
-            [{"id": data.origin_url_1, "indexer_configuration_id": tool_id}]
-        )
-
-        # then
-        actual_metadata = list(
-            storage.origin_intrinsic_metadata_get(
-                [data.origin_url_1, data.origin_url_2, "no://where"]
-            )
-        )
-        assert [
-            attr.evolve(m, indexer_configuration_id=cast(Dict, m.tool)["id"], tool=None)
-            for m in actual_metadata
-        ] == [metadata_origin2]
-
-    def test_origin_intrinsic_metadata_delete_nonexisting(
-        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
-    ) -> None:
-        storage, data = swh_indexer_storage_with_data
-        tool_id = data.tools["swh-metadata-detector"]["id"]
-        storage.origin_intrinsic_metadata_delete(
-            [{"id": data.origin_url_1, "indexer_configuration_id": tool_id}]
-        )
-
     def test_origin_intrinsic_metadata_add_drop_duplicate(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
     ) -> None:
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -211,32 +211,6 @@
     assert orig_results == []
 
 
-def test_origin_metadata_indexer_delete_metadata(
-    idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
-) -> None:
-
-    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
-    origin = "https://github.com/librariesio/yarn-parser"
-    indexer.run([origin])
-
-    rev_id = REVISION.id
-
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results != []
-
-    orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
-    assert orig_results != []
-
-    with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
-        indexer.run([origin])
-
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results == []
-
-    orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
-    assert orig_results == []
-
-
 def test_origin_metadata_indexer_unknown_origin(
     idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
 ) -> None: