diff --git a/sql/upgrades/117.sql b/sql/upgrades/117.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/117.sql @@ -0,0 +1,6 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 116 +-- to_version: 117 +-- description: Add fulltext search index for origin intrinsic metadata + +create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (to_tsvector('pg_catalog.simple', metadata)); diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -64,3 +64,5 @@ alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; + +create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (to_tsvector('pg_catalog.simple', metadata)); -- "pg_catalog.simple" is a dictionary with no stopword, so it should be suitable for proper names and non-English text. diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -628,6 +628,28 @@ cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') + @db_transaction_generator() + def origin_intrinsic_metadata_search_fulltext( + self, conjunction, db=None, cur=None): + """Returns the list of origins whose metadata contain all the terms. + + Args: + conjunction (List[str]): List of terms to be searched for. + + Yields: + list: dictionaries with the following keys: + + - **id** (int) + - **metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata + + """ + for c in db.origin_intrinsic_metadata_search_fulltext( + conjunction, cur): + yield converters.db_to_metadata( + dict(zip(db.origin_intrinsic_metadata_cols, c))) + @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -313,6 +313,13 @@ 'origin_id', 'metadata', 'from_revision', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' + """The dictionary used to normalize 'metadata' and queries. + 'pg_catalog.simple' provides no stopword, so it should be suitable + for proper names and non-English content. + When updating this value, make sure to add a new index on + origin_intrinsic_metadata.metadata.""" + @stored_procedure('swh_mktemp_origin_intrinsic_metadata') def mktemp_origin_intrinsic_metadata(self, cur=None): pass @@ -329,6 +336,23 @@ self.origin_intrinsic_metadata_cols, cur=cur, id_col='origin_id') + def origin_intrinsic_metadata_search_fulltext(self, terms, cur=None): + regconfig = self.origin_intrinsic_metadata_regconfig + tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig + for _ in terms) + tsquery_args = [(term,) for term in terms] + keys = map(self._convert_key, self.origin_intrinsic_metadata_cols) + query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim " + "INNER JOIN indexer_configuration AS i " + "ON oim.indexer_configuration_id=i.id " + "WHERE to_tsvector('{regconfig}', metadata) " + " @@ ({tsquery_template});" + ).format(keys=', '.join(keys), + regconfig=regconfig, + tsquery_template=tsquery_template) + cur.execute(query, tsquery_args) + yield from cur + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -44,6 +44,7 @@ self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.origin_id_1 = 54974445 + self.origin_id_2 = 44434342 cur = self.test_db[self.TEST_DB_NAME].cursor tools = {} @@ -1511,6 +1512,60 @@ # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) + def test_origin_intrinsic_metadata_search_fulltext(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata1 = { + 'author': 'John Doe', + } + metadata1_rev = { + 'id': self.revision_id_1, + 'translated_metadata': metadata1, + 'indexer_configuration_id': tool_id, + } + metadata1_origin = { + 'origin_id': self.origin_id_1, + 'metadata': metadata1, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_1, + } + metadata2 = { + 'author': 'Jane Doe', + } + metadata2_rev = { + 'id': self.revision_id_2, + 'translated_metadata': metadata2, + 'indexer_configuration_id': tool_id, + } + metadata2_origin = { + 'origin_id': self.origin_id_2, + 'metadata': metadata2, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_2, + } + + # when + self.storage.revision_metadata_add([metadata1_rev]) + self.storage.origin_intrinsic_metadata_add([metadata1_origin]) + self.storage.revision_metadata_add([metadata2_rev]) + self.storage.origin_intrinsic_metadata_add([metadata2_origin]) + + # then + search = self.storage.origin_intrinsic_metadata_search_fulltext + self.assertEqual( + {res['origin_id'] for res in search(['Doe'])}, + {self.origin_id_1, self.origin_id_2}) + self.assertEqual( + {res['origin_id'] for res in search(['John', 'Doe'])}, + {self.origin_id_1}) + self.assertEqual( + {res['origin_id'] for res in search(['John'])}, + {self.origin_id_1}) + self.assertEqual( + {res['origin_id'] for res in search(['John', 'Jane'])}, + set()) + def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool',