diff --git a/sql/upgrades/117.sql b/sql/upgrades/117.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/117.sql @@ -0,0 +1,8 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 116 +-- to_version: 117 +-- description: Add fulltext search index for origin intrinsic metadata + +alter table origin_intrinsic_metadata add column metadata_tsvector tsvector; +update origin_intrinsic_metadata set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); +create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -130,7 +130,8 @@ origin_id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, - from_revision sha1_git not null + from_revision sha1_git not null, + metadata_tsvector tsvector ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -408,16 +408,19 @@ language plpgsql as $$ begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) - select origin_id, metadata, indexer_configuration_id, from_revision + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector) + select origin_id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector from tmp_origin_intrinsic_metadata on conflict(origin_id, indexer_configuration_id) do update set metadata = excluded.metadata; else - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision) - select origin_id, metadata, indexer_configuration_id, from_revision + insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector) + select origin_id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector from tmp_origin_intrinsic_metadata on conflict(origin_id, indexer_configuration_id) do nothing; @@ -426,4 +429,19 @@ end $$; + +-- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata. +-- +-- It uses the "pg_catalog.simple" dictionary, as it has no stopword, +-- so it should be suitable for proper names and non-English text. +create or replace function swh_origin_intrinsic_metadata_compute_tsvector() + returns void + language plpgsql +as $$ +begin + update tmp_origin_intrinsic_metadata + set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata); +end +$$; + comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -64,3 +64,5 @@ alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; + +create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -628,6 +628,29 @@ cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') + @db_transaction_generator() + def origin_intrinsic_metadata_search_fulltext( + self, conjunction, limit=100, db=None, cur=None): + """Returns the list of origins whose metadata contain all the terms. + + Args: + conjunction (List[str]): List of terms to be searched for. + limit (int): The maximum number of results to return + + Yields: + list: dictionaries with the following keys: + + - **id** (int) + - **metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata + + """ + for c in db.origin_intrinsic_metadata_search_fulltext( + conjunction, limit=limit, cur=cur): + yield converters.db_to_metadata( + dict(zip(db.origin_intrinsic_metadata_cols, c))) + @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -313,6 +313,13 @@ 'origin_id', 'metadata', 'from_revision', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' + """The dictionary used to normalize 'metadata' and queries. + 'pg_catalog.simple' provides no stopword, so it should be suitable + for proper names and non-English content. + When updating this value, make sure to add a new index on + origin_intrinsic_metadata.metadata.""" + @stored_procedure('swh_mktemp_origin_intrinsic_metadata') def mktemp_origin_intrinsic_metadata(self, cur=None): pass @@ -329,6 +336,26 @@ self.origin_intrinsic_metadata_cols, cur=cur, id_col='origin_id') + def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, + cur=None): + regconfig = self.origin_intrinsic_metadata_regconfig + tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig + for _ in terms) + tsquery_args = [(term,) for term in terms] + keys = map(self._convert_key, self.origin_intrinsic_metadata_cols) + query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim " + "INNER JOIN indexer_configuration AS i " + "ON oim.indexer_configuration_id=i.id " + "JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true " + "WHERE to_tsvector('{regconfig}', metadata) @@ tsq " + "ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC " + "LIMIT %s;" + ).format(keys=', '.join(keys), + regconfig=regconfig, + tsquery_template=tsquery_template) + cur.execute(query, tsquery_args + [limit]) + yield from cur + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -44,6 +44,7 @@ self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.origin_id_1 = 54974445 + self.origin_id_2 = 44434342 cur = self.test_db[self.TEST_DB_NAME].cursor tools = {} @@ -1511,6 +1512,128 @@ # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) + def test_origin_intrinsic_metadata_search_fulltext(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + metadata1 = { + 'author': 'John Doe', + } + metadata1_rev = { + 'id': self.revision_id_1, + 'translated_metadata': metadata1, + 'indexer_configuration_id': tool_id, + } + metadata1_origin = { + 'origin_id': self.origin_id_1, + 'metadata': metadata1, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_1, + } + metadata2 = { + 'author': 'Jane Doe', + } + metadata2_rev = { + 'id': self.revision_id_2, + 'translated_metadata': metadata2, + 'indexer_configuration_id': tool_id, + } + metadata2_origin = { + 'origin_id': self.origin_id_2, + 'metadata': metadata2, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_2, + } + + # when + self.storage.revision_metadata_add([metadata1_rev]) + self.storage.origin_intrinsic_metadata_add([metadata1_origin]) + self.storage.revision_metadata_add([metadata2_rev]) + self.storage.origin_intrinsic_metadata_add([metadata2_origin]) + + # then + search = self.storage.origin_intrinsic_metadata_search_fulltext + self.assertCountEqual( + [res['origin_id'] for res in search(['Doe'])], + [self.origin_id_1, self.origin_id_2]) + self.assertEqual( + [res['origin_id'] for res in search(['John', 'Doe'])], + [self.origin_id_1]) + self.assertEqual( + [res['origin_id'] for res in search(['John'])], + [self.origin_id_1]) + self.assertEqual( + [res['origin_id'] for res in search(['John', 'Jane'])], + []) + + def test_origin_intrinsic_metadata_search_fulltext_rank(self): + # given + tool_id = self.tools['swh-metadata-detector']['id'] + + # The following authors have "Random Person" to add some more content + # to the JSON data, to work around normalization quirks when there + # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words + # for small values of nb_words). + metadata1 = { + 'author': [ + 'Random Person', + 'John Doe', + 'Jane Doe', + ] + } + metadata1_rev = { + 'id': self.revision_id_1, + 'translated_metadata': metadata1, + 'indexer_configuration_id': tool_id, + } + metadata1_origin = { + 'origin_id': self.origin_id_1, + 'metadata': metadata1, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_1, + } + metadata2 = { + 'author': [ + 'Random Person', + 'Jane Doe', + ] + } + metadata2_rev = { + 'id': self.revision_id_2, + 'translated_metadata': metadata2, + 'indexer_configuration_id': tool_id, + } + metadata2_origin = { + 'origin_id': self.origin_id_2, + 'metadata': metadata2, + 'indexer_configuration_id': tool_id, + 'from_revision': self.revision_id_2, + } + + # when + self.storage.revision_metadata_add([metadata1_rev]) + self.storage.origin_intrinsic_metadata_add([metadata1_origin]) + self.storage.revision_metadata_add([metadata2_rev]) + self.storage.origin_intrinsic_metadata_add([metadata2_origin]) + + # then + search = self.storage.origin_intrinsic_metadata_search_fulltext + self.assertEqual( + [res['origin_id'] for res in search(['Doe'])], + [self.origin_id_1, self.origin_id_2]) + self.assertEqual( + [res['origin_id'] for res in search(['Doe'], limit=1)], + [self.origin_id_1]) + self.assertEqual( + [res['origin_id'] for res in search(['John'])], + [self.origin_id_1]) + self.assertEqual( + [res['origin_id'] for res in search(['Jane'])], + [self.origin_id_2, self.origin_id_1]) + self.assertEqual( + [res['origin_id'] for res in search(['John', 'Jane'])], + [self.origin_id_1]) + def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool',