diff --git a/sql/upgrades/128.sql b/sql/upgrades/128.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/128.sql @@ -0,0 +1,9 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 127 +-- to_version: 128 +-- description: Add index on content_mimetype table to improve read queries + +insert into dbversion(version, release, description) +values(128, now(), 'Work In Progress'); + +create index on content_mimetype(id) where mimetype like 'text/%'; diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(127, now(), 'Work In Progress'); + values(128, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -39,6 +39,8 @@ alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; +create index on content_mimetype(id) where mimetype like 'text/%'; + -- content_language create unique index content_language_pkey on content_language(id, indexer_configuration_id); alter table content_language add primary key using index content_language_pkey; diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -134,19 +134,24 @@ table = self.content_indexer_names[content_type] if with_textual_data: extra = """inner join content_mimetype cm - on (t.id=cm.id and cm.mimetype like 'text/%%')""" + on (t.id=cm.id and cm.mimetype like 'text/%%' and + %(start)s <= cm.id and cm.id <= %(end)s) + """ else: extra = "" - query = """select t.id - from %s t - inner join indexer_configuration ic - on t.indexer_configuration_id=ic.id - %s - where ic.id=%%s and - %%s <= t.id and t.id <= %%s - order by t.indexer_configuration_id, t.id - limit %%s""" % (table, extra) - cur.execute(query, (indexer_configuration_id, start, end, limit)) + query = f"""select t.id + from {table} t + {extra} + where t.indexer_configuration_id=%(tool_id)s + and %(start)s <= t.id and t.id <= %(end)s + order by t.indexer_configuration_id, t.id + limit %(limit)s""" + cur.execute(query, { + 'start': start, + 'end': end, + 'tool_id': indexer_configuration_id, + 'limit': limit, + }) yield from cur def content_mimetype_get_from_list(self, ids, cur=None):