diff --git a/sql/upgrades/135.sql b/sql/upgrades/135.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/135.sql @@ -0,0 +1,13 @@ +-- SWH DB schema upgrade +-- from_version: 134 +-- to_version: 135 +-- description: Add an index on origin.url, drop the index on (origin.type, origin.url) + +insert into dbversion(version, release, description) + values(135, now(), 'Work In Progress'); + +create extension if not exists pg_trgm; + +drop index origin_type_url_idx; +create index concurrently on origin using gin (url gin_trgm_ops); +create index concurrently on origin using hash (url); diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -650,7 +650,8 @@ query = """SELECT %s FROM (VALUES %%s) as t(type, url) LEFT JOIN origin - ON (t.type=origin.type AND t.url=origin.url) + ON ((t.type IS NULL OR t.type=origin.type) + AND t.url=origin.url) """ % ','.join('origin.' + col for col in self.origin_cols) yield from execute_values_generator( diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -929,10 +929,13 @@ """Return origins, either all identified by their ids or all identified by tuples (type, url). + If the url is given and the type is omitted, one of the origins with + that url is returned. + Args: origin: a list of dictionaries representing the individual origins to find. - These dicts have either the keys type and url: + These dicts have either the key url (and optionally type): - type (FIXME: enum TBD): the origin type ('git', 'wget', ...) - url (bytes): the url the origin points to @@ -964,18 +967,17 @@ and not all('id' in origin for origin in origins): raise ValueError( 'Either all origins or none at all should have an "id".') - if any('type' in origin and 'url' in origin for origin in origins) \ - and not all('type' in origin and 'url' in origin - for origin in origins): + if any('url' in origin for origin in origins) \ + and not all('url' in origin for origin in origins): raise ValueError( - 'Either all origins or none at all should have a ' - '"type" and an "url".') + 'Either all origins or none at all should have ' + 'an "url" key.') results = [] for origin in origins: if 'id' in origin: origin_id = origin['id'] - elif 'type' in origin and 'url' in origin: + elif 'url' in origin: origin_id = self._origin_id(origin) else: raise ValueError( @@ -1512,8 +1514,9 @@ def _origin_id(self, origin): origin_id = None for stored_origin in self._origins: - if stored_origin['type'] == origin['type'] and \ - stored_origin['url'] == origin['url']: + if stored_origin['url'] == origin['url'] \ + and ('type' not in origin + or stored_origin['type'] == origin['type']): origin_id = stored_origin['id'] break return origin_id diff --git a/swh/storage/sql/10-swh-init.sql b/swh/storage/sql/10-swh-init.sql --- a/swh/storage/sql/10-swh-init.sql +++ b/swh/storage/sql/10-swh-init.sql @@ -2,5 +2,6 @@ create extension if not exists btree_gist; create extension if not exists pgcrypto; +create extension if not exists pg_trgm; create or replace language plpgsql; diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -14,7 +14,8 @@ create unique index concurrently origin_pkey on origin(id); alter table origin add primary key using index origin_pkey; -create index concurrently on origin(type, url); +create index concurrently on origin using gin (url gin_trgm_ops); +create index concurrently on origin using hash (url); -- skipped_content diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1346,10 +1346,13 @@ """Return origins, either all identified by their ids or all identified by tuples (type, url). + If the url is given and the type is omitted, one of the origins with + that url is returned. + Args: origin: a list of dictionaries representing the individual origins to find. - These dicts have either the keys type and url: + These dicts have either the key url (and optionally type): - type (FIXME: enum TBD): the origin type ('git', 'wget', ...) - url (bytes): the url the origin points to @@ -1388,16 +1391,16 @@ else: raise ValueError( 'Either all origins or none at all should have an "id".') - elif any(type_ and url for (type_, url) in origin_types_and_urls): + elif any(url for (type_, url) in origin_types_and_urls): # Lookup per type + URL - if all(type_ and url for (type_, url) in origin_types_and_urls): + if all(url for (type_, url) in origin_types_and_urls): results = db.origin_get_with(origin_types_and_urls, cur) else: raise ValueError( - 'Either all origins or none at all should have a ' - '"type" and an "url".') + 'Either all origins or none at all should have ' + 'an "url" key.') else: # unsupported lookup - raise ValueError('Origin must have either id or (type and url).') + raise ValueError('Origin must have either id or url.') results = [dict(zip(self.origin_keys, result)) for result in results] diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1290,6 +1290,37 @@ self.assertEqual(add1, add2) + def test_origin_get_without_type(self): + origin0 = self.storage.origin_get([self.origin])[0] + self.assertIsNone(origin0) + + origin3 = self.origin2.copy() + origin3['type'] += 'foo' + + origin1, origin2, origin3 = self.storage.origin_add( + [self.origin, self.origin2, origin3]) + + actual_origin = self.storage.origin_get([{ + 'url': self.origin['url'], + }])[0] + self.assertEqual(actual_origin['id'], origin1['id']) + + actual_origin_2_or_3 = self.storage.origin_get([{ + 'url': self.origin2['url'], + }])[0] + self.assertIn( + actual_origin_2_or_3['id'], + [origin2['id'], origin3['id']]) + + del actual_origin['id'] + del actual_origin_2_or_3['id'] + del origin3['id'] + + self.assertEqual(list(self.journal_writer.objects), + [('origin', self.origin), + ('origin', self.origin2), + ('origin', origin3)]) + def test_origin_get_legacy(self): self.assertIsNone(self.storage.origin_get(self.origin)) id = self.storage.origin_add_one(self.origin)