Page MenuHomeSoftware Heritage

D1556.id5118.diff
No OneTemporary

D1556.id5118.diff

diff --git a/sql/upgrades/135.sql b/sql/upgrades/135.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/135.sql
@@ -0,0 +1,13 @@
+-- SWH DB schema upgrade
+-- from_version: 134
+-- to_version: 135
+-- description: Add an index on origin.url, drop the index on (origin.type, origin.url)
+
+insert into dbversion(version, release, description)
+ values(135, now(), 'Work In Progress');
+
+create extension if not exists pg_trgm;
+
+drop index origin_type_url_idx;
+create index concurrently on origin using gin (url gin_trgm_ops);
+create index concurrently on origin using hash (url);
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -650,7 +650,8 @@
query = """SELECT %s FROM (VALUES %%s) as t(type, url)
LEFT JOIN origin
- ON (t.type=origin.type AND t.url=origin.url)
+ ON ((t.type IS NULL OR t.type=origin.type)
+ AND t.url=origin.url)
""" % ','.join('origin.' + col for col in self.origin_cols)
yield from execute_values_generator(
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -929,10 +929,13 @@
"""Return origins, either all identified by their ids or all
identified by tuples (type, url).
+ If the url is given and the type is omitted, one of the origins with
+ that url is returned.
+
Args:
origin: a list of dictionaries representing the individual
origins to find.
- These dicts have either the keys type and url:
+ These dicts have either the key url (and optionally type):
- type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
- url (bytes): the url the origin points to
@@ -964,18 +967,17 @@
and not all('id' in origin for origin in origins):
raise ValueError(
'Either all origins or none at all should have an "id".')
- if any('type' in origin and 'url' in origin for origin in origins) \
- and not all('type' in origin and 'url' in origin
- for origin in origins):
+ if any('url' in origin for origin in origins) \
+ and not all('url' in origin for origin in origins):
raise ValueError(
- 'Either all origins or none at all should have a '
- '"type" and an "url".')
+ 'Either all origins or none at all should have '
+ 'an "url" key.')
results = []
for origin in origins:
if 'id' in origin:
origin_id = origin['id']
- elif 'type' in origin and 'url' in origin:
+ elif 'url' in origin:
origin_id = self._origin_id(origin)
else:
raise ValueError(
@@ -1512,8 +1514,9 @@
def _origin_id(self, origin):
origin_id = None
for stored_origin in self._origins:
- if stored_origin['type'] == origin['type'] and \
- stored_origin['url'] == origin['url']:
+ if stored_origin['url'] == origin['url'] \
+ and ('type' not in origin
+ or stored_origin['type'] == origin['type']):
origin_id = stored_origin['id']
break
return origin_id
diff --git a/swh/storage/sql/10-swh-init.sql b/swh/storage/sql/10-swh-init.sql
--- a/swh/storage/sql/10-swh-init.sql
+++ b/swh/storage/sql/10-swh-init.sql
@@ -2,5 +2,6 @@
create extension if not exists btree_gist;
create extension if not exists pgcrypto;
+create extension if not exists pg_trgm;
create or replace language plpgsql;
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -14,7 +14,8 @@
create unique index concurrently origin_pkey on origin(id);
alter table origin add primary key using index origin_pkey;
-create index concurrently on origin(type, url);
+create index concurrently on origin using gin (url gin_trgm_ops);
+create index concurrently on origin using hash (url);
-- skipped_content
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1346,10 +1346,13 @@
"""Return origins, either all identified by their ids or all
identified by tuples (type, url).
+ If the url is given and the type is omitted, one of the origins with
+ that url is returned.
+
Args:
origin: a list of dictionaries representing the individual
origins to find.
- These dicts have either the keys type and url:
+ These dicts have either the key url (and optionally type):
- type (FIXME: enum TBD): the origin type ('git', 'wget', ...)
- url (bytes): the url the origin points to
@@ -1388,16 +1391,16 @@
else:
raise ValueError(
'Either all origins or none at all should have an "id".')
- elif any(type_ and url for (type_, url) in origin_types_and_urls):
+ elif any(url for (type_, url) in origin_types_and_urls):
# Lookup per type + URL
- if all(type_ and url for (type_, url) in origin_types_and_urls):
+ if all(url for (type_, url) in origin_types_and_urls):
results = db.origin_get_with(origin_types_and_urls, cur)
else:
raise ValueError(
- 'Either all origins or none at all should have a '
- '"type" and an "url".')
+ 'Either all origins or none at all should have '
+ 'an "url" key.')
else: # unsupported lookup
- raise ValueError('Origin must have either id or (type and url).')
+ raise ValueError('Origin must have either id or url.')
results = [dict(zip(self.origin_keys, result))
for result in results]
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1290,6 +1290,37 @@
self.assertEqual(add1, add2)
+ def test_origin_get_without_type(self):
+ origin0 = self.storage.origin_get([self.origin])[0]
+ self.assertIsNone(origin0)
+
+ origin3 = self.origin2.copy()
+ origin3['type'] += 'foo'
+
+ origin1, origin2, origin3 = self.storage.origin_add(
+ [self.origin, self.origin2, origin3])
+
+ actual_origin = self.storage.origin_get([{
+ 'url': self.origin['url'],
+ }])[0]
+ self.assertEqual(actual_origin['id'], origin1['id'])
+
+ actual_origin_2_or_3 = self.storage.origin_get([{
+ 'url': self.origin2['url'],
+ }])[0]
+ self.assertIn(
+ actual_origin_2_or_3['id'],
+ [origin2['id'], origin3['id']])
+
+ del actual_origin['id']
+ del actual_origin_2_or_3['id']
+ del origin3['id']
+
+ self.assertEqual(list(self.journal_writer.objects),
+ [('origin', self.origin),
+ ('origin', self.origin2),
+ ('origin', origin3)])
+
def test_origin_get_legacy(self):
self.assertIsNone(self.storage.origin_get(self.origin))
id = self.storage.origin_add_one(self.origin)

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 6:35 PM (11 w, 14 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222770

Event Timeline