Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123134
D2346.id8086.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D2346.id8086.diff
View Options
diff --git a/sql/upgrades/144.sql b/sql/upgrades/144.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/144.sql
@@ -0,0 +1,10 @@
+-- SWH DB schema upgrade
+-- from_version: 143
+-- to_version: 144
+-- description: add index on sha1(origin.url)
+
+insert into dbversion(version, release, description)
+ values(144, now(), 'Work In Progress');
+
+create index concurrently on origin using btree(digest(url, 'sha1'));
+
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -139,6 +139,9 @@
DeprecationWarning)
return self.post('origin/get', {'origins': origins})
+ def origin_get_by_sha1(self, sha1s):
+ return self.post('origin/get_sha1', {'sha1s': sha1s})
+
def origin_search(self, url_pattern, offset=0, limit=50, regexp=False,
with_visit=False):
return self.post('origin/search', {'url_pattern': url_pattern,
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -351,6 +351,13 @@
return encode_data(get_storage().origin_get(**decode_request(request)))
+@app.route('/origin/get_sha1', methods=['POST'])
+@timed
+def origin_get_by_sha1():
+ return encode_data(get_storage().origin_get_by_sha1(
+ **decode_request(request)))
+
+
@app.route('/origin/get_range', methods=['POST'])
@timed
def origin_get_range():
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -7,6 +7,7 @@
from swh.core.utils import decode_with_escape, encode_with_unescape
from swh.model import identifiers
+from swh.model.hashutil import MultiHash
DEFAULT_AUTHOR = {
@@ -310,3 +311,10 @@
ret['object_id'] = db_release['object_id']
return ret
+
+
+def origin_url_to_sha1(origin_url):
+ """Convert an origin URL to a sha1. Encodes URL to utf-8."""
+ return MultiHash.from_data(
+ origin_url.encode('utf-8'), {'sha1'}
+ ).digest()['sha1']
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -637,6 +637,17 @@
yield from execute_values_generator(
cur, query, ((url,) for url in origins))
+ def origin_get_by_sha1(self, sha1s, cur=None):
+ """Retrieve origins from the sha1s of their urls, if found."""
+ cur = self._cursor(cur)
+
+ query = """SELECT %s FROM (VALUES %%s) as t(sha1)
+ LEFT JOIN origin ON t.sha1 = digest(origin.url, 'sha1')
+ """ % ','.join('origin.' + col for col in self.origin_cols)
+
+ yield from execute_values_generator(
+ cur, query, ((sha1,) for sha1 in sha1s))
+
def origin_id_get_by_url(self, origins, cur=None):
"""Retrieve origin `(type, url)` from urls if found."""
cur = self._cursor(cur)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -22,6 +22,7 @@
from swh.objstorage.exc import ObjNotFoundError
from .storage import get_journal_writer
+from .converters import origin_url_to_sha1
# Max block size of contents to return
BULK_BLOCK_CONTENT_LEN_MAX = 10000
@@ -52,6 +53,7 @@
self._snapshots = {}
self._origins = {}
self._origins_by_id = []
+ self._origins_by_sha1 = {}
self._origin_visits = {}
self._persons = []
self._origin_metadata = defaultdict(list)
@@ -1071,6 +1073,22 @@
else:
return results
+ def origin_get_by_sha1(self, sha1s):
+ """Return origins, identified by the sha1 of their URLs.
+
+ Args:
+ sha1s (list[bytes]): a list of sha1s
+
+ Returns:
+ list of dicts containing origin information as returned
+ by :meth:`swh.storage.in_memory.Storage.origin_get`, or None if an
+ origin matching the sha1 is not found.
+ """
+ return [
+ self._convert_origin(self._origins_by_sha1.get(sha1))
+ for sha1 in sha1s
+ ]
+
def origin_get_range(self, origin_from=1, origin_count=100):
"""Retrieve ``origin_count`` origins whose ids are greater
or equal than ``origin_from``.
@@ -1196,6 +1214,7 @@
assert len(self._origins_by_id) == origin_id
self._origins[origin.url] = origin
+ self._origins_by_sha1[origin_url_to_sha1(origin.url)] = origin
self._origin_visits[origin.url] = []
self._objects[origin.url].append(('origin', origin.url))
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(143, now(), 'Work In Progress');
+ values(144, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -16,6 +16,7 @@
create index concurrently on origin using gin (url gin_trgm_ops);
create index concurrently on origin using hash (url);
+create index concurrently on origin using btree(digest(url, 'sha1'));
-- skipped_content
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1488,6 +1488,25 @@
else:
return [None if res['url'] is None else res for res in results]
+ @db_transaction_generator(statement_timeout=500)
+ def origin_get_by_sha1(self, sha1s, db=None, cur=None):
+ """Return origins, identified by the sha1 of their URLs.
+
+ Args:
+ sha1s (list[bytes]): a list of sha1s
+
+ Yields:
+ dicts containing origin information as returned
+ by :meth:`swh.storage.storage.Storage.origin_get`, or None if an
+ origin matching the sha1 is not found.
+
+ """
+ for line in db.origin_get_by_sha1(sha1s, cur):
+ if line[0] is not None:
+ yield dict(zip(db.origin_cols, line))
+ else:
+ yield None
+
@db_transaction_generator()
def origin_get_range(self, origin_from=1, origin_count=100,
db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -23,6 +23,7 @@
from swh.model.hashutil import hash_to_bytes
from swh.model.hypothesis_strategies import objects
from swh.storage import HashCollision
+from swh.storage.converters import origin_url_to_sha1 as sha1
from .storage_data import data
@@ -935,6 +936,24 @@
assert len(actual_origin0) == 1
assert actual_origin0[0]['url'] == data.origin['url']
+ def test_origin_get_by_sha1(self, swh_storage):
+ assert swh_storage.origin_get(data.origin) is None
+ swh_storage.origin_add_one(data.origin)
+
+ origins = list(swh_storage.origin_get_by_sha1([
+ sha1(data.origin['url'])
+ ]))
+ assert len(origins) == 1
+ assert origins[0]['url'] == data.origin['url']
+
+ def test_origin_get_by_sha1_not_found(self, swh_storage):
+ assert swh_storage.origin_get(data.origin) is None
+ origins = list(swh_storage.origin_get_by_sha1([
+ sha1(data.origin['url'])
+ ]))
+ assert len(origins) == 1
+ assert origins[0] is None
+
def test_origin_search_single_result(self, swh_storage):
found_origins = list(swh_storage.origin_search(data.origin['url']))
assert len(found_origins) == 0
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 11:11 PM (2 d, 14 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226244
Attached To
D2346: Implement origin lookup by sha1
Event Timeline
Log In to Comment