Page MenuHomeSoftware Heritage

D1288.diff
No OneTemporary

D1288.diff

diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -14,6 +14,7 @@
"""Proxy to the SWH DB, with wrappers around stored procedures
"""
+
def mktemp_dir_entry(self, entry_type, cur=None):
self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)',
(('directory_entry_%s' % entry_type),))
@@ -217,16 +218,23 @@
"""
cur = self._cursor(cur)
+ checksum_dict = {'sha1': sha1, 'sha1_git': sha1_git,
+ 'sha256': sha256, 'blake2s256': blake2s256}
+ where_parts = []
+ args = []
+ # Adds only those keys which have value other than None
+ for algorithm in checksum_dict:
+ if checksum_dict[algorithm] is not None:
+ args.append(checksum_dict[algorithm])
+ where_parts.append(algorithm + '= %s')
+ query = ' AND '.join(where_parts)
cur.execute("""SELECT %s
- FROM swh_content_find(%%s, %%s, %%s, %%s)
- LIMIT 1""" % ','.join(self.content_find_cols),
- (sha1, sha1_git, sha256, blake2s256))
-
- content = cur.fetchone()
- if set(content) == {None}:
- return None
- else:
- return content
+ FROM content WHERE %s
+ """
+ % (','.join(self.content_find_cols), query),
+ args)
+ content = cur.fetchall()
+ return content
def directory_missing_from_list(self, directories, cur=None):
cur = self._cursor(cur)
@@ -829,7 +837,7 @@
query = '''SELECT %s
FROM swh_origin_metadata_get_by_origin(
%%s)''' % (','.join(
- self.origin_metadata_get_cols))
+ self.origin_metadata_get_cols))
cur.execute(query, (origin_id, ))
@@ -837,7 +845,7 @@
query = '''SELECT %s
FROM swh_origin_metadata_get_by_provider_type(
%%s, %%s)''' % (','.join(
- self.origin_metadata_get_cols))
+ self.origin_metadata_get_cols))
cur.execute(query, (origin_id, provider_type))
@@ -862,8 +870,8 @@
where name=%%s and
version=%%s and
configuration=%%s''' % (
- ','.join(self.tool_cols)),
- (name, version, configuration))
+ ','.join(self.tool_cols)),
+ (name, version, configuration))
return cur.fetchone()
@@ -879,7 +887,7 @@
RETURNING id"""
cur.execute(insert, (provider_name, provider_type, provider_url,
- jsonize(metadata)))
+ jsonize(metadata)))
return cur.fetchone()[0]
def metadata_provider_get(self, provider_id, cur=None):
@@ -887,8 +895,8 @@
cur.execute('''select %s
from metadata_provider
where id=%%s ''' % (
- ','.join(self.metadata_provider_cols)),
- (provider_id, ))
+ ','.join(self.metadata_provider_cols)),
+ (provider_id, ))
return cur.fetchone()
@@ -899,7 +907,7 @@
from metadata_provider
where provider_name=%%s and
provider_url=%%s''' % (
- ','.join(self.metadata_provider_cols)),
- (provider_name, provider_url))
+ ','.join(self.metadata_provider_cols)),
+ (provider_name, provider_url))
return cur.fetchone()
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -76,7 +76,8 @@
if key in self._contents:
continue
for algorithm in DEFAULT_ALGORITHMS:
- if content[algorithm] in self._content_indexes[algorithm]:
+ if content[algorithm] in self._content_indexes[algorithm]\
+ and (algorithm not in {'blake2s256', 'sha256'}):
from . import HashCollision
raise HashCollision(algorithm, content[algorithm], key)
for algorithm in DEFAULT_ALGORITHMS:
@@ -291,9 +292,7 @@
if not found:
return
keys = list(set.intersection(*found))
-
- # FIXME: should really be a list of all the objects found
- return copy.deepcopy(self._contents[keys[0]])
+ return copy.deepcopy([self._contents[key] for key in keys])
def content_missing(self, contents, key_hash='sha1'):
"""List content missing from storage
@@ -320,10 +319,9 @@
yield content[key_hash]
break
else:
- # content_find cannot return None here, because we checked
- # above that there is a content with matching hashes.
- if self.content_find(content)['status'] == 'missing':
- yield content[key_hash]
+ for result in self.content_find(content):
+ if result['status'] == 'missing':
+ yield content[key_hash]
def content_missing_per_sha1(self, contents):
"""List content missing from storage based only on sha1.
@@ -404,8 +402,10 @@
ret = dict.fromkeys(keys)
ret.update(dentry)
if ret['type'] == 'file':
+ # TODO: Make it able to handle more than one content
content = self.content_find({'sha1_git': ret['target']})
if content:
+ content = content[0]
for key in keys:
ret[key] = content[key]
return ret
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -511,13 +511,16 @@
raise ValueError('content keys must contain at least one of: '
'sha1, sha1_git, sha256, blake2s256')
- c = db.content_find(sha1=content.get('sha1'),
- sha1_git=content.get('sha1_git'),
- sha256=content.get('sha256'),
- blake2s256=content.get('blake2s256'),
- cur=cur)
- if c:
- return dict(zip(db.content_find_cols, c))
+ contents = db.content_find(sha1=content.get('sha1'),
+ sha1_git=content.get('sha1_git'),
+ sha256=content.get('sha256'),
+ blake2s256=content.get('blake2s256'),
+ cur=cur)
+ if contents:
+ contents_list = []
+ for cont in contents:
+ contents_list.append(dict(zip(db.content_find_cols, cont)))
+ return contents_list
return None
@db_transaction()
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2408,7 +2408,7 @@
actually_present = self.storage.content_find({'sha1': cont['sha1']})
- self.assertEqual(actually_present, {
+ self.assertEqual(actually_present[0], {
'ctime': now,
'sha1': cont['sha1'],
'sha256': cont['sha256'],
@@ -2423,45 +2423,50 @@
cont = self.cont
self.storage.content_add([cont])
- actually_present = self.storage.content_find({'sha1': cont['sha1']})
-
- actually_present.pop('ctime')
- self.assertEqual(actually_present, {
- 'sha1': cont['sha1'],
- 'sha256': cont['sha256'],
- 'sha1_git': cont['sha1_git'],
- 'blake2s256': cont['blake2s256'],
- 'length': cont['length'],
- 'status': 'visible'
- })
+ actually_present = self.storage.content_find(
+ {'sha1': cont['sha1']}
+ )
+ self.assertEqual(1, len(actually_present))
+ actually_present[0].pop('ctime')
+
+ self.assertEqual(actually_present[0], {
+ 'sha1': cont['sha1'],
+ 'sha256': cont['sha256'],
+ 'sha1_git': cont['sha1_git'],
+ 'blake2s256': cont['blake2s256'],
+ 'length': cont['length'],
+ 'status': 'visible'
+ })
# 2. with something to find
actually_present = self.storage.content_find(
{'sha1_git': cont['sha1_git']})
-
- actually_present.pop('ctime')
- self.assertEqual(actually_present, {
- 'sha1': cont['sha1'],
- 'sha256': cont['sha256'],
- 'sha1_git': cont['sha1_git'],
- 'blake2s256': cont['blake2s256'],
- 'length': cont['length'],
- 'status': 'visible'
- })
+ self.assertEqual(1, len(actually_present))
+
+ actually_present[0].pop('ctime')
+ self.assertEqual(actually_present[0], {
+ 'sha1': cont['sha1'],
+ 'sha256': cont['sha256'],
+ 'sha1_git': cont['sha1_git'],
+ 'blake2s256': cont['blake2s256'],
+ 'length': cont['length'],
+ 'status': 'visible'
+ })
# 3. with something to find
actually_present = self.storage.content_find(
{'sha256': cont['sha256']})
-
- actually_present.pop('ctime')
- self.assertEqual(actually_present, {
- 'sha1': cont['sha1'],
- 'sha256': cont['sha256'],
- 'sha1_git': cont['sha1_git'],
- 'blake2s256': cont['blake2s256'],
- 'length': cont['length'],
- 'status': 'visible'
- })
+ self.assertEqual(1, len(actually_present))
+
+ actually_present[0].pop('ctime')
+ self.assertEqual(actually_present[0], {
+ 'sha1': cont['sha1'],
+ 'sha256': cont['sha256'],
+ 'sha1_git': cont['sha1_git'],
+ 'blake2s256': cont['blake2s256'],
+ 'length': cont['length'],
+ 'status': 'visible'
+ })
# 4. with something to find
actually_present = self.storage.content_find({
@@ -2470,16 +2475,17 @@
'sha256': cont['sha256'],
'blake2s256': cont['blake2s256'],
})
-
- actually_present.pop('ctime')
- self.assertEqual(actually_present, {
- 'sha1': cont['sha1'],
- 'sha256': cont['sha256'],
- 'sha1_git': cont['sha1_git'],
- 'blake2s256': cont['blake2s256'],
- 'length': cont['length'],
- 'status': 'visible'
- })
+ self.assertEqual(1, len(actually_present))
+
+ actually_present[0].pop('ctime')
+ self.assertEqual(actually_present[0], {
+ 'sha1': cont['sha1'],
+ 'sha256': cont['sha256'],
+ 'sha1_git': cont['sha1_git'],
+ 'blake2s256': cont['blake2s256'],
+ 'length': cont['length'],
+ 'status': 'visible'
+ })
def test_content_find_with_non_present_content(self):
# 1. with something that does not exist
@@ -2502,6 +2508,117 @@
self.assertIsNone(actually_present)
+ def test_content_find_with_duplicate_input(self):
+ cont1 = self.cont
+ duplicate_cont = cont1.copy()
+
+ # Create fake data with colliding sha256 and blake2s256
+ sha1_array = bytearray(duplicate_cont['sha1'])
+ sha1_array[0] += 1
+ duplicate_cont['sha1'] = bytes(sha1_array)
+ sha1git_array = bytearray(duplicate_cont['sha1_git'])
+ sha1git_array[0] += 1
+ duplicate_cont['sha1_git'] = bytes(sha1git_array)
+ # Inject the data
+ self.storage.content_add([cont1, duplicate_cont])
+ finder = {'blake2s256': duplicate_cont['blake2s256'],
+ 'sha256': duplicate_cont['sha256']}
+ actual_result = list(self.storage.content_find(finder))
+
+ cont1.pop('data')
+ duplicate_cont.pop('data')
+ actual_result[0].pop('ctime')
+ actual_result[1].pop('ctime')
+
+ expected_result = [
+ cont1, duplicate_cont
+ ]
+ self.assertCountEqual(expected_result, actual_result)
+
+ def test_content_find_with_duplicate_sha256(self):
+ cont1 = self.cont
+ duplicate_cont = cont1.copy()
+
+ # Create fake data with colliding sha256 and blake2s256
+ sha1_array = bytearray(duplicate_cont['sha1'])
+ sha1_array[0] += 1
+ duplicate_cont['sha1'] = bytes(sha1_array)
+ sha1git_array = bytearray(duplicate_cont['sha1_git'])
+ sha1git_array[0] += 1
+ duplicate_cont['sha1_git'] = bytes(sha1git_array)
+ blake2s256_array = bytearray(duplicate_cont['blake2s256'])
+ blake2s256_array[0] += 1
+ duplicate_cont['blake2s256'] = bytes(blake2s256_array)
+ self.storage.content_add([cont1, duplicate_cont])
+ finder = {
+ 'sha256': duplicate_cont['sha256']
+ }
+ actual_result = list(self.storage.content_find(finder))
+
+ cont1.pop('data')
+ duplicate_cont.pop('data')
+ actual_result[0].pop('ctime')
+ actual_result[1].pop('ctime')
+ expected_result = [
+ cont1, duplicate_cont
+ ]
+ self.assertCountEqual(expected_result, actual_result)
+ # Find with both sha256 and blake2s256
+ finder = {
+ 'sha256': duplicate_cont['sha256'],
+ 'blake2s256': duplicate_cont['blake2s256']
+ }
+ actual_result = list(self.storage.content_find(finder))
+
+ actual_result[0].pop('ctime')
+
+ expected_result = [
+ duplicate_cont
+ ]
+ self.assertCountEqual(expected_result, actual_result)
+
+ def test_content_find_with_duplicate_blake2s256(self):
+ cont1 = self.cont
+ duplicate_cont = cont1.copy()
+
+ # Create fake data with colliding sha256 and blake2s256
+ sha1_array = bytearray(duplicate_cont['sha1'])
+ sha1_array[0] += 1
+ duplicate_cont['sha1'] = bytes(sha1_array)
+ sha1git_array = bytearray(duplicate_cont['sha1_git'])
+ sha1git_array[0] += 1
+ duplicate_cont['sha1_git'] = bytes(sha1git_array)
+ sha256_array = bytearray(duplicate_cont['sha256'])
+ sha256_array[0] += 1
+ duplicate_cont['sha256'] = bytes(sha256_array)
+ self.storage.content_add([cont1, duplicate_cont])
+ finder = {
+ 'blake2s256': duplicate_cont['blake2s256']
+ }
+ actual_result = list(self.storage.content_find(finder))
+
+ cont1.pop('data')
+ duplicate_cont.pop('data')
+ actual_result[0].pop('ctime')
+ actual_result[1].pop('ctime')
+ expected_result = [
+ cont1, duplicate_cont
+ ]
+ self.assertCountEqual(expected_result, actual_result)
+ # Find with both sha256 and blake2s256
+ finder = {
+ 'sha256': duplicate_cont['sha256'],
+ 'blake2s256': duplicate_cont['blake2s256']
+ }
+ actual_result = list(self.storage.content_find(finder))
+
+ actual_result[0].pop('ctime')
+
+ expected_result = [
+ duplicate_cont
+ ]
+ self.assertCountEqual(expected_result, actual_result)
+
def test_content_find_bad_input(self):
# 1. with bad input
with self.assertRaises(ValueError):

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 9:49 PM (2 d, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218529

Event Timeline