diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -14,6 +14,7 @@ """Proxy to the SWH DB, with wrappers around stored procedures """ + def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)', (('directory_entry_%s' % entry_type),)) @@ -204,13 +205,30 @@ """ cur = self._cursor(cur) - cur.execute("""SELECT %s - FROM swh_content_find(%%s, %%s, %%s, %%s) - LIMIT 1""" % ','.join(self.content_find_cols), - (sha1, sha1_git, sha256, blake2s256)) + checksum_str = {'sha1': sha1, 'sha1_git': sha1_git, + 'sha256': sha256, 'blake2s256': blake2s256} + new_checksum_str = dict() - content = cur.fetchone() - if set(content) == {None}: + # Adds only those keys which have value other than None + for key in checksum_str: + if checksum_str[key] is not None: + new_checksum_str[key] = checksum_str[key] + # Creates query for sql on python side + query = ' AND '.join(str(str(key) + " "+"="+" %s ") + for key in new_checksum_str) + query_list = [] + # Creates list which has hashes stored in order + for key in new_checksum_str: + query_list.append(new_checksum_str[key]) + + cur.execute("""SELECT %s + FROM content WHERE %s + LIMIT ALL""" + % (','.join(self.content_find_cols), query), + query_list) + content = cur.fetchall() + # Content might return empty list hence this + if not content: return None else: return content @@ -789,7 +807,7 @@ query = '''SELECT %s FROM swh_origin_metadata_get_by_origin( %%s)''' % (','.join( - self.origin_metadata_get_cols)) + self.origin_metadata_get_cols)) cur.execute(query, (origin_id, )) @@ -797,7 +815,7 @@ query = '''SELECT %s FROM swh_origin_metadata_get_by_provider_type( %%s, %%s)''' % (','.join( - self.origin_metadata_get_cols)) + self.origin_metadata_get_cols)) cur.execute(query, (origin_id, provider_type)) @@ -822,8 +840,8 @@ where name=%%s and version=%%s and configuration=%%s''' % ( - ','.join(self.tool_cols)), - (name, version, configuration)) + ','.join(self.tool_cols)), + (name, version, configuration)) return cur.fetchone() @@ -839,7 +857,7 @@ RETURNING id""" cur.execute(insert, (provider_name, provider_type, provider_url, - jsonize(metadata))) + jsonize(metadata))) return cur.fetchone()[0] def metadata_provider_get(self, provider_id, cur=None): @@ -847,8 +865,8 @@ cur.execute('''select %s from metadata_provider where id=%%s ''' % ( - ','.join(self.metadata_provider_cols)), - (provider_id, )) + ','.join(self.metadata_provider_cols)), + (provider_id, )) return cur.fetchone() @@ -859,7 +877,7 @@ from metadata_provider where provider_name=%%s and provider_url=%%s''' % ( - ','.join(self.metadata_provider_cols)), - (provider_name, provider_url)) + ','.join(self.metadata_provider_cols)), + (provider_name, provider_url)) return cur.fetchone() diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -77,7 +77,8 @@ if key in self._contents: continue for algorithm in DEFAULT_ALGORITHMS: - if content[algorithm] in self._content_indexes[algorithm]: + if content[algorithm] in self._content_indexes[algorithm] \ + and (algorithm not in {'blake2s256', 'sha256'}): from . import HashCollision raise HashCollision(algorithm, content[algorithm], key) for algorithm in DEFAULT_ALGORITHMS: @@ -213,9 +214,7 @@ if not found: return keys = list(set.intersection(*found)) - - # FIXME: should really be a list of all the objects found - return copy.deepcopy(self._contents[keys[0]]) + return copy.deepcopy([self._contents[key] for key in keys]) def content_missing(self, contents, key_hash='sha1'): """List content missing from storage @@ -242,10 +241,9 @@ yield content[key_hash] break else: - # content_find cannot return None here, because we checked - # above that there is a content with matching hashes. - if self.content_find(content)['status'] == 'missing': - yield content[key_hash] + for result in self.content_find(content): + if result['status'] == 'missing': + yield content[key_hash] def content_missing_per_sha1(self, contents): """List content missing from storage based only on sha1. diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -167,7 +167,7 @@ 'content_pkey': 'sha1', 'content_sha1_git_idx': 'sha1_git', 'content_sha256_idx': 'sha256', - } + } colliding_hash_name = constaint_to_hash_name \ .get(e.diag.constraint_name) raise HashCollision(colliding_hash_name) @@ -404,13 +404,16 @@ raise ValueError('content keys must contain at least one of: ' 'sha1, sha1_git, sha256, blake2s256') - c = db.content_find(sha1=content.get('sha1'), - sha1_git=content.get('sha1_git'), - sha256=content.get('sha256'), - blake2s256=content.get('blake2s256'), - cur=cur) - if c: - return dict(zip(db.content_find_cols, c)) + contents = db.content_find(sha1=content.get('sha1'), + sha1_git=content.get('sha1_git'), + sha256=content.get('sha256'), + blake2s256=content.get('blake2s256'), + cur=cur) + if contents: + contents_list = [] + for cont in contents: + contents_list.append(dict(zip(db.content_find_cols, cont))) + return contents_list return None def directory_add(self, directories): diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1608,45 +1608,51 @@ cont = self.cont self.storage.content_add([cont]) - actually_present = self.storage.content_find({'sha1': cont['sha1']}) - + actually_present = self.storage.content_find( + {'sha1': cont['sha1']} + )[0] if self.storage.content_find( + {'sha1': cont['sha1']} + ) else [] actually_present.pop('ctime') + self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 2. with something to find actually_present = self.storage.content_find( - {'sha1_git': cont['sha1_git']}) + {'sha1_git': cont['sha1_git']})[0] if self.storage.content_find( + {'sha1_git': cont['sha1_git']}) else [] actually_present.pop('ctime') self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 3. with something to find actually_present = self.storage.content_find( - {'sha256': cont['sha256']}) + {'sha256': cont['sha256']})[0] if self.storage.content_find( + {'sha256': cont['sha256']}) else [] actually_present.pop('ctime') self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 4. with something to find actually_present = self.storage.content_find({ @@ -1654,17 +1660,22 @@ 'sha1_git': cont['sha1_git'], 'sha256': cont['sha256'], 'blake2s256': cont['blake2s256'], - }) - - actually_present.pop('ctime') - self.assertEqual(actually_present, { + })[0] if self.storage.content_find({ 'sha1': cont['sha1'], - 'sha256': cont['sha256'], 'sha1_git': cont['sha1_git'], + 'sha256': cont['sha256'], 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + }) else [] + + actually_present.pop('ctime') + self.assertEqual(actually_present, { + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) def test_content_find_with_non_present_content(self): # 1. with something that does not exist @@ -1687,6 +1698,28 @@ self.assertIsNone(actually_present) + def test_content_find_with_duplicate_input(self): + cont1 = self.cont + duplicate_cont = cont1.copy() + + # Create fake data with colliding sha256 and blake2s256 + sha1_array = bytearray(duplicate_cont['sha1']) + sha1_array[0] += 1 + duplicate_cont['sha1'] = bytes(sha1_array) + sha1git_array = bytearray(duplicate_cont['sha1_git']) + sha1git_array[0] += 1 + duplicate_cont['sha1_git'] = bytes(sha1git_array) + # Inject the data + self.storage.content_add([cont1, duplicate_cont]) + finder = {'blake2s256': duplicate_cont['blake2s256'], + 'sha256': duplicate_cont['sha256']} + con = list(self.storage.content_find(finder)) + + # con might return more than two colliding hashes sometimes + for i in range(len(con)-1): + self.assertEqual(con[i]['blake2s256'], con[i+1]['blake2s256']) + self.assertEqual(con[i]['sha256'], con[i+1]['sha256']) + def test_content_find_bad_input(self): # 1. with bad input with self.assertRaises(ValueError):