diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -14,6 +14,7 @@ """Proxy to the SWH DB, with wrappers around stored procedures """ + def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)', (('directory_entry_%s' % entry_type),)) @@ -217,16 +218,27 @@ """ cur = self._cursor(cur) - cur.execute("""SELECT %s - FROM swh_content_find(%%s, %%s, %%s, %%s) - LIMIT 1""" % ','.join(self.content_find_cols), - (sha1, sha1_git, sha256, blake2s256)) + checksum_dict = {'sha1': sha1, 'sha1_git': sha1_git, + 'sha256': sha256, 'blake2s256': blake2s256} + new_checksum_dict = dict() + query_list = [] - content = cur.fetchone() - if set(content) == {None}: - return None - else: - return content + # Adds only those keys which have value other than None + for algorithm in checksum_dict: + if checksum_dict[algorithm] is not None: + new_checksum_dict[algorithm] = checksum_dict[algorithm] + query_list.append(new_checksum_dict[algorithm]) + # Creates query for sql on python side + query = ' AND '.join(key + " = %s " + for key in new_checksum_dict) + + cur.execute("""SELECT %s + FROM content WHERE %s + """ + % (','.join(self.content_find_cols), query), + query_list) + content = cur.fetchall() + return content def directory_missing_from_list(self, directories, cur=None): cur = self._cursor(cur) @@ -829,7 +841,7 @@ query = '''SELECT %s FROM swh_origin_metadata_get_by_origin( %%s)''' % (','.join( - self.origin_metadata_get_cols)) + self.origin_metadata_get_cols)) cur.execute(query, (origin_id, )) @@ -837,7 +849,7 @@ query = '''SELECT %s FROM swh_origin_metadata_get_by_provider_type( %%s, %%s)''' % (','.join( - self.origin_metadata_get_cols)) + self.origin_metadata_get_cols)) cur.execute(query, (origin_id, provider_type)) @@ -862,8 +874,8 @@ where name=%%s and version=%%s and configuration=%%s''' % ( - ','.join(self.tool_cols)), - (name, version, configuration)) + ','.join(self.tool_cols)), + (name, version, configuration)) return cur.fetchone() @@ -879,7 +891,7 @@ RETURNING id""" cur.execute(insert, (provider_name, provider_type, provider_url, - jsonize(metadata))) + jsonize(metadata))) return cur.fetchone()[0] def metadata_provider_get(self, provider_id, cur=None): @@ -887,8 +899,8 @@ cur.execute('''select %s from metadata_provider where id=%%s ''' % ( - ','.join(self.metadata_provider_cols)), - (provider_id, )) + ','.join(self.metadata_provider_cols)), + (provider_id, )) return cur.fetchone() @@ -899,7 +911,7 @@ from metadata_provider where provider_name=%%s and provider_url=%%s''' % ( - ','.join(self.metadata_provider_cols)), - (provider_name, provider_url)) + ','.join(self.metadata_provider_cols)), + (provider_name, provider_url)) return cur.fetchone() diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -76,7 +76,8 @@ if key in self._contents: continue for algorithm in DEFAULT_ALGORITHMS: - if content[algorithm] in self._content_indexes[algorithm]: + if content[algorithm] in self._content_indexes[algorithm]\ + and (algorithm not in {'blake2s256', 'sha256'}): from . import HashCollision raise HashCollision(algorithm, content[algorithm], key) for algorithm in DEFAULT_ALGORITHMS: @@ -291,9 +292,7 @@ if not found: return keys = list(set.intersection(*found)) - - # FIXME: should really be a list of all the objects found - return copy.deepcopy(self._contents[keys[0]]) + return copy.deepcopy([self._contents[key] for key in keys]) def content_missing(self, contents, key_hash='sha1'): """List content missing from storage @@ -320,10 +319,9 @@ yield content[key_hash] break else: - # content_find cannot return None here, because we checked - # above that there is a content with matching hashes. - if self.content_find(content)['status'] == 'missing': - yield content[key_hash] + for result in self.content_find(content): + if result['status'] == 'missing': + yield content[key_hash] def content_missing_per_sha1(self, contents): """List content missing from storage based only on sha1. diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -511,13 +511,16 @@ raise ValueError('content keys must contain at least one of: ' 'sha1, sha1_git, sha256, blake2s256') - c = db.content_find(sha1=content.get('sha1'), - sha1_git=content.get('sha1_git'), - sha256=content.get('sha256'), - blake2s256=content.get('blake2s256'), - cur=cur) - if c: - return dict(zip(db.content_find_cols, c)) + contents = db.content_find(sha1=content.get('sha1'), + sha1_git=content.get('sha1_git'), + sha256=content.get('sha256'), + blake2s256=content.get('blake2s256'), + cur=cur) + if contents: + contents_list = [] + for cont in contents: + contents_list.append(dict(zip(db.content_find_cols, cont))) + return contents_list return None @db_transaction() diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -2408,7 +2408,7 @@ actually_present = self.storage.content_find({'sha1': cont['sha1']}) - self.assertEqual(actually_present, { + self.assertEqual(actually_present[0], { 'ctime': now, 'sha1': cont['sha1'], 'sha256': cont['sha256'], @@ -2423,45 +2423,50 @@ cont = self.cont self.storage.content_add([cont]) - actually_present = self.storage.content_find({'sha1': cont['sha1']}) - - actually_present.pop('ctime') - self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + actually_present = self.storage.content_find( + {'sha1': cont['sha1']} + ) + self.assertEqual(1, len(actually_present)) + actually_present[0].pop('ctime') + + self.assertEqual(actually_present[0], { + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 2. with something to find actually_present = self.storage.content_find( {'sha1_git': cont['sha1_git']}) - - actually_present.pop('ctime') - self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + self.assertEqual(1, len(actually_present)) + + actually_present[0].pop('ctime') + self.assertEqual(actually_present[0], { + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 3. with something to find actually_present = self.storage.content_find( {'sha256': cont['sha256']}) - - actually_present.pop('ctime') - self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + self.assertEqual(1, len(actually_present)) + + actually_present[0].pop('ctime') + self.assertEqual(actually_present[0], { + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) # 4. with something to find actually_present = self.storage.content_find({ @@ -2470,16 +2475,17 @@ 'sha256': cont['sha256'], 'blake2s256': cont['blake2s256'], }) - - actually_present.pop('ctime') - self.assertEqual(actually_present, { - 'sha1': cont['sha1'], - 'sha256': cont['sha256'], - 'sha1_git': cont['sha1_git'], - 'blake2s256': cont['blake2s256'], - 'length': cont['length'], - 'status': 'visible' - }) + self.assertEqual(1, len(actually_present)) + + actually_present[0].pop('ctime') + self.assertEqual(actually_present[0], { + 'sha1': cont['sha1'], + 'sha256': cont['sha256'], + 'sha1_git': cont['sha1_git'], + 'blake2s256': cont['blake2s256'], + 'length': cont['length'], + 'status': 'visible' + }) def test_content_find_with_non_present_content(self): # 1. with something that does not exist @@ -2502,6 +2508,120 @@ self.assertIsNone(actually_present) + def test_content_find_with_duplicate_input(self): + cont1 = self.cont + duplicate_cont = cont1.copy() + + # Create fake data with colliding sha256 and blake2s256 + sha1_array = bytearray(duplicate_cont['sha1']) + sha1_array[0] += 1 + duplicate_cont['sha1'] = bytes(sha1_array) + sha1git_array = bytearray(duplicate_cont['sha1_git']) + sha1git_array[0] += 1 + duplicate_cont['sha1_git'] = bytes(sha1git_array) + # Inject the data + self.storage.content_add([cont1, duplicate_cont]) + finder = {'blake2s256': duplicate_cont['blake2s256'], + 'sha256': duplicate_cont['sha256']} + actual_result = list(self.storage.content_find(finder)) + + cont1.pop('data') + duplicate_cont.pop('data') + actual_result[0].pop('ctime') + actual_result[1].pop('ctime') + + expected_result = [ + cont1, duplicate_cont + ] + self.assertEqual(len(expected_result), len(actual_result)) + self.assertCountEqual(expected_result, actual_result) + + def test_content_find_with_duplicate_sha256(self): + cont1 = self.cont + duplicate_cont = cont1.copy() + + # Create fake data with colliding sha256 and blake2s256 + sha1_array = bytearray(duplicate_cont['sha1']) + sha1_array[0] += 1 + duplicate_cont['sha1'] = bytes(sha1_array) + sha1git_array = bytearray(duplicate_cont['sha1_git']) + sha1git_array[0] += 1 + duplicate_cont['sha1_git'] = bytes(sha1git_array) + blake2s256_array = bytearray(duplicate_cont['blake2s256']) + blake2s256_array[0] += 1 + duplicate_cont['blake2s256'] = bytes(blake2s256_array) + self.storage.content_add([cont1, duplicate_cont]) + finder = { + 'sha256': duplicate_cont['sha256'] + } + actual_result = list(self.storage.content_find(finder)) + + cont1.pop('data') + duplicate_cont.pop('data') + actual_result[0].pop('ctime') + actual_result[1].pop('ctime') + expected_result = [ + cont1, duplicate_cont + ] + self.assertCountEqual(expected_result, actual_result) + # Find with both sha256 and blake2s256 + finder = { + 'sha256': duplicate_cont['sha256'], + 'blake2s256': duplicate_cont['blake2s256'] + } + actual_result = list(self.storage.content_find(finder)) + + actual_result[0].pop('ctime') + + expected_result = [ + duplicate_cont + ] + self.assertEqual(expected_result, actual_result) + + def test_content_find_with_duplicate_blake2s256(self): + cont1 = self.cont + duplicate_cont = cont1.copy() + + # Create fake data with colliding sha256 and blake2s256 + sha1_array = bytearray(duplicate_cont['sha1']) + sha1_array[0] += 1 + duplicate_cont['sha1'] = bytes(sha1_array) + sha1git_array = bytearray(duplicate_cont['sha1_git']) + sha1git_array[0] += 1 + duplicate_cont['sha1_git'] = bytes(sha1git_array) + sha256_array = bytearray(duplicate_cont['sha256']) + sha256_array[0] += 1 + duplicate_cont['sha256'] = bytes(sha256_array) + self.storage.content_add([cont1, duplicate_cont]) + finder = { + 'blake2s256': duplicate_cont['blake2s256'] + } + actual_result = list(self.storage.content_find(finder)) + + cont1.pop('data') + duplicate_cont.pop('data') + actual_result[0].pop('ctime') + actual_result[1].pop('ctime') + expected_result = [ + cont1, duplicate_cont + ] + self.assertEqual(len(expected_result), len(actual_result)) + self.assertCountEqual(expected_result, actual_result) + # Find with both sha256 and blake2s256 + finder = { + 'sha256': duplicate_cont['sha256'], + 'blake2s256': duplicate_cont['blake2s256'] + } + actual_result = list(self.storage.content_find(finder)) + + actual_result[0].pop('ctime') + + expected_result = [ + duplicate_cont + ] + self.assertEqual(len(expected_result), len(actual_result)) + self.assertEqual(expected_result, actual_result) + def test_content_find_bad_input(self): # 1. with bad input with self.assertRaises(ValueError):