diff --git a/sql/swh-func.sql b/sql/swh-func.sql --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -307,6 +307,35 @@ $$; +-- Update content entries from temporary table. +-- (columns are potential new columns added to the schema, this cannot be empty) +-- +create or replace function swh_content_update(columns_update text[]) + returns void + language plpgsql +as $$ +declare + query text; + tmp_array text[]; +begin + if array_length(columns_update, 1) = 0 then + raise exception 'Please, provide the list of column names to update.'; + end if; + + tmp_array := array(select format('%1$s=t.%1$s', unnest) from unnest(columns_update)); + + query = format('update content set %s + from tmp_content t where t.sha1 = content.sha1', + array_to_string(tmp_array, ', ')); + + execute query; + + return; +end +$$; + +comment on function swh_content_update(text[]) IS 'Update existing content''s columns'; + -- check which entries of tmp_directory are missing from directory -- -- operates in bulk: 0. swh_mktemp(directory), 1. COPY to tmp_directory, diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(100, now(), 'Work In Progress'); + values(101, now(), 'Work In Progress'); -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); diff --git a/sql/upgrades/101.sql b/sql/upgrades/101.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/101.sql @@ -0,0 +1,36 @@ +-- SWH DB schema upgrade +-- from_version: 100 +-- to_version: 101 +-- description: Open swh_content_update function + +insert into dbversion(version, release, description) + values(101, now(), 'Work In Progress'); + +-- Update content entries from temporary table. +-- (columns are potential new columns added to the schema, this cannot be empty) +-- +create or replace function swh_content_update(columns_update text[]) + returns void + language plpgsql +as $$ +declare + query text; + tmp_array text[]; +begin + if array_length(columns_update, 1) = 0 then + raise exception 'Please, provide the list of column names to update.'; + end if; + + tmp_array := array(select format('%1$s=t.%1$s', unnest) from unnest(columns_update)); + + query = format('update content set %s + from tmp_content t where t.sha1 = content.sha1', + array_to_string(tmp_array, ', ')); + + execute query; + + return; +end +$$; + +comment on function swh_content_update(text[]) IS 'Update existing content''s columns'; diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -20,6 +20,10 @@ def content_add(self, content): return self.post('content/add', {'content': content}) + def content_update(self, content, keys=[]): + return self.post('content/update', {'content': content, + 'keys': keys}) + def content_missing(self, content, key_hash='sha1'): return self.post('content/missing', {'content': content, 'key_hash': key_hash}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -82,6 +82,11 @@ return encode_data(g.storage.content_add(**decode_request(request))) +@app.route('/content/update', methods=['POST']) +def content_update(): + return encode_data(g.storage.content_update(**decode_request(request))) + + @app.route('/content/data', methods=['POST']) def content_get(): return encode_data(g.storage.content_get(**decode_request(request))) diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -263,6 +263,11 @@ self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', ['id'], cur) + def content_update_from_temp(self, keys_to_update, cur=None): + cur = self._cursor(cur) + cur.execute("""select swh_content_update(ARRAY[%s] :: text[])""" % + keys_to_update) + content_get_metadata_keys = ['sha1', 'sha1_git', 'sha256', 'length', 'status'] diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -135,6 +135,36 @@ # move metadata in place db.skipped_content_add_from_temp(cur) + @db_transaction + def content_update(self, content, keys=[], cur=None): + """Update content blobs to the storage. Does nothing for unknown + contents or skipped ones. + + Args: + content: iterable of dictionaries representing individual pieces of + content to update. Each dictionary has the following keys: + - data (bytes): the actual content + - length (int): content length (default: -1) + - one key for each checksum algorithm in + swh.core.hashutil.ALGORITHMS, mapped to the corresponding + checksum + - status (str): one of visible, hidden, absent + + keys ([str]): List of keys whose values needs an update ( + e.g. new hash column) + + """ + db = self.db + + # TODO: Add a check on input keys. How to properly implement + # this? We don't know yet the new columns. + + db.mktemp('content') + select_keys = list(set(db.content_get_metadata_keys).union(set(keys))) + db.copy_to(content, 'tmp_content', select_keys, cur) + db.content_update_from_temp(keys_to_update=keys, + cur=cur) + def content_get(self, content): """Retrieve in bulk contents and their data. diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -3040,3 +3040,60 @@ 'email': person1['email'], }, ]) + + +class AlteringSchemaTest(AbstractTestStorage, unittest.TestCase): + """This class is dedicated for the rare case where the schema needs to + be altered dynamically. + + Otherwise, the tests could be blocking when ran altogether. + + """ + @istest + def content_update(self): + cont = self.cont + + self.storage.content_add([cont]) + # alter the sha1_git for example + cont['sha1_git'] = hex_to_hash( + '3a60a5275d0333bf13468e8b3dcab90f4046e654') + + self.storage.content_update([cont], keys=['sha1_git']) + + self.cursor.execute('SELECT sha1, sha1_git, sha256, length, status' + ' FROM content WHERE sha1 = %s', + (cont['sha1'],)) + datum = self.cursor.fetchone() + self.assertEqual( + (datum[0].tobytes(), datum[1].tobytes(), datum[2].tobytes(), + datum[3], datum[4]), + (cont['sha1'], cont['sha1_git'], cont['sha256'], + cont['length'], 'visible')) + + @istest + def content_update_with_new_cols(self): + self.cursor.execute("""alter table content + add column test text default null, + add column test2 text default null""") + + cont = self.cont2 + self.storage.content_add([cont]) + cont['test'] = 'value-1' + cont['test2'] = 'value-2' + + self.storage.content_update([cont], keys=['test', 'test2']) + + self.cursor.execute( + 'SELECT sha1, sha1_git, sha256, length, status, test, test2' + ' FROM content WHERE sha1 = %s', + (cont['sha1'],)) + + datum = self.cursor.fetchone() + self.assertEqual( + (datum[0].tobytes(), datum[1].tobytes(), datum[2].tobytes(), + datum[3], datum[4], datum[5], datum[6]), + (cont['sha1'], cont['sha1_git'], cont['sha256'], + cont['length'], 'visible', cont['test'], cont['test2'])) + + self.cursor.execute("""alter table content drop column test, + drop column test2""")