content_mimetype: migrate mimetype/encoding columns from bytea to text

- sql/upgrades/118.sql converts the existing rows by decoding the stored
  bytes as UTF-8 (the indexer previously stored str.encode('utf-8') values).
- compute_mimetype_encoding returns str values instead of bytes.
- The reference schema records dbversion 118, matching the new upgrade script.
- sql/upgrades/117.sql now records its own version in dbversion.

diff --git a/sql/upgrades/117.sql b/sql/upgrades/117.sql
--- a/sql/upgrades/117.sql
+++ b/sql/upgrades/117.sql
@@ -3,6 +3,9 @@
 -- to_version: 117
 -- description: Add fulltext search index for origin intrinsic metadata
 
+insert into dbversion(version, release, description)
+values(117, now(), 'Work In Progress');
+
 alter table origin_intrinsic_metadata add column metadata_tsvector tsvector;
 update origin_intrinsic_metadata set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
 create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
diff --git a/sql/upgrades/118.sql b/sql/upgrades/118.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/118.sql
@@ -0,0 +1,15 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 117
+-- to_version: 118
+-- description: content_mimetype: Migrate bytes column to text
+
+insert into dbversion(version, release, description)
+values(118, now(), 'Work In Progress');
+
+-- Decode the existing bytes as UTF-8: without a USING clause the values
+-- would be converted to their bytea text representation (e.g. '\x74...').
+alter table content_mimetype
+  alter column mimetype set data type text
+    using convert_from(mimetype, 'utf-8'),
+  alter column encoding set data type text
+    using convert_from(encoding, 'utf-8');
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -23,8 +23,8 @@
     """
     r = magic.detect_from_content(raw_content)
     return {
-        'mimetype': r.mime_type.encode('utf-8'),
-        'encoding': r.encoding.encode('utf-8'),
+        'mimetype': r.mime_type,
+        'encoding': r.encoding,
     }
 
 
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(116, now(), 'Work In Progress');
+      values(118, now(), 'Work In Progress');
 -- Computing metadata on sha1's contents
 
 -- a SHA1 checksum (not necessarily originating from Git)
@@ -39,8 +39,8 @@
 -- Properties (mimetype, encoding, etc...)
 create table content_mimetype (
   id sha1 not null,
-  mimetype bytea not null,
-  encoding bytea not null,
+  mimetype text not null,
+  encoding text not null,
   indexer_configuration_id bigint not null
 );
 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -116,8 +116,8 @@
         # given
         self.storage.content_mimetype_add([{
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'indexer_configuration_id': tool_id,
         }])
 
@@ -133,8 +133,8 @@
 
         mimetype_v1 = {
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'indexer_configuration_id': tool_id,
         }
 
@@ -148,8 +148,8 @@
         # then
         expected_mimetypes_v1 = [{
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'tool': self.tools['file'],
         }]
         self.assertEqual(actual_mimetypes, expected_mimetypes_v1)
@@ -157,8 +157,8 @@
         # given
         mimetype_v2 = mimetype_v1.copy()
         mimetype_v2.update({
-            'mimetype': b'text/html',
-            'encoding': b'us-ascii',
+            'mimetype': 'text/html',
+            'encoding': 'us-ascii',
         })
 
         self.storage.content_mimetype_add([mimetype_v2])
@@ -175,8 +175,8 @@
 
         mimetype_v1 = {
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'indexer_configuration_id': tool_id,
         }
 
@@ -189,8 +189,8 @@
 
         expected_mimetypes_v1 = [{
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'tool': self.tools['file'],
         }]
 
@@ -200,8 +200,8 @@
         # given
         mimetype_v2 = mimetype_v1.copy()
         mimetype_v2.update({
-            'mimetype': b'text/html',
-            'encoding': b'us-ascii',
+            'mimetype': 'text/html',
+            'encoding': 'us-ascii',
         })
 
         self.storage.content_mimetype_add([mimetype_v2], conflict_update=True)
@@ -211,8 +211,8 @@
 
         expected_mimetypes_v2 = [{
             'id': self.sha1_2,
-            'mimetype': b'text/html',
-            'encoding': b'us-ascii',
+            'mimetype': 'text/html',
+            'encoding': 'us-ascii',
             'tool': {
                 'id': 2,
                 'name': 'file',
@@ -232,8 +232,8 @@
 
         mimetype1 = {
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'indexer_configuration_id': tool_id,
         }
 
@@ -246,8 +246,8 @@
         # then
         expected_mimetypes = [{
             'id': self.sha1_2,
-            'mimetype': b'text/plain',
-            'encoding': b'utf-8',
+            'mimetype': 'text/plain',
+            'encoding': 'utf-8',
             'tool': self.tools['file']
         }]
 
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -37,8 +37,8 @@
 
             actual_result = compute_mimetype_encoding(_input)
             self.assertEqual(actual_result, {
-                'mimetype': _mimetype.encode('utf-8'),
-                'encoding': _encoding.encode('utf-8'),
+                'mimetype': _mimetype,
+                'encoding': _encoding
             })
 
 
@@ -83,20 +83,20 @@
             self.id0: {
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'text/plain',
-                'encoding': b'us-ascii',
+                'mimetype': 'text/plain',
+                'encoding': 'us-ascii',
             },
             self.id1: {
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'text/plain',
-                'encoding': b'us-ascii',
+                'mimetype': 'text/plain',
+                'encoding': 'us-ascii',
             },
             self.id2: {
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'application/x-empty',
-                'encoding': b'binary',
+                'mimetype': 'application/x-empty',
+                'encoding': 'binary',
             }
         }
 
@@ -154,20 +154,20 @@
 
         self.expected_results = {
             self.id0: {
-                'encoding': b'us-ascii',
+                'encoding': 'us-ascii',
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'text/plain'},
+                'mimetype': 'text/plain'},
             self.id1: {
-                'encoding': b'us-ascii',
+                'encoding': 'us-ascii',
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'text/x-python'},
+                'mimetype': 'text/x-python'},
             self.id2: {
-                'encoding': b'us-ascii',
+                'encoding': 'us-ascii',
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
-                'mimetype': b'text/plain'}
+                'mimetype': 'text/plain'}
         }
 
 