diff --git a/sql/upgrades/148.sql b/sql/upgrades/148.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/148.sql @@ -0,0 +1,45 @@ +-- SWH DB schema upgrade +-- from_version: 147 +-- to_version: 148 +-- description: Fix origin_metadata values and migration schema to use correct primary key +-- 1. Fix existing duplicated entries in origin_metadata and metadata_provider +-- 2. Fix primary key on metadata_provider + +-- latest schema version +insert into dbversion(version, release, description) + values(148, now(), 'Work In Progress'); + +-- Clean up duplicated provider references in origin_metadata + +-- Lookup provider with the most minimal id already matching the future +-- composite key (provider_type, provider_url) +update origin_metadata om set provider_id=( + select min(mp1.id) as min_provider_id + from metadata_provider mp1 inner join metadata_provider mp2 + on (mp1.provider_type = mp2.provider_type and mp1.provider_url = mp2.provider_url) + where mp2.id=om.provider_id +); + +-- QUERY PLAN +-- ------------------------------------------------------------------------------------------------------------------------------------- +-- Update on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157) +-- -> Seq Scan on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157) +-- SubPlan 1 +-- -> Aggregate (cost=13.37..13.38 rows=1 width=4) +-- -> Hash Join (cost=2.18..12.90 rows=190 width=4) +-- Hash Cond: ((mp1.provider_type = mp2.provider_type) AND (mp1.provider_url = mp2.provider_url)) +-- -> Seq Scan on metadata_provider mp1 (cost=0.00..6.88 rows=288 width=51) +-- -> Hash (cost=2.17..2.17 rows=1 width=47) +-- -> Index Scan using metadata_provider_pkey on metadata_provider mp2 (cost=0.15..2.17 rows=1 width=47) +-- Index Cond: (id = om.provider_id) +-- (10 rows) + +-- Cleanup duplicated provider entries + +delete from metadata_provider +where id not in (select distinct provider_id from origin_metadata); + +-- Create unique index on table + +create unique index metadata_provider_type_url + on metadata_provider(provider_type, provider_url); diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1150,18 +1150,43 @@ ] def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata, cur=None - ): + self, + provider_name: str, + provider_type: str, + provider_url: str, + metadata: Dict, + cur=None, + ) -> int: """Insert a new provider and return the new identifier.""" cur = self._cursor(cur) - insert = """INSERT INTO metadata_provider (provider_name, provider_type, - provider_url, metadata) values (%s, %s, %s, %s) - RETURNING id""" - + insert = """ + INSERT INTO metadata_provider (provider_name, provider_type, + provider_url, metadata) values (%s, %s, %s, %s) + ON CONFLICT(provider_type, provider_url) do nothing + """ cur.execute( insert, (provider_name, provider_type, provider_url, jsonize(metadata)) ) - return cur.fetchone()[0] + row = self.metadata_provider_get_by_composite_key( + provider_type, provider_url, cur=cur + ) + return row[0] + + def metadata_provider_get_by_composite_key( + self, provider_type: str, provider_url: str, cur=None + ) -> Tuple: + """Retrieve metadata provider by its composite primary key. + + """ + cur = self._cursor(cur) + cur.execute( + """select %s + from metadata_provider + where provider_type=%%s and provider_url=%%s""" + % (",".join(self.metadata_provider_cols)), + (provider_type, provider_url,), + ) + return cur.fetchone() def metadata_provider_get(self, provider_id, cur=None): cur = self._cursor(cur) diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(147, now(), 'Work In Progress'); + values(148, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -166,11 +166,14 @@ alter table metadata_provider add primary key using index metadata_provider_pkey; create index concurrently on metadata_provider(provider_name, provider_url); +create unique index metadata_provider_type_url + on metadata_provider(provider_type, provider_url); -- origin_metadata create unique index concurrently origin_metadata_pkey on origin_metadata(id); alter table origin_metadata add primary key using index origin_metadata_pkey; + create index concurrently on origin_metadata(origin_id, provider_id, tool_id); alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid; diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py --- a/swh/storage/tests/test_cassandra.py +++ b/swh/storage/tests/test_cassandra.py @@ -339,6 +339,10 @@ def test_metadata_provider_add(self): pass + @pytest.mark.skip("Not yet implemented") + def test_metadata_provider_add_idempotent(self): + pass + @pytest.mark.skip("Not yet implemented") def test_metadata_provider_get(self): pass diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -3446,6 +3446,27 @@ ) assert provider == swh_storage.metadata_provider_get(provider_id) + def test_metadata_provider_add_idempotent(self, swh_storage): + provider = { + "provider_name": "swMATH", + "provider_type": "registry", + "provider_url": "http://www.swmath.org/", + "metadata": { + "email": "contact@swmath.org", + "license": "All rights reserved", + }, + } + provider_id = swh_storage.metadata_provider_add(**provider) + + expected_provider = {**provider, "id": provider_id} + assert expected_provider == swh_storage.metadata_provider_get_by( + {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"} + ) + assert expected_provider == swh_storage.metadata_provider_get(provider_id) + + provider_id2 = swh_storage.metadata_provider_add(**provider) + assert provider_id2 == provider_id + def test_origin_metadata_get_by_provider_type(self, swh_storage): # given origin_url = data.origin["url"]