Page MenuHomeSoftware Heritage

D3159.id11225.diff
No OneTemporary

D3159.id11225.diff

diff --git a/sql/upgrades/148.sql b/sql/upgrades/148.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/148.sql
@@ -0,0 +1,45 @@
+-- SWH DB schema upgrade
+-- from_version: 147
+-- to_version: 148
+-- description: Fix origin_metadata values and migration schema to use correct primary key
+-- 1. Fix existing duplicated entries in origin_metadata and metadata_provider
+-- 2. Fix primary key on metadata_provider
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(148, now(), 'Work In Progress');
+
+-- Clean up duplicated provider references in origin_metadata
+
+-- Lookup provider with the most minimal id already matching the future
+-- composite key (provider_type, provider_url)
+update origin_metadata om set provider_id=(
+ select min(mp1.id) as min_provider_id
+ from metadata_provider mp1 inner join metadata_provider mp2
+ on (mp1.provider_type = mp2.provider_type and mp1.provider_url = mp2.provider_url)
+ where mp2.id=om.provider_id
+);
+
+-- QUERY PLAN
+-- -------------------------------------------------------------------------------------------------------------------------------------
+-- Update on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157)
+-- -> Seq Scan on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157)
+-- SubPlan 1
+-- -> Aggregate (cost=13.37..13.38 rows=1 width=4)
+-- -> Hash Join (cost=2.18..12.90 rows=190 width=4)
+-- Hash Cond: ((mp1.provider_type = mp2.provider_type) AND (mp1.provider_url = mp2.provider_url))
+-- -> Seq Scan on metadata_provider mp1 (cost=0.00..6.88 rows=288 width=51)
+-- -> Hash (cost=2.17..2.17 rows=1 width=47)
+-- -> Index Scan using metadata_provider_pkey on metadata_provider mp2 (cost=0.15..2.17 rows=1 width=47)
+-- Index Cond: (id = om.provider_id)
+-- (10 rows)
+
+-- Cleanup duplicated provider entries
+
+delete from metadata_provider
+where id not in (select distinct provider_id from origin_metadata);
+
+-- Create unique index on table
+
+create unique index metadata_provider_type_url
+ on metadata_provider(provider_type, provider_url);
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1150,18 +1150,43 @@
]
def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, cur=None
- ):
+ self,
+ provider_name: str,
+ provider_type: str,
+ provider_url: str,
+ metadata: Dict,
+ cur=None,
+ ) -> int:
"""Insert a new provider and return the new identifier."""
cur = self._cursor(cur)
- insert = """INSERT INTO metadata_provider (provider_name, provider_type,
- provider_url, metadata) values (%s, %s, %s, %s)
- RETURNING id"""
-
+ insert = """
+ INSERT INTO metadata_provider (provider_name, provider_type,
+ provider_url, metadata) values (%s, %s, %s, %s)
+ ON CONFLICT(provider_type, provider_url) do nothing
+ """
cur.execute(
insert, (provider_name, provider_type, provider_url, jsonize(metadata))
)
- return cur.fetchone()[0]
+ row = self.metadata_provider_get_by_composite_key(
+ provider_type, provider_url, cur=cur
+ )
+ return row[0]
+
+ def metadata_provider_get_by_composite_key(
+ self, provider_type: str, provider_url: str, cur=None
+ ) -> Tuple:
+ """Retrieve metadata provider by its composite primary key.
+
+ """
+ cur = self._cursor(cur)
+ cur.execute(
+ """select %s
+ from metadata_provider
+ where provider_type=%%s and provider_url=%%s"""
+ % (",".join(self.metadata_provider_cols)),
+ (provider_type, provider_url,),
+ )
+ return cur.fetchone()
def metadata_provider_get(self, provider_id, cur=None):
cur = self._cursor(cur)
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(147, now(), 'Work In Progress');
+ values(148, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -166,11 +166,14 @@
alter table metadata_provider add primary key using index metadata_provider_pkey;
create index concurrently on metadata_provider(provider_name, provider_url);
+create unique index metadata_provider_type_url
+ on metadata_provider(provider_type, provider_url);
-- origin_metadata
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
+
create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -339,6 +339,10 @@
def test_metadata_provider_add(self):
pass
+ @pytest.mark.skip("Not yet implemented")
+ def test_metadata_provider_add_idempotent(self):
+ pass
+
@pytest.mark.skip("Not yet implemented")
def test_metadata_provider_get(self):
pass
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3446,6 +3446,27 @@
)
assert provider == swh_storage.metadata_provider_get(provider_id)
+ def test_metadata_provider_add_idempotent(self, swh_storage):
+ provider = {
+ "provider_name": "swMATH",
+ "provider_type": "registry",
+ "provider_url": "http://www.swmath.org/",
+ "metadata": {
+ "email": "contact@swmath.org",
+ "license": "All rights reserved",
+ },
+ }
+ provider_id = swh_storage.metadata_provider_add(**provider)
+
+ expected_provider = {**provider, "id": provider_id}
+ assert expected_provider == swh_storage.metadata_provider_get_by(
+ {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
+ )
+ assert expected_provider == swh_storage.metadata_provider_get(provider_id)
+
+ provider_id2 = swh_storage.metadata_provider_add(**provider)
+ assert provider_id2 == provider_id
+
def test_origin_metadata_get_by_provider_type(self, swh_storage):
# given
origin_url = data.origin["url"]

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 10:21 PM (2 d, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216171

Event Timeline