Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123117
D3159.id11225.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D3159.id11225.diff
View Options
diff --git a/sql/upgrades/148.sql b/sql/upgrades/148.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/148.sql
@@ -0,0 +1,45 @@
+-- SWH DB schema upgrade
+-- from_version: 147
+-- to_version: 148
+-- description: Fix origin_metadata values and migration schema to use correct primary key
+-- 1. Fix existing duplicated entries in origin_metadata and metadata_provider
+-- 2. Fix primary key on metadata_provider
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(148, now(), 'Work In Progress');
+
+-- Clean up duplicated provider references in origin_metadata
+
+-- Lookup provider with the most minimal id already matching the future
+-- composite key (provider_type, provider_url)
+update origin_metadata om set provider_id=(
+ select min(mp1.id) as min_provider_id
+ from metadata_provider mp1 inner join metadata_provider mp2
+ on (mp1.provider_type = mp2.provider_type and mp1.provider_url = mp2.provider_url)
+ where mp2.id=om.provider_id
+);
+
+-- QUERY PLAN
+-- -------------------------------------------------------------------------------------------------------------------------------------
+-- Update on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157)
+-- -> Seq Scan on origin_metadata om (cost=0.00..7506.94 rows=554 width=1157)
+-- SubPlan 1
+-- -> Aggregate (cost=13.37..13.38 rows=1 width=4)
+-- -> Hash Join (cost=2.18..12.90 rows=190 width=4)
+-- Hash Cond: ((mp1.provider_type = mp2.provider_type) AND (mp1.provider_url = mp2.provider_url))
+-- -> Seq Scan on metadata_provider mp1 (cost=0.00..6.88 rows=288 width=51)
+-- -> Hash (cost=2.17..2.17 rows=1 width=47)
+-- -> Index Scan using metadata_provider_pkey on metadata_provider mp2 (cost=0.15..2.17 rows=1 width=47)
+-- Index Cond: (id = om.provider_id)
+-- (10 rows)
+
+-- Cleanup duplicated provider entries
+
+delete from metadata_provider
+where id not in (select distinct provider_id from origin_metadata);
+
+-- Create unique index on table
+
+create unique index metadata_provider_type_url
+ on metadata_provider(provider_type, provider_url);
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1150,18 +1150,43 @@
]
def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, cur=None
- ):
+ self,
+ provider_name: str,
+ provider_type: str,
+ provider_url: str,
+ metadata: Dict,
+ cur=None,
+ ) -> int:
"""Insert a new provider and return the new identifier."""
cur = self._cursor(cur)
- insert = """INSERT INTO metadata_provider (provider_name, provider_type,
- provider_url, metadata) values (%s, %s, %s, %s)
- RETURNING id"""
-
+ insert = """
+ INSERT INTO metadata_provider (provider_name, provider_type,
+ provider_url, metadata) values (%s, %s, %s, %s)
+ ON CONFLICT(provider_type, provider_url) do nothing
+ """
cur.execute(
insert, (provider_name, provider_type, provider_url, jsonize(metadata))
)
- return cur.fetchone()[0]
+ row = self.metadata_provider_get_by_composite_key(
+ provider_type, provider_url, cur=cur
+ )
+ return row[0]
+
+ def metadata_provider_get_by_composite_key(
+ self, provider_type: str, provider_url: str, cur=None
+ ) -> Tuple:
+ """Retrieve metadata provider by its composite primary key.
+
+ """
+ cur = self._cursor(cur)
+ cur.execute(
+ """select %s
+ from metadata_provider
+ where provider_type=%%s and provider_url=%%s"""
+ % (",".join(self.metadata_provider_cols)),
+ (provider_type, provider_url,),
+ )
+ return cur.fetchone()
def metadata_provider_get(self, provider_id, cur=None):
cur = self._cursor(cur)
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(147, now(), 'Work In Progress');
+ values(148, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -166,11 +166,14 @@
alter table metadata_provider add primary key using index metadata_provider_pkey;
create index concurrently on metadata_provider(provider_name, provider_url);
+create unique index metadata_provider_type_url
+ on metadata_provider(provider_type, provider_url);
-- origin_metadata
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
+
create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -339,6 +339,10 @@
def test_metadata_provider_add(self):
pass
+ @pytest.mark.skip("Not yet implemented")
+ def test_metadata_provider_add_idempotent(self):
+ pass
+
@pytest.mark.skip("Not yet implemented")
def test_metadata_provider_get(self):
pass
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3446,6 +3446,27 @@
)
assert provider == swh_storage.metadata_provider_get(provider_id)
+ def test_metadata_provider_add_idempotent(self, swh_storage):
+ provider = {
+ "provider_name": "swMATH",
+ "provider_type": "registry",
+ "provider_url": "http://www.swmath.org/",
+ "metadata": {
+ "email": "contact@swmath.org",
+ "license": "All rights reserved",
+ },
+ }
+ provider_id = swh_storage.metadata_provider_add(**provider)
+
+ expected_provider = {**provider, "id": provider_id}
+ assert expected_provider == swh_storage.metadata_provider_get_by(
+ {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
+ )
+ assert expected_provider == swh_storage.metadata_provider_get(provider_id)
+
+ provider_id2 = swh_storage.metadata_provider_add(**provider)
+ assert provider_id2 == provider_id
+
def test_origin_metadata_get_by_provider_type(self, swh_storage):
# given
origin_url = data.origin["url"]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 10:21 PM (2 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216171
Attached To
D3159: storage: metadata_provider: Ensure idempotency when creating provider
Event Timeline
Log In to Comment