Page Menu — Home — Software Heritage

D3239.id11495.diff
No OneTemporary

D3239.id11495.diff

diff --git a/sql/upgrades/153.sql b/sql/upgrades/153.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/153.sql
@@ -0,0 +1,13 @@
+-- SWH DB schema upgrade
+-- from_version: 152
+-- to_version: 153
+-- description: Make (origin, authority, discovery_date, fetcher) a unique index
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(153, now(), 'Work In Progress');
+
+
+create unique index origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id);
+
+drop index origin_metadata_origin_authority_date;
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1106,7 +1106,12 @@
cur = self._cursor(cur)
insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
authority_id, fetcher_id, format, metadata)
- SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s"""
+ SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s
+ ON CONFLICT (origin_id, authority_id, discovery_date, fetcher_id)
+ DO UPDATE SET
+ format=EXCLUDED.format,
+ metadata=EXCLUDED.metadata
+ """
cur.execute(
insert, (discovery_date, authority, fetcher, format, metadata, origin),
)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -102,6 +102,16 @@
for (k, item) in itertools.islice(self.data, from_index, None):
yield item
+ def iter_after(self, start_key: SortedListKey) -> Iterator[SortedListItem]:
+ """Same as iter_from, but using a strict inequality."""
+ it = self.iter_from(start_key)
+ for item in it:
+ if self.key(item) > start_key: # type: ignore
+ yield item
+ break
+
+ yield from it
+
class InMemoryStorage:
def __init__(self, journal_writer=None):
@@ -1086,6 +1096,8 @@
if fetcher_key not in self._metadata_fetchers:
raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+ origin_metadata_list = self._origin_metadata[origin_url][authority_key]
+
origin_metadata = {
"origin_url": origin_url,
"discovery_date": discovery_date,
@@ -1094,7 +1106,17 @@
"format": format,
"metadata": metadata,
}
- self._origin_metadata[origin_url][authority_key].add(origin_metadata)
+
+ for existing_origin_metadata in origin_metadata_list:
+ if (
+ existing_origin_metadata["fetcher"] == fetcher_key
+ and existing_origin_metadata["discovery_date"] == discovery_date
+ ):
+ # Duplicate of an existing one; replace it.
+ existing_origin_metadata.update(origin_metadata)
+ break
+ else:
+ origin_metadata_list.add(origin_metadata)
return None
def origin_metadata_get(
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -1162,6 +1162,9 @@
The authority and fetcher must be known to the storage before
using this endpoint.
+ If there is already origin metadata for the same origin, authority,
+ fetcher, and at the same date, it will be replaced by this one.
+
Args:
discovery_date: when the metadata was fetched.
authority: a dict containing keys `type` and `url`.
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -171,7 +171,7 @@
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
-create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date);
+create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -469,7 +469,7 @@
origin_metadata = {
"origin_url": origin["url"],
"discovery_date": datetime.datetime(
- 2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
+ 2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
),
"authority": {
"type": metadata_authority["type"],
@@ -485,7 +485,7 @@
origin_metadata2 = {
"origin_url": origin["url"],
"discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
+ 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
),
"authority": {
"type": metadata_authority["type"],
@@ -501,7 +501,7 @@
origin_metadata3 = {
"origin_url": origin["url"],
"discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
+ 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
),
"authority": {
"type": metadata_authority2["type"],
diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py
--- a/swh/storage/tests/test_in_memory.py
+++ b/swh/storage/tests/test_in_memory.py
@@ -68,3 +68,19 @@
for split in items:
expected = reversed(sorted(item for item in items if item <= split))
assert list(list_.iter_from(-split)) == list(expected), f"split: {split}"
+
+
+@parametrize
+def test_sorted_list_iter_after(items):
+ list_ = SortedList(items)
+ for split in items:
+ expected = sorted(item for item in items if item > split)
+ assert list(list_.iter_after(split)) == expected, f"split: {split}"
+
+
+@parametrize
+def test_sorted_list_iter_after__key(items):
+ list_ = SortedList(items, key=lambda item: -item)
+ for split in items:
+ expected = reversed(sorted(item for item in items if item < split))
+ assert list(list_.iter_after(-split)) == list(expected), f"split: {split}"
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3289,6 +3289,35 @@
)
)
+ def test_origin_metadata_add_duplicate(self, swh_storage):
+ """Duplicates should be silently updated."""
+ origin = data.origin
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+ swh_storage.origin_add([origin])[0]
+
+ new_origin_metadata2 = {
+ **data.origin_metadata2,
+ "format": "new-format",
+ "metadata": b"new-metadata",
+ }
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ swh_storage.origin_metadata_add(**data.origin_metadata)
+ swh_storage.origin_metadata_add(**data.origin_metadata2)
+ swh_storage.origin_metadata_add(**new_origin_metadata2)
+
+ swh_storage.origin_metadata_get(origin["url"], authority)
+
+ assert [data.origin_metadata, new_origin_metadata2] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin["url"], authority),
+ key=lambda x: x["discovery_date"],
+ )
+ )
+
def test_origin_metadata_add_dict(self, swh_storage):
origin = data.origin
fetcher = data.metadata_fetcher

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 8:07 AM (10 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221252

Event Timeline