diff --git a/sql/upgrades/153.sql b/sql/upgrades/153.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/153.sql @@ -0,0 +1,13 @@ +-- SWH DB schema upgrade +-- from_version: 152 +-- to_version: 153 +-- description: Make (origin, authority, discovery_date, fetcher) a unique index + +-- latest schema version +insert into dbversion(version, release, description) + values(153, now(), 'Work In Progress'); + + +create unique index origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id); + +drop index origin_metadata_origin_authority_date; diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1106,7 +1106,12 @@ cur = self._cursor(cur) insert = """INSERT INTO origin_metadata (origin_id, discovery_date, authority_id, fetcher_id, format, metadata) - SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s""" + SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s + ON CONFLICT (origin_id, authority_id, discovery_date, fetcher_id) + DO UPDATE SET + format=EXCLUDED.format, + metadata=EXCLUDED.metadata + """ cur.execute( insert, (discovery_date, authority, fetcher, format, metadata, origin), ) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -102,6 +102,16 @@ for (k, item) in itertools.islice(self.data, from_index, None): yield item + def iter_after(self, start_key: SortedListKey) -> Iterator[SortedListItem]: + """Same as iter_from, but using a strict inequality.""" + it = self.iter_from(start_key) + for item in it: + if self.key(item) > start_key: # type: ignore + yield item + break + + yield from it + class InMemoryStorage: def __init__(self, journal_writer=None): @@ -1086,6 +1096,8 @@ if fetcher_key not in self._metadata_fetchers: raise StorageArgumentException(f"Unknown fetcher {fetcher}") + origin_metadata_list = self._origin_metadata[origin_url][authority_key] + 
origin_metadata = { "origin_url": origin_url, "discovery_date": discovery_date, @@ -1094,7 +1106,17 @@ "format": format, "metadata": metadata, } - self._origin_metadata[origin_url][authority_key].add(origin_metadata) + + for existing_origin_metadata in origin_metadata_list: + if ( + existing_origin_metadata["fetcher"] == fetcher_key + and existing_origin_metadata["discovery_date"] == discovery_date + ): + # Duplicate of an existing one; replace it. + existing_origin_metadata.update(origin_metadata) + break + else: + origin_metadata_list.add(origin_metadata) return None def origin_metadata_get( diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -1162,6 +1162,9 @@ The authority and fetcher must be known to the storage before using this endpoint. + If there is already origin metadata for the same origin, authority, + fetcher, and at the same date, it will be replaced by this one. + Args: discovery_date: when the metadata was fetched. authority: a dict containing keys `type` and `url`. 
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -171,7 +171,7 @@ create unique index concurrently origin_metadata_pkey on origin_metadata(id); alter table origin_metadata add primary key using index origin_metadata_pkey; -create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date); +create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id); alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid; alter table origin_metadata validate constraint origin_metadata_origin_fkey; diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -469,7 +469,7 @@ origin_metadata = { "origin_url": origin["url"], "discovery_date": datetime.datetime( - 2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc + 2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc ), "authority": { "type": metadata_authority["type"], @@ -485,7 +485,7 @@ origin_metadata2 = { "origin_url": origin["url"], "discovery_date": datetime.datetime( - 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc + 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc ), "authority": { "type": metadata_authority["type"], @@ -501,7 +501,7 @@ origin_metadata3 = { "origin_url": origin["url"], "discovery_date": datetime.datetime( - 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc + 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc ), "authority": { "type": metadata_authority2["type"], diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py --- a/swh/storage/tests/test_in_memory.py +++ b/swh/storage/tests/test_in_memory.py @@ -68,3 +68,19 @@ for split in 
items: expected = reversed(sorted(item for item in items if item <= split)) assert list(list_.iter_from(-split)) == list(expected), f"split: {split}" + + +@parametrize +def test_sorted_list_iter_after(items): + list_ = SortedList(items) + for split in items: + expected = sorted(item for item in items if item > split) + assert list(list_.iter_after(split)) == expected, f"split: {split}" + + +@parametrize +def test_sorted_list_iter_after__key(items): + list_ = SortedList(items, key=lambda item: -item) + for split in items: + expected = reversed(sorted(item for item in items if item < split)) + assert list(list_.iter_after(-split)) == list(expected), f"split: {split}" diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -3289,6 +3289,35 @@ ) ) + def test_origin_metadata_add_duplicate(self, swh_storage): + """Duplicates should be silently updated.""" + origin = data.origin + fetcher = data.metadata_fetcher + authority = data.metadata_authority + swh_storage.origin_add([origin])[0] + + new_origin_metadata2 = { + **data.origin_metadata2, + "format": "new-format", + "metadata": b"new-metadata", + } + + swh_storage.metadata_fetcher_add(**fetcher) + swh_storage.metadata_authority_add(**authority) + + swh_storage.origin_metadata_add(**data.origin_metadata) + swh_storage.origin_metadata_add(**data.origin_metadata2) + swh_storage.origin_metadata_add(**new_origin_metadata2) + + swh_storage.origin_metadata_get(origin["url"], authority) + + assert [data.origin_metadata, new_origin_metadata2] == list( + sorted( + swh_storage.origin_metadata_get(origin["url"], authority), + key=lambda x: x["discovery_date"], + ) + ) + def test_origin_metadata_add_dict(self, swh_storage): origin = data.origin fetcher = data.metadata_fetcher