diff --git a/sql/upgrades/168.sql b/sql/upgrades/168.sql
--- a/sql/upgrades/168.sql
+++ b/sql/upgrades/168.sql
@@ -6,12 +6,16 @@
 insert into dbversion(version, release, description)
       values(168, now(), 'Work In Progress');
 
+-- 0. update swh-storage to the version whose python code relies
+-- on dbversion 168, but do not restart it yet (this is in step 2).
+-- If it relies on dbversion 169, it WILL throw errors after step 2.
+
 -- 1. add the 'id' column
 
 alter table raw_extrinsic_metadata
   add column id sha1_git;
 
--- 2. restart swh-storage, so that it starts writing the id (but does not read it)
+-- 2. restart swh-storage, so that it starts writing the id (but does not read it).
 
 -- 3. run swh storage migrate-168
diff --git a/sql/upgrades/169.sql b/sql/upgrades/169.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/169.sql
@@ -0,0 +1,25 @@
+-- SWH DB schema upgrade
+-- from_version: 168
+-- to_version: 169
+-- description: make (target, authority_id, discovery_date, fetcher_id) non-unique
+
+insert into dbversion(version, release, description)
+      values(169, now(), 'Work In Progress');
+
+-- 1. restart swh-storage, so that it stops relying on
+-- raw_extrinsic_metadata_content_authority_date_fetcher being a unique index
+
+-- 2. rename old index
+
+alter index raw_extrinsic_metadata_content_authority_date_fetcher
+  rename to raw_extrinsic_metadata_content_authority_date_fetcher_unique;
+
+-- 3. create new (non-unique) index (excluding fetcher, because it was only needed
+-- for uniqueness, not for accesses)
+
+create index concurrently raw_extrinsic_metadata_content_authority_date
+  on raw_extrinsic_metadata(target, authority_id, discovery_date);
+
+-- 4. remove old index
+
+drop index raw_extrinsic_metadata_content_authority_date_fetcher_unique;
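Note on step 3: the backfill has to compute an id for every pre-existing row. The actual logic ships with `swh storage migrate-168` and swh.model; the sketch below only illustrates the principle that the id is an intrinsic hash over the row's content (as the Cassandra schema comment further down puts it, "the id is a hash of all the columns"). The manifest layout here is a hypothetical stand-in, not the real format:

```python
# Illustrative only: the real id computation lives in swh.model /
# `swh storage migrate-168`. The manifest layout below is made up.
import hashlib
from datetime import datetime, timezone

def metadata_id(target: str, authority: str, fetcher: str,
                discovery_date: datetime, format_: str,
                metadata: bytes) -> bytes:
    # id = hash of all the columns => identical rows hash to the same id,
    # which is what makes id-based deduplication safe.
    manifest = "\n".join(
        [target, authority, fetcher, discovery_date.isoformat(), format_]
    ).encode() + b"\n" + metadata
    return hashlib.sha1(manifest).digest()  # 20 bytes, fits the sha1_git domain

# Deterministic: the same row content always yields the same id.
dt = datetime(2021, 3, 1, tzinfo=timezone.utc)
args = ("swh:1:cnt:...", "deposit", "loader-v1", dt, "json", b"{}")
assert metadata_id(*args) == metadata_id(*args)
```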
" - "AND (discovery_date, fetcher_name, fetcher_version) > (?, ?, ?)", + "AND (discovery_date, id) > (?, ?)", ) - def raw_extrinsic_metadata_get_after_date_and_fetcher( + def raw_extrinsic_metadata_get_after_date_and_id( self, target: str, authority_type: str, authority_url: str, after_date: datetime.datetime, - after_fetcher_name: str, - after_fetcher_version: str, + after_id: bytes, *, statement, ) -> Iterable[RawExtrinsicMetadataRow]: @@ -939,14 +938,7 @@ RawExtrinsicMetadataRow.from_dict, self._execute_with_retries( statement, - [ - target, - authority_type, - authority_url, - after_date, - after_fetcher_name, - after_fetcher_version, - ], + [target, authority_type, authority_url, after_date, after_id,], ), ) diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -246,10 +246,11 @@ "authority_type", "authority_url", "discovery_date", - "fetcher_name", - "fetcher_version", + "id", ) + id: bytes + type: str target: str diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -188,6 +188,8 @@ );""", """ CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata ( + id blob, + type text, target text, @@ -211,8 +213,33 @@ path blob, directory text, - PRIMARY KEY ((target), authority_type, authority_url, discovery_date, - fetcher_name, fetcher_version) + PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id) + + -- An explanation is in order for this primary key: + -- + -- Intuitively, the primary key should only be 'id', because two metadata + -- entries are the same iff the id is the same; and 'id' is used for + -- deduplication. + -- + -- However, we also want to query by + -- (target, authority_type, authority_url, discovery_date) + -- The naive solution to this would be an extra table, to use as index; + -- but it means 1. extra code to keep them in sync 2. overhead when writing + -- 3. overhead + random reads (instead of linear) when reading. + -- + -- Therefore, we use a single table for both, by adding the column + -- we want to query with before the id. + -- It solves both a) the query/order issues and b) the uniqueness issue because: + -- + -- a) adding the id at the end of the primary key does not change the rows' order: + -- for two different rows, id1 != id2, so + -- (target1, ..., date1) < (target2, ..., date2) + -- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2) + -- + -- b) the id is a hash of all the columns, so: + -- rows are the same + -- <=> id1 == id2 + -- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2) );""", """ CREATE TABLE IF NOT EXISTS object_count ( diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1158,6 +1158,7 @@ try: row = RawExtrinsicMetadataRow( + id=metadata_entry.id, type=metadata_entry.type.value, target=str(metadata_entry.target), authority_type=metadata_entry.authority.type.value, @@ -1202,20 +1203,13 @@ ) if page_token is not None: - (after_date, after_fetcher_name, after_fetcher_url) = msgpack_loads( - base64.b64decode(page_token) - ) + (after_date, id_) = msgpack_loads(base64.b64decode(page_token)) if after and after_date < after: raise StorageArgumentException( "page_token is inconsistent with the value of 'after'." 
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -1158,6 +1158,7 @@
             try:
                 row = RawExtrinsicMetadataRow(
+                    id=metadata_entry.id,
                     type=metadata_entry.type.value,
                     target=str(metadata_entry.target),
                     authority_type=metadata_entry.authority.type.value,
@@ -1202,20 +1203,13 @@
         )
 
         if page_token is not None:
-            (after_date, after_fetcher_name, after_fetcher_url) = msgpack_loads(
-                base64.b64decode(page_token)
-            )
+            (after_date, id_) = msgpack_loads(base64.b64decode(page_token))
             if after and after_date < after:
                 raise StorageArgumentException(
                     "page_token is inconsistent with the value of 'after'."
                 )
-            entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_fetcher(  # noqa
-                str(target),
-                authority.type.value,
-                authority.url,
-                after_date,
-                after_fetcher_name,
-                after_fetcher_url,
+            entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_id(
+                str(target), authority.type.value, authority.url, after_date, id_,
             )
         elif after is not None:
             entries = self._cql_runner.raw_extrinsic_metadata_get_after_date(
@@ -1264,13 +1258,7 @@
             assert len(results) == limit
             last_result = results[-1]
             next_page_token: Optional[str] = base64.b64encode(
-                msgpack_dumps(
-                    (
-                        last_result.discovery_date,
-                        last_result.fetcher.name,
-                        last_result.fetcher.version,
-                    )
-                )
+                msgpack_dumps((last_result.discovery_date, last_result.id))
             ).decode()
         else:
             next_page_token = None
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -583,23 +583,22 @@
         )
         return (m for m in metadata if m.discovery_date > after)
 
-    def raw_extrinsic_metadata_get_after_date_and_fetcher(
+    def raw_extrinsic_metadata_get_after_date_and_id(
         self,
         target: str,
         authority_type: str,
         authority_url: str,
         after_date: datetime.datetime,
-        after_fetcher_name: str,
-        after_fetcher_version: str,
+        after_id: bytes,
     ) -> Iterable[RawExtrinsicMetadataRow]:
         metadata = self._raw_extrinsic_metadata.get_from_partition_key((target,))
-        after_tuple = (after_date, after_fetcher_name, after_fetcher_version)
+        after_tuple = (after_date, after_id)
         return (
             m
             for m in metadata
             if m.authority_type == authority_type
             and m.authority_url == authority_url
-            and (m.discovery_date, m.fetcher_name, m.fetcher_version) > after_tuple
+            and (m.discovery_date, m.id) > after_tuple
         )
 
     def raw_extrinsic_metadata_get(
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -28,7 +28,7 @@
 
     """
 
-    current_version = 168
+    current_version = 169
 
     def mktemp_dir_entry(self, entry_type, cur=None):
         self._cursor(cur).execute(
@@ -1153,7 +1153,7 @@
             INSERT INTO raw_extrinsic_metadata
                 ({', '.join(_raw_extrinsic_metadata_insert_cols)})
             VALUES ({', '.join('%s' for _ in _raw_extrinsic_metadata_insert_cols)})
-            ON CONFLICT (target, authority_id, discovery_date, fetcher_id)
+            ON CONFLICT (id)
             DO NOTHING
         """
diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
--- a/swh/storage/sql/30-schema.sql
+++ b/swh/storage/sql/30-schema.sql
@@ -17,7 +17,7 @@
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(168, now(), 'Work In Progress');
+      values(169, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql
--- a/swh/storage/sql/60-indexes.sql
+++ b/swh/storage/sql/60-indexes.sql
@@ -268,7 +268,7 @@
 create unique index concurrently raw_extrinsic_metadata_pkey on raw_extrinsic_metadata(id);
 alter table raw_extrinsic_metadata add primary key using index raw_extrinsic_metadata_pkey;
 
-create unique index concurrently raw_extrinsic_metadata_content_authority_date_fetcher on raw_extrinsic_metadata(target, authority_id, discovery_date, fetcher_id);
+create index concurrently raw_extrinsic_metadata_content_authority_date on raw_extrinsic_metadata(target, authority_id, discovery_date);
 
 \if :dbflavor_default
     alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
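The page-token payload above changes from a 3-tuple (date, fetcher name, fetcher version) to a 2-tuple (date, id), mirroring the new clustering order so the cursor resumes exactly where the previous page stopped. A self-contained sketch of the round-trip, using the msgpack package directly as a stand-in for swh.core's msgpack_dumps/msgpack_loads helpers (an assumption, not the real serializers):

```python
# Sketch of the new page-token round-trip; plain `msgpack` stands in for
# swh.core's msgpack_dumps/msgpack_loads.
import base64
import datetime
import msgpack

def encode_page_token(discovery_date: datetime.datetime, id_: bytes) -> str:
    # requires a tz-aware datetime for msgpack's native timestamp support
    raw = msgpack.packb((discovery_date, id_), datetime=True)
    return base64.b64encode(raw).decode()

def decode_page_token(token: str) -> tuple:
    after_date, after_id = msgpack.unpackb(base64.b64decode(token), timestamp=3)
    return (after_date, after_id)

dt = datetime.datetime(2021, 3, 1, tzinfo=datetime.timezone.utc)
token = encode_page_token(dt, b"\x00" * 20)
assert decode_page_token(token) == (dt, b"\x00" * 20)
```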
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -3361,7 +3361,7 @@
             assert obj in actual_objects
 
     def test_content_metadata_add_duplicate(self, swh_storage, sample_data):
-        """Duplicates should be silently updated."""
+        """Duplicates should be silently ignored."""
         content = sample_data.content
         fetcher = sample_data.metadata_fetcher
         authority = sample_data.metadata_authority
@@ -3370,31 +3370,22 @@
             object_type="content", object_id=hash_to_bytes(content.sha1_git)
         )
 
-        new_content_metadata2 = RawExtrinsicMetadata.from_dict(
-            {
-                **remove_keys(content_metadata2.to_dict(), ("id",)),  # recompute id
-                "format": "new-format",
-                "metadata": b"new-metadata",
-            }
-        )
-
         swh_storage.metadata_fetcher_add([fetcher])
         swh_storage.metadata_authority_add([authority])
         swh_storage.raw_extrinsic_metadata_add([content_metadata, content_metadata2])
-        swh_storage.raw_extrinsic_metadata_add([new_content_metadata2])
+        swh_storage.raw_extrinsic_metadata_add([content_metadata2, content_metadata])
 
         result = swh_storage.raw_extrinsic_metadata_get(
             MetadataTargetType.CONTENT, content_swhid, authority
         )
         assert result.next_page_token is None
 
-        expected_results1 = (content_metadata, new_content_metadata2)
-        expected_results2 = (content_metadata, content_metadata2)
+        expected_results = (content_metadata, content_metadata2)
 
-        assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
-            expected_results1,  # cassandra
-            expected_results2,  # postgresql
+        assert (
+            tuple(sorted(result.results, key=lambda x: x.discovery_date))
+            == expected_results
         )
 
     def test_content_metadata_get(self, swh_storage, sample_data):
@@ -3619,19 +3610,11 @@
         origin_metadata, origin_metadata2 = sample_data.origin_metadata[:2]
 
         assert swh_storage.origin_add([origin]) == {"origin:add": 1}
 
-        new_origin_metadata2 = RawExtrinsicMetadata.from_dict(
-            {
-                **remove_keys(origin_metadata2.to_dict(), ("id",)),  # recompute id
-                "format": "new-format",
-                "metadata": b"new-metadata",
-            }
-        )
-
         swh_storage.metadata_fetcher_add([fetcher])
         swh_storage.metadata_authority_add([authority])
         swh_storage.raw_extrinsic_metadata_add([origin_metadata, origin_metadata2])
-        swh_storage.raw_extrinsic_metadata_add([new_origin_metadata2])
+        swh_storage.raw_extrinsic_metadata_add([origin_metadata2, origin_metadata])
 
         result = swh_storage.raw_extrinsic_metadata_get(
             MetadataTargetType.ORIGIN, origin.url, authority
@@ -3639,12 +3622,11 @@
         )
         assert result.next_page_token is None
 
-        # which of the two behavior happens is backend-specific.
-        expected_results1 = (origin_metadata, new_origin_metadata2)
-        expected_results2 = (origin_metadata, origin_metadata2)
+        expected_results = (origin_metadata, origin_metadata2)
 
-        assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
-            expected_results1,  # cassandra
-            expected_results2,  # postgresql
+        assert (
+            tuple(sorted(result.results, key=lambda x: x.discovery_date))
+            == expected_results
        )
 
     def test_origin_metadata_get(self, swh_storage, sample_data):
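Both backends now converge on the same outcome, so the tests no longer need per-backend expectations. A toy model of the new add semantics (not swh code), with an id-keyed map standing in for both `ON CONFLICT (id) DO NOTHING` and the Cassandra primary key:

```python
# Toy model: with the intrinsic id as the key, re-adding the same entries
# (in any order) is a no-op instead of an update.
from typing import Dict, Iterable, Mapping

store: Dict[bytes, Mapping] = {}

def raw_extrinsic_metadata_add(entries: Iterable[Mapping]) -> None:
    for entry in entries:
        store.setdefault(entry["id"], entry)  # first write wins; dupes ignored

e1 = {"id": b"\x01" * 20, "metadata": b"{}"}
e2 = {"id": b"\x02" * 20, "metadata": b"[]"}
raw_extrinsic_metadata_add([e1, e2])
raw_extrinsic_metadata_add([e2, e1])  # duplicate add, reversed order
assert list(store.values()) == [e1, e2]
```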