diff --git a/sql/upgrades/172.sql b/sql/upgrades/172.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/172.sql
@@ -0,0 +1,30 @@
+-- SWH DB schema upgrade
+-- from_version: 171
+-- to_version: 172
+-- description: add raw_extrinsic_metadata.id
+
+insert into dbversion(version, release, description)
+    values(172, now(), 'Work In Progress');
+
+-- 0. update swh-storage to the version whose python code relies
+-- on dbversion 169, but do not restart it yet (that happens in step 2).
+-- If it relies on dbversion 172, it WILL throw errors after step 2.
+
+-- 1. add the 'id' column
+
+alter table raw_extrinsic_metadata
+    add column id sha1_git;
+
+-- 2. restart swh-storage, so that it starts writing the id (but does not read it)
+
+-- 3. truncate the raw_extrinsic_metadata table
+
+-- 4. make the id column not null, and index it
+
+alter table raw_extrinsic_metadata
+    alter column id set not null;
+
+create unique index concurrently raw_extrinsic_metadata_pkey on raw_extrinsic_metadata(id);
+alter table raw_extrinsic_metadata add primary key using index raw_extrinsic_metadata_pkey;
+
+-- 5. backfill from Kafka
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -1016,16 +1016,15 @@
     @_prepared_select_statement(
         RawExtrinsicMetadataRow,
         "WHERE target=? AND authority_type=? AND authority_url=? "
-        "AND (discovery_date, fetcher_name, fetcher_version) > (?, ?, ?)",
+        "AND (discovery_date, id) > (?, ?)",
     )
-    def raw_extrinsic_metadata_get_after_date_and_fetcher(
+    def raw_extrinsic_metadata_get_after_date_and_id(
         self,
         target: str,
         authority_type: str,
         authority_url: str,
         after_date: datetime.datetime,
-        after_fetcher_name: str,
-        after_fetcher_version: str,
+        after_id: bytes,
         *,
         statement,
     ) -> Iterable[RawExtrinsicMetadataRow]:
@@ -1033,14 +1032,7 @@
             RawExtrinsicMetadataRow.from_dict,
             self._execute_with_retries(
                 statement,
-                [
-                    target,
-                    authority_type,
-                    authority_url,
-                    after_date,
-                    after_fetcher_name,
-                    after_fetcher_version,
-                ],
+                [target, authority_type, authority_url, after_date, after_id],
             ),
         )
diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py
--- a/swh/storage/cassandra/model.py
+++ b/swh/storage/cassandra/model.py
@@ -246,10 +246,11 @@
         "authority_type",
         "authority_url",
         "discovery_date",
-        "fetcher_name",
-        "fetcher_version",
+        "id",
     )
 
+    id: bytes
+
     type: str
     target: str
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -188,6 +188,8 @@
 );""",
     """
 CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
+    id blob,
+
     type text,
     target text,
@@ -211,8 +213,33 @@
     path blob,
     directory text,
 
-    PRIMARY KEY ((target), authority_type, authority_url, discovery_date,
-                 fetcher_name, fetcher_version)
+    PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)
+
+    -- An explanation is in order for this primary key:
+    --
+    -- Intuitively, the primary key should be just 'id', because two metadata
+    -- entries are the same iff their ids are the same; 'id' is what we
+    -- deduplicate on.
+    --
+    -- However, we also want to query by
+    -- (target, authority_type, authority_url, discovery_date).
+    -- The naive solution would be an extra table to use as an index; but that
+    -- means: 1. extra code to keep the two in sync, 2. overhead when writing,
+    -- and 3. overhead plus random reads (instead of linear ones) when reading.
+    --
+    -- Therefore, we use a single table for both, by putting the columns
+    -- we want to query on before the id.
+    -- This solves both a) the query/order issue and b) the uniqueness issue:
+    --
+    -- a) appending the id at the end of the primary key does not disturb the
+    --    rows' order: if (target1, ..., date1) < (target2, ..., date2), then
+    --    (target1, ..., date1, id1) < (target2, ..., date2, id2);
+    --    when the prefixes are equal, the id merely breaks the tie.
+    --
+    -- b) the id is a hash of all the other columns, so:
+    --    rows have the same content
+    --    <=> id1 == id2
+    --    <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
 );""",
     """
 CREATE TABLE IF NOT EXISTS object_count (
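A quick way to convince yourself of the ordering argument above: Python compares tuples lexicographically, the same way Cassandra compares clustering keys. The sketch below is illustrative only; the target/authority/date values are made up, and an integer stands in for the timestamp:

    # Two rows with distinct ids: the (target, type, url, date) prefix still
    # decides the order, and the appended id never contradicts it.
    row1 = ("swh:1:cnt:0000", "forge", "https://example.org", 1, b"id-x")
    row2 = ("swh:1:cnt:0000", "forge", "https://example.org", 2, b"id-y")
    assert row1[:4] < row2[:4]  # order without the id...
    assert row1 < row2          # ...is unchanged with the id appended

    # Identical content hashes to the same id, so the full keys are equal:
    # rewriting the row is an overwrite, not a duplicate.
    dup1 = ("swh:1:cnt:0000", "forge", "https://example.org", 1, b"same-id")
    dup2 = ("swh:1:cnt:0000", "forge", "https://example.org", 1, b"same-id")
    assert dup1 == dup2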
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -1196,6 +1196,7 @@
             try:
                 row = RawExtrinsicMetadataRow(
+                    id=metadata_entry.id,
                     type=metadata_entry.target.object_type.name.lower(),
                     target=str(metadata_entry.target),
                     authority_type=metadata_entry.authority.type.value,
@@ -1226,20 +1227,13 @@
         limit: int = 1000,
     ) -> PagedResult[RawExtrinsicMetadata]:
         if page_token is not None:
-            (after_date, after_fetcher_name, after_fetcher_url) = msgpack_loads(
-                base64.b64decode(page_token)
-            )
+            (after_date, id_) = msgpack_loads(base64.b64decode(page_token))
             if after and after_date < after:
                 raise StorageArgumentException(
                     "page_token is inconsistent with the value of 'after'."
                 )
-            entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_fetcher(  # noqa
-                str(target),
-                authority.type.value,
-                authority.url,
-                after_date,
-                after_fetcher_name,
-                after_fetcher_url,
+            entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_id(
+                str(target), authority.type.value, authority.url, after_date, id_
             )
         elif after is not None:
             entries = self._cql_runner.raw_extrinsic_metadata_get_after_date(
@@ -1287,13 +1281,7 @@
             assert len(results) == limit
             last_result = results[-1]
             next_page_token: Optional[str] = base64.b64encode(
-                msgpack_dumps(
-                    (
-                        last_result.discovery_date,
-                        last_result.fetcher.name,
-                        last_result.fetcher.version,
-                    )
-                )
+                msgpack_dumps((last_result.discovery_date, last_result.id))
             ).decode()
         else:
             next_page_token = None
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -599,23 +599,22 @@
         )
         return (m for m in metadata if m.discovery_date > after)
 
-    def raw_extrinsic_metadata_get_after_date_and_fetcher(
+    def raw_extrinsic_metadata_get_after_date_and_id(
         self,
         target: str,
         authority_type: str,
         authority_url: str,
         after_date: datetime.datetime,
-        after_fetcher_name: str,
-        after_fetcher_version: str,
+        after_id: bytes,
     ) -> Iterable[RawExtrinsicMetadataRow]:
         metadata = self._raw_extrinsic_metadata.get_from_partition_key((target,))
-        after_tuple = (after_date, after_fetcher_name, after_fetcher_version)
+        after_tuple = (after_date, after_id)
         return (
             m
             for m in metadata
             if m.authority_type == authority_type
             and m.authority_url == authority_url
-            and (m.discovery_date, m.fetcher_name, m.fetcher_version) > after_tuple
+            and (m.discovery_date, m.id) > after_tuple
         )
 
     def raw_extrinsic_metadata_get(
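For reference, the new page token is an opaque base64 string wrapping a msgpack-encoded (discovery_date, id) pair. Here is a self-contained approximation of the round-trip; swh.storage's own msgpack_dumps/msgpack_loads helpers handle datetimes transparently, whereas the plain msgpack package used in this sketch needs explicit datetime options:

    import base64
    import datetime
    import msgpack

    def encode_page_token(discovery_date: datetime.datetime, id_: bytes) -> str:
        """Pack the (discovery_date, id) cursor and base64 it into text."""
        packed = msgpack.packb((discovery_date, id_), datetime=True)
        return base64.b64encode(packed).decode()

    def decode_page_token(page_token: str) -> tuple:
        """Recover the (after_date, id_) cursor a client sent back."""
        after_date, id_ = msgpack.unpackb(base64.b64decode(page_token), timestamp=3)
        return (after_date, id_)

    date = datetime.datetime(2021, 3, 1, tzinfo=datetime.timezone.utc)
    token = encode_page_token(date, b"\x00" * 20)
    assert decode_page_token(token) == (date, b"\x00" * 20)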
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -29,7 +29,7 @@
 
     """
 
-    current_version = 171
+    current_version = 172
 
     def mktemp_dir_entry(self, entry_type, cur=None):
         self._cursor(cur).execute(
@@ -1226,7 +1226,7 @@
             INSERT INTO raw_extrinsic_metadata
                 ({', '.join(_raw_extrinsic_metadata_insert_cols)})
             VALUES ({', '.join('%s' for _ in _raw_extrinsic_metadata_insert_cols)})
-            ON CONFLICT (target, authority_id, discovery_date, fetcher_id)
+            ON CONFLICT (id)
             DO NOTHING
             """
diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
--- a/swh/storage/sql/30-schema.sql
+++ b/swh/storage/sql/30-schema.sql
@@ -17,7 +17,7 @@
 
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(171, now(), 'Work In Progress');
+      values(172, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql
--- a/swh/storage/sql/60-indexes.sql
+++ b/swh/storage/sql/60-indexes.sql
@@ -268,7 +268,7 @@
 create unique index concurrently raw_extrinsic_metadata_pkey on raw_extrinsic_metadata(id);
 alter table raw_extrinsic_metadata add primary key using index raw_extrinsic_metadata_pkey;
 
-create unique index concurrently raw_extrinsic_metadata_content_authority_date_fetcher on raw_extrinsic_metadata(target, authority_id, discovery_date, fetcher_id);
+create index concurrently raw_extrinsic_metadata_content_authority_date on raw_extrinsic_metadata(target, authority_id, discovery_date);
 
 \if :dbflavor_default
     alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
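Deduplication on the postgresql side now hinges entirely on the intrinsic id. The sketch below leans on the swh.model behavior the tests further down rely on, namely that from_dict() recomputes a missing id from the object's content; it is a sketch, not storage code:

    from swh.model.model import RawExtrinsicMetadata

    def recomputed_id(metadata: RawExtrinsicMetadata) -> bytes:
        """Drop the stored id and let from_dict() recompute it from content."""
        d = metadata.to_dict()
        d.pop("id", None)
        return RawExtrinsicMetadata.from_dict(d).id

    # Identical content yields an identical id. A second
    # raw_extrinsic_metadata_add() of the same entry therefore hits
    # ON CONFLICT (id) DO NOTHING on postgresql, and rewrites the very
    # same primary key with equal values on cassandra: a no-op either way.

The tests below exercise exactly this: adding the same entries twice, in either order, leaves a single copy per id.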
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -3771,37 +3771,28 @@
         assert obj in actual_objects
 
     def test_content_metadata_add_duplicate(self, swh_storage, sample_data):
-        """Duplicates should be silently updated."""
+        """Duplicates should be silently ignored."""
         content = sample_data.content
         fetcher = sample_data.metadata_fetcher
         authority = sample_data.metadata_authority
         content_metadata, content_metadata2 = sample_data.content_metadata[:2]
 
-        new_content_metadata2 = RawExtrinsicMetadata.from_dict(
-            {
-                **remove_keys(content_metadata2.to_dict(), ("id",)),  # recompute id
-                "format": "new-format",
-                "metadata": b"new-metadata",
-            }
-        )
-
         swh_storage.metadata_fetcher_add([fetcher])
         swh_storage.metadata_authority_add([authority])
         swh_storage.raw_extrinsic_metadata_add([content_metadata, content_metadata2])
-        swh_storage.raw_extrinsic_metadata_add([new_content_metadata2])
+        swh_storage.raw_extrinsic_metadata_add([content_metadata2, content_metadata])
 
         result = swh_storage.raw_extrinsic_metadata_get(
             content.swhid().to_extended(), authority
         )
         assert result.next_page_token is None
 
-        expected_results1 = (content_metadata, new_content_metadata2)
-        expected_results2 = (content_metadata, content_metadata2)
+        expected_results = (content_metadata, content_metadata2)
 
-        assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
-            expected_results1,  # cassandra
-            expected_results2,  # postgresql
+        assert (
+            tuple(sorted(result.results, key=lambda x: x.discovery_date))
+            == expected_results
         )
 
     def test_content_metadata_get(self, swh_storage, sample_data):
@@ -3996,19 +3987,11 @@
         origin_metadata, origin_metadata2 = sample_data.origin_metadata[:2]
         assert swh_storage.origin_add([origin]) == {"origin:add": 1}
 
-        new_origin_metadata2 = RawExtrinsicMetadata.from_dict(
-            {
-                **remove_keys(origin_metadata2.to_dict(), ("id",)),  # recompute id
-                "format": "new-format",
-                "metadata": b"new-metadata",
-            }
-        )
-
         swh_storage.metadata_fetcher_add([fetcher])
         swh_storage.metadata_authority_add([authority])
         swh_storage.raw_extrinsic_metadata_add([origin_metadata, origin_metadata2])
-        swh_storage.raw_extrinsic_metadata_add([new_origin_metadata2])
+        swh_storage.raw_extrinsic_metadata_add([origin_metadata2, origin_metadata])
 
         result = swh_storage.raw_extrinsic_metadata_get(
             Origin(origin.url).swhid(), authority
         )
@@ -4016,12 +3999,10 @@
         assert result.next_page_token is None
 
-        # which of the two behavior happens is backend-specific.
-        expected_results1 = (origin_metadata, new_origin_metadata2)
-        expected_results2 = (origin_metadata, origin_metadata2)
+        expected_results = (origin_metadata, origin_metadata2)
 
-        assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
-            expected_results1,  # cassandra
-            expected_results2,  # postgresql
+        assert (
+            tuple(sorted(result.results, key=lambda x: x.discovery_date))
+            == expected_results
         )
 
     def test_origin_metadata_get(self, swh_storage, sample_data):
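Finally, a hypothetical consumer loop over the new cursor. Here storage, target and authority are assumed to be an initialized storage instance and valid arguments, and iter_all_metadata is not part of the API:

    from typing import Iterator

    from swh.model.model import RawExtrinsicMetadata

    def iter_all_metadata(storage, target, authority) -> Iterator[RawExtrinsicMetadata]:
        """Drain raw_extrinsic_metadata_get by following next_page_token."""
        page_token = None
        while True:
            result = storage.raw_extrinsic_metadata_get(
                target, authority, page_token=page_token, limit=100
            )
            yield from result.results
            page_token = result.next_page_token
            if page_token is None:
                break

Because the cursor ends with the unique id, this loop can neither skip nor repeat entries when many of them share a discovery_date, something the old (date, fetcher_name, fetcher_version) cursor could not guarantee.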