Page Menu · Home · Software Heritage

D5030.diff
No OneTemporary

D5030.diff

diff --git a/sql/upgrades/172.sql b/sql/upgrades/172.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/172.sql
@@ -0,0 +1,25 @@
+-- SWH DB schema upgrade
+-- from_version: 171
+-- to_version: 172
+-- description: add raw_extrinsic_metadata.id
+
+insert into dbversion(version, release, description)
+ values(172, now(), 'Work In Progress');
+-- 1. restart swh-storage, so that it stops relying on
+-- raw_extrinsic_metadata_content_authority_date_fetcher being a unique index
+
+-- 2. rename old index
+
+alter index raw_extrinsic_metadata_content_authority_date_fetcher
+ rename to raw_extrinsic_metadata_content_authority_date_fetcher_unique;
+
+-- 3. create new (non-unique) index (excluding fetcher, because it was only
+-- needed for uniqueness, not for lookups)
+
+create index concurrently raw_extrinsic_metadata_content_authority_date
+ on raw_extrinsic_metadata(target, authority_id, discovery_date);
+
+-- 4. remove old index
+
+drop index raw_extrinsic_metadata_content_authority_date_fetcher_unique;
+
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -1016,16 +1016,15 @@
@_prepared_select_statement(
RawExtrinsicMetadataRow,
"WHERE target=? AND authority_type=? AND authority_url=? "
- "AND (discovery_date, fetcher_name, fetcher_version) > (?, ?, ?)",
+ "AND (discovery_date, id) > (?, ?)",
)
- def raw_extrinsic_metadata_get_after_date_and_fetcher(
+ def raw_extrinsic_metadata_get_after_date_and_id(
self,
target: str,
authority_type: str,
authority_url: str,
after_date: datetime.datetime,
- after_fetcher_name: str,
- after_fetcher_version: str,
+ after_id: bytes,
*,
statement,
) -> Iterable[RawExtrinsicMetadataRow]:
@@ -1033,14 +1032,7 @@
RawExtrinsicMetadataRow.from_dict,
self._execute_with_retries(
statement,
- [
- target,
- authority_type,
- authority_url,
- after_date,
- after_fetcher_name,
- after_fetcher_version,
- ],
+ [target, authority_type, authority_url, after_date, after_id,],
),
)
diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py
--- a/swh/storage/cassandra/model.py
+++ b/swh/storage/cassandra/model.py
@@ -246,10 +246,11 @@
"authority_type",
"authority_url",
"discovery_date",
- "fetcher_name",
- "fetcher_version",
+ "id",
)
+ id: bytes
+
type: str
target: str
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -188,6 +188,8 @@
);""",
"""
CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
+ id blob,
+
type text,
target text,
@@ -211,8 +213,33 @@
path blob,
directory text,
- PRIMARY KEY ((target), authority_type, authority_url, discovery_date,
- fetcher_name, fetcher_version)
+ PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)
+
+ -- An explanation is in order for this primary key:
+ --
+ -- Intuitively, the primary key should only be 'id', because two metadata
+ -- entries are the same iff the id is the same; and 'id' is used for
+ -- deduplication.
+ --
+ -- However, we also want to query by
+ -- (target, authority_type, authority_url, discovery_date)
+ -- The naive solution to this would be an extra table, to use as index;
+ -- but it means 1. extra code to keep them in sync 2. overhead when writing
+ -- 3. overhead + random reads (instead of linear) when reading.
+ --
+ -- Therefore, we use a single table for both, by adding the column
+ -- we want to query with before the id.
+ -- It solves both a) the query/order issues and b) the uniqueness issue because:
+ --
+ -- a) appending the id at the end of the primary key preserves the rows' order:
+ -- whenever the other columns differ (equal prefixes are merely tie-broken by id),
+ -- (target1, ..., date1) < (target2, ..., date2)
+ -- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2)
+ --
+ -- b) the id is a hash of all the columns, so:
+ -- rows are the same
+ -- <=> id1 == id2
+ -- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
);""",
"""
CREATE TABLE IF NOT EXISTS object_count (
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -1196,6 +1196,7 @@
try:
row = RawExtrinsicMetadataRow(
+ id=metadata_entry.id,
type=metadata_entry.target.object_type.name.lower(),
target=str(metadata_entry.target),
authority_type=metadata_entry.authority.type.value,
@@ -1226,20 +1227,13 @@
limit: int = 1000,
) -> PagedResult[RawExtrinsicMetadata]:
if page_token is not None:
- (after_date, after_fetcher_name, after_fetcher_url) = msgpack_loads(
- base64.b64decode(page_token)
- )
+ (after_date, id_) = msgpack_loads(base64.b64decode(page_token))
if after and after_date < after:
raise StorageArgumentException(
"page_token is inconsistent with the value of 'after'."
)
- entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_fetcher( # noqa
- str(target),
- authority.type.value,
- authority.url,
- after_date,
- after_fetcher_name,
- after_fetcher_url,
+ entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_id(
+ str(target), authority.type.value, authority.url, after_date, id_,
)
elif after is not None:
entries = self._cql_runner.raw_extrinsic_metadata_get_after_date(
@@ -1287,13 +1281,7 @@
assert len(results) == limit
last_result = results[-1]
next_page_token: Optional[str] = base64.b64encode(
- msgpack_dumps(
- (
- last_result.discovery_date,
- last_result.fetcher.name,
- last_result.fetcher.version,
- )
- )
+ msgpack_dumps((last_result.discovery_date, last_result.id,))
).decode()
else:
next_page_token = None
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -599,23 +599,22 @@
)
return (m for m in metadata if m.discovery_date > after)
- def raw_extrinsic_metadata_get_after_date_and_fetcher(
+ def raw_extrinsic_metadata_get_after_date_and_id(
self,
target: str,
authority_type: str,
authority_url: str,
after_date: datetime.datetime,
- after_fetcher_name: str,
- after_fetcher_version: str,
+ after_id: bytes,
) -> Iterable[RawExtrinsicMetadataRow]:
metadata = self._raw_extrinsic_metadata.get_from_partition_key((target,))
- after_tuple = (after_date, after_fetcher_name, after_fetcher_version)
+ after_tuple = (after_date, after_id)
return (
m
for m in metadata
if m.authority_type == authority_type
and m.authority_url == authority_url
- and (m.discovery_date, m.fetcher_name, m.fetcher_version) > after_tuple
+ and (m.discovery_date, m.id) > after_tuple
)
def raw_extrinsic_metadata_get(
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -29,7 +29,7 @@
"""
- current_version = 171
+ current_version = 172
def mktemp_dir_entry(self, entry_type, cur=None):
self._cursor(cur).execute(
@@ -1226,7 +1226,7 @@
INSERT INTO raw_extrinsic_metadata
({', '.join(_raw_extrinsic_metadata_insert_cols)})
VALUES ({', '.join('%s' for _ in _raw_extrinsic_metadata_insert_cols)})
- ON CONFLICT (target, authority_id, discovery_date, fetcher_id)
+ ON CONFLICT (id)
DO NOTHING
"""
diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
--- a/swh/storage/sql/30-schema.sql
+++ b/swh/storage/sql/30-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(171, now(), 'Work In Progress');
+ values(172, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql
--- a/swh/storage/sql/60-indexes.sql
+++ b/swh/storage/sql/60-indexes.sql
@@ -268,7 +268,7 @@
create unique index concurrently raw_extrinsic_metadata_pkey on raw_extrinsic_metadata(id);
alter table raw_extrinsic_metadata add primary key using index raw_extrinsic_metadata_pkey;
-create unique index concurrently raw_extrinsic_metadata_content_authority_date_fetcher on raw_extrinsic_metadata(target, authority_id, discovery_date, fetcher_id);
+create index concurrently raw_extrinsic_metadata_content_authority_date on raw_extrinsic_metadata(target, authority_id, discovery_date);
\if :dbflavor_default
alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -3771,37 +3771,28 @@
assert obj in actual_objects
def test_content_metadata_add_duplicate(self, swh_storage, sample_data):
- """Duplicates should be silently updated."""
+ """Duplicates should be silently ignored."""
content = sample_data.content
fetcher = sample_data.metadata_fetcher
authority = sample_data.metadata_authority
content_metadata, content_metadata2 = sample_data.content_metadata[:2]
- new_content_metadata2 = RawExtrinsicMetadata.from_dict(
- {
- **remove_keys(content_metadata2.to_dict(), ("id",)), # recompute id
- "format": "new-format",
- "metadata": b"new-metadata",
- }
- )
-
swh_storage.metadata_fetcher_add([fetcher])
swh_storage.metadata_authority_add([authority])
swh_storage.raw_extrinsic_metadata_add([content_metadata, content_metadata2])
- swh_storage.raw_extrinsic_metadata_add([new_content_metadata2])
+ swh_storage.raw_extrinsic_metadata_add([content_metadata2, content_metadata])
result = swh_storage.raw_extrinsic_metadata_get(
content.swhid().to_extended(), authority
)
assert result.next_page_token is None
- expected_results1 = (content_metadata, new_content_metadata2)
- expected_results2 = (content_metadata, content_metadata2)
+ expected_results = (content_metadata, content_metadata2)
- assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
- expected_results1, # cassandra
- expected_results2, # postgresql
+ assert (
+ tuple(sorted(result.results, key=lambda x: x.discovery_date,))
+ == expected_results
)
def test_content_metadata_get(self, swh_storage, sample_data):
@@ -3996,19 +3987,11 @@
origin_metadata, origin_metadata2 = sample_data.origin_metadata[:2]
assert swh_storage.origin_add([origin]) == {"origin:add": 1}
- new_origin_metadata2 = RawExtrinsicMetadata.from_dict(
- {
- **remove_keys(origin_metadata2.to_dict(), ("id",)), # recompute id
- "format": "new-format",
- "metadata": b"new-metadata",
- }
- )
-
swh_storage.metadata_fetcher_add([fetcher])
swh_storage.metadata_authority_add([authority])
swh_storage.raw_extrinsic_metadata_add([origin_metadata, origin_metadata2])
- swh_storage.raw_extrinsic_metadata_add([new_origin_metadata2])
+ swh_storage.raw_extrinsic_metadata_add([origin_metadata2, origin_metadata])
result = swh_storage.raw_extrinsic_metadata_get(
Origin(origin.url).swhid(), authority
@@ -4016,12 +3999,11 @@
assert result.next_page_token is None
# which of the two behavior happens is backend-specific.
- expected_results1 = (origin_metadata, new_origin_metadata2)
- expected_results2 = (origin_metadata, origin_metadata2)
+ expected_results = (origin_metadata, origin_metadata2)
- assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
- expected_results1, # cassandra
- expected_results2, # postgresql
+ assert (
+ tuple(sorted(result.results, key=lambda x: x.discovery_date,))
+ == expected_results
)
def test_origin_metadata_get(self, swh_storage, sample_data):

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 7:39 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218676

Event Timeline