Page Menu · Home · Software Heritage

D5030.diff
No OneTemporary

D5030.diff

diff --git a/sql/upgrades/172.sql b/sql/upgrades/172.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/172.sql
@@ -0,0 +1,25 @@
+-- SWH DB schema upgrade
+-- from_version: 171
+-- to_version: 172
+-- description: add raw_extrinsic_metadata.id
+
+insert into dbversion(version, release, description)
+ values(172, now(), 'Work In Progress');
+-- 1. restart swh-storage, so that it stops relying on
+-- raw_extrinsic_metadata_content_authority_date_fetcher being a unique index
+
+-- 2. rename old index
+
+alter index raw_extrinsic_metadata_content_authority_date_fetcher
+ rename to raw_extrinsic_metadata_content_authority_date_fetcher_unique;
+
+-- 3. create new (non-unique) index (excluding fetcher, because it was only
+-- needed for uniqueness, not for lookups)
+
+create index concurrently raw_extrinsic_metadata_content_authority_date
+ on raw_extrinsic_metadata(target, authority_id, discovery_date);
+
+-- 4. remove old index
+
+drop index raw_extrinsic_metadata_content_authority_date_fetcher_unique;
+
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -1016,16 +1016,15 @@
@_prepared_select_statement(
RawExtrinsicMetadataRow,
"WHERE target=? AND authority_type=? AND authority_url=? "
- "AND (discovery_date, fetcher_name, fetcher_version) > (?, ?, ?)",
+ "AND (discovery_date, id) > (?, ?)",
)
- def raw_extrinsic_metadata_get_after_date_and_fetcher(
+ def raw_extrinsic_metadata_get_after_date_and_id(
self,
target: str,
authority_type: str,
authority_url: str,
after_date: datetime.datetime,
- after_fetcher_name: str,
- after_fetcher_version: str,
+ after_id: bytes,
*,
statement,
) -> Iterable[RawExtrinsicMetadataRow]:
@@ -1033,14 +1032,7 @@
RawExtrinsicMetadataRow.from_dict,
self._execute_with_retries(
statement,
- [
- target,
- authority_type,
- authority_url,
- after_date,
- after_fetcher_name,
- after_fetcher_version,
- ],
+ [target, authority_type, authority_url, after_date, after_id,],
),
)
diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py
--- a/swh/storage/cassandra/model.py
+++ b/swh/storage/cassandra/model.py
@@ -246,10 +246,11 @@
"authority_type",
"authority_url",
"discovery_date",
- "fetcher_name",
- "fetcher_version",
+ "id",
)
+ id: bytes
+
type: str
target: str
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -188,6 +188,8 @@
);""",
"""
CREATE TABLE IF NOT EXISTS raw_extrinsic_metadata (
+ id blob,
+
type text,
target text,
@@ -211,8 +213,33 @@
path blob,
directory text,
- PRIMARY KEY ((target), authority_type, authority_url, discovery_date,
- fetcher_name, fetcher_version)
+ PRIMARY KEY ((target), authority_type, authority_url, discovery_date, id)
+
+ -- An explanation is in order for this primary key:
+ --
+ -- Intuitively, the primary key should only be 'id', because two metadata
+ -- entries are the same iff the id is the same; and 'id' is used for
+ -- deduplication.
+ --
+ -- However, we also want to query by
+ -- (target, authority_type, authority_url, discovery_date)
+ -- The naive solution to this would be an extra table, to use as index;
+ -- but it means 1. extra code to keep them in sync 2. overhead when writing
+ -- 3. overhead + random reads (instead of linear) when reading.
+ --
+ -- Therefore, we use a single table for both, by adding the column
+ -- we want to query with before the id.
+ -- It solves both a) the query/order issues and b) the uniqueness issue because:
+ --
+ -- a) appending the id at the end of the primary key preserves the rows' order:
+ -- whenever the other columns differ (equal prefixes are merely tie-broken by id),
+ -- (target1, ..., date1) < (target2, ..., date2)
+ -- <=> (target1, ..., date1, id1) < (target2, ..., date2, id2)
+ --
+ -- b) the id is a hash of all the columns, so:
+ -- rows are the same
+ -- <=> id1 == id2
+ -- <=> (target1, ..., date1, id1) == (target2, ..., date2, id2)
);""",
"""
CREATE TABLE IF NOT EXISTS object_count (
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -1196,6 +1196,7 @@
try:
row = RawExtrinsicMetadataRow(
+ id=metadata_entry.id,
type=metadata_entry.target.object_type.name.lower(),
target=str(metadata_entry.target),
authority_type=metadata_entry.authority.type.value,
@@ -1226,20 +1227,13 @@
limit: int = 1000,
) -> PagedResult[RawExtrinsicMetadata]:
if page_token is not None:
- (after_date, after_fetcher_name, after_fetcher_url) = msgpack_loads(
- base64.b64decode(page_token)
- )
+ (after_date, id_) = msgpack_loads(base64.b64decode(page_token))
if after and after_date < after:
raise StorageArgumentException(
"page_token is inconsistent with the value of 'after'."
)
- entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_fetcher( # noqa
- str(target),
- authority.type.value,
- authority.url,
- after_date,
- after_fetcher_name,
- after_fetcher_url,
+ entries = self._cql_runner.raw_extrinsic_metadata_get_after_date_and_id(
+ str(target), authority.type.value, authority.url, after_date, id_,
)
elif after is not None:
entries = self._cql_runner.raw_extrinsic_metadata_get_after_date(
@@ -1287,13 +1281,7 @@
assert len(results) == limit
last_result = results[-1]
next_page_token: Optional[str] = base64.b64encode(
- msgpack_dumps(
- (
- last_result.discovery_date,
- last_result.fetcher.name,
- last_result.fetcher.version,
- )
- )
+ msgpack_dumps((last_result.discovery_date, last_result.id,))
).decode()
else:
next_page_token = None
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -599,23 +599,22 @@
)
return (m for m in metadata if m.discovery_date > after)
- def raw_extrinsic_metadata_get_after_date_and_fetcher(
+ def raw_extrinsic_metadata_get_after_date_and_id(
self,
target: str,
authority_type: str,
authority_url: str,
after_date: datetime.datetime,
- after_fetcher_name: str,
- after_fetcher_version: str,
+ after_id: bytes,
) -> Iterable[RawExtrinsicMetadataRow]:
metadata = self._raw_extrinsic_metadata.get_from_partition_key((target,))
- after_tuple = (after_date, after_fetcher_name, after_fetcher_version)
+ after_tuple = (after_date, after_id)
return (
m
for m in metadata
if m.authority_type == authority_type
and m.authority_url == authority_url
- and (m.discovery_date, m.fetcher_name, m.fetcher_version) > after_tuple
+ and (m.discovery_date, m.id) > after_tuple
)
def raw_extrinsic_metadata_get(
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -29,7 +29,7 @@
"""
- current_version = 171
+ current_version = 172
def mktemp_dir_entry(self, entry_type, cur=None):
self._cursor(cur).execute(
@@ -1226,7 +1226,7 @@
INSERT INTO raw_extrinsic_metadata
({', '.join(_raw_extrinsic_metadata_insert_cols)})
VALUES ({', '.join('%s' for _ in _raw_extrinsic_metadata_insert_cols)})
- ON CONFLICT (target, authority_id, discovery_date, fetcher_id)
+ ON CONFLICT (id)
DO NOTHING
"""
diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
--- a/swh/storage/sql/30-schema.sql
+++ b/swh/storage/sql/30-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(171, now(), 'Work In Progress');
+ values(172, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql
--- a/swh/storage/sql/60-indexes.sql
+++ b/swh/storage/sql/60-indexes.sql
@@ -268,7 +268,7 @@
create unique index concurrently raw_extrinsic_metadata_pkey on raw_extrinsic_metadata(id);
alter table raw_extrinsic_metadata add primary key using index raw_extrinsic_metadata_pkey;
-create unique index concurrently raw_extrinsic_metadata_content_authority_date_fetcher on raw_extrinsic_metadata(target, authority_id, discovery_date, fetcher_id);
+create index concurrently raw_extrinsic_metadata_content_authority_date on raw_extrinsic_metadata(target, authority_id, discovery_date);
\if :dbflavor_default
alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -3771,37 +3771,28 @@
assert obj in actual_objects
def test_content_metadata_add_duplicate(self, swh_storage, sample_data):
- """Duplicates should be silently updated."""
+ """Duplicates should be silently ignored."""
content = sample_data.content
fetcher = sample_data.metadata_fetcher
authority = sample_data.metadata_authority
content_metadata, content_metadata2 = sample_data.content_metadata[:2]
- new_content_metadata2 = RawExtrinsicMetadata.from_dict(
- {
- **remove_keys(content_metadata2.to_dict(), ("id",)), # recompute id
- "format": "new-format",
- "metadata": b"new-metadata",
- }
- )
-
swh_storage.metadata_fetcher_add([fetcher])
swh_storage.metadata_authority_add([authority])
swh_storage.raw_extrinsic_metadata_add([content_metadata, content_metadata2])
- swh_storage.raw_extrinsic_metadata_add([new_content_metadata2])
+ swh_storage.raw_extrinsic_metadata_add([content_metadata2, content_metadata])
result = swh_storage.raw_extrinsic_metadata_get(
content.swhid().to_extended(), authority
)
assert result.next_page_token is None
- expected_results1 = (content_metadata, new_content_metadata2)
- expected_results2 = (content_metadata, content_metadata2)
+ expected_results = (content_metadata, content_metadata2)
- assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
- expected_results1, # cassandra
- expected_results2, # postgresql
+ assert (
+ tuple(sorted(result.results, key=lambda x: x.discovery_date,))
+ == expected_results
)
def test_content_metadata_get(self, swh_storage, sample_data):
@@ -3996,19 +3987,11 @@
origin_metadata, origin_metadata2 = sample_data.origin_metadata[:2]
assert swh_storage.origin_add([origin]) == {"origin:add": 1}
- new_origin_metadata2 = RawExtrinsicMetadata.from_dict(
- {
- **remove_keys(origin_metadata2.to_dict(), ("id",)), # recompute id
- "format": "new-format",
- "metadata": b"new-metadata",
- }
- )
-
swh_storage.metadata_fetcher_add([fetcher])
swh_storage.metadata_authority_add([authority])
swh_storage.raw_extrinsic_metadata_add([origin_metadata, origin_metadata2])
- swh_storage.raw_extrinsic_metadata_add([new_origin_metadata2])
+ swh_storage.raw_extrinsic_metadata_add([origin_metadata2, origin_metadata])
result = swh_storage.raw_extrinsic_metadata_get(
Origin(origin.url).swhid(), authority
@@ -4016,12 +3999,11 @@
assert result.next_page_token is None
# which of the two behavior happens is backend-specific.
- expected_results1 = (origin_metadata, new_origin_metadata2)
- expected_results2 = (origin_metadata, origin_metadata2)
+ expected_results = (origin_metadata, origin_metadata2)
- assert tuple(sorted(result.results, key=lambda x: x.discovery_date,)) in (
- expected_results1, # cassandra
- expected_results2, # postgresql
+ assert (
+ tuple(sorted(result.results, key=lambda x: x.discovery_date,))
+ == expected_results
)
def test_origin_metadata_get(self, swh_storage, sample_data):

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 7:39 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218676

Event Timeline