Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123076
D3356.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D3356.diff
View Options
diff --git a/sql/upgrades/157.sql b/sql/upgrades/157.sql
--- a/sql/upgrades/157.sql
+++ b/sql/upgrades/157.sql
@@ -44,3 +44,20 @@
create unique index object_metadata_content_authority_date_fetcher
on object_metadata(id, authority_id, discovery_date, fetcher_id);
+
+
+-- Add context columns
+alter table object_metadata
+ add column origin text;
+alter table object_metadata
+ add column visit bigint;
+alter table object_metadata
+ add column snapshot swhid;
+alter table object_metadata
+ add column release swhid;
+alter table object_metadata
+ add column revision swhid;
+alter table object_metadata
+ add column path bytea;
+alter table object_metadata
+ add column directory swhid;
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -18,6 +18,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
from cassandra import CoordinationFailure
@@ -880,6 +881,13 @@
"fetcher_version",
"format",
"metadata",
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
]
@_prepared_statement(
@@ -897,6 +905,7 @@
fetcher_version,
format,
metadata,
+ context: Dict[str, Union[str, bytes, int]],
*,
statement,
):
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -219,6 +219,15 @@
format ascii,
metadata blob,
+ -- context
+ origin text,
+ visit bigint,
+ snapshot text,
+ release text,
+ revision text,
+ path blob,
+ directory text,
+
PRIMARY KEY ((id), authority_type, authority_url, discovery_date,
fetcher_name, fetcher_version)
);
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -8,7 +8,7 @@
import json
import random
import re
-from typing import Any, Dict, List, Iterable, Optional
+from typing import Any, Dict, List, Iterable, Optional, Union
import attr
from deprecated import deprecated
@@ -32,6 +32,7 @@
from swh.storage.utils import now
from ..exc import StorageArgumentException, HashCollision
+from ..extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db,
@@ -1030,8 +1031,18 @@
raise StorageArgumentException(
"origin_url must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_get(
@@ -1062,7 +1073,10 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
if not self._cql_runner.metadata_authority_get(**authority):
raise StorageArgumentException(f"Unknown authority {authority}")
if not self._cql_runner.metadata_fetcher_get(**fetcher):
@@ -1079,6 +1093,7 @@
fetcher["version"],
format,
metadata,
+ context,
)
except TypeError as e:
raise StorageArgumentException(*e.args)
@@ -1139,6 +1154,14 @@
"metadata": entry.metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = getattr(entry, key)
+ if value is not None:
+ context[key] = value
+ result["context"] = context
+
results.append(result)
if len(results) > limit:
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -6,7 +6,7 @@
import datetime
import random
import select
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
@@ -1092,6 +1092,17 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
+ _object_metadata_context_cols = [
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
+ ]
+ """The list of context columns for all artifact types."""
+
_object_metadata_insert_cols = [
"type",
"id",
@@ -1100,6 +1111,7 @@
"discovery_date",
"format",
"metadata",
+ *_object_metadata_context_cols,
]
"""List of columns of the object_metadata table, used when writing
metadata."""
@@ -1122,6 +1134,7 @@
"metadata_fetcher.id",
"metadata_fetcher.name",
"metadata_fetcher.version",
+ *_object_metadata_context_cols,
"format",
"metadata",
]
@@ -1144,6 +1157,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority_id: int,
fetcher_id: int,
@@ -1161,6 +1175,9 @@
format=format,
metadata=metadata,
)
+ for col in self._object_metadata_context_cols:
+ args[col] = context.get(col)
+
params = [args[col] for col in self._object_metadata_insert_cols]
cur.execute(query, params)
diff --git a/swh/storage/extrinsic_metadata.py b/swh/storage/extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/extrinsic_metadata.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, cast, Dict
+
+from swh.model.identifiers import PersistentId, parse_persistent_identifier
+
+from .exc import StorageArgumentException
+
+CONTEXT_KEYS: Dict[str, Dict[str, type]] = {}
+CONTEXT_KEYS["origin"] = {}
+CONTEXT_KEYS["snapshot"] = {"origin": str, "visit": int}
+CONTEXT_KEYS["release"] = {**CONTEXT_KEYS["snapshot"], "snapshot": PersistentId}
+CONTEXT_KEYS["revision"] = {**CONTEXT_KEYS["release"], "release": PersistentId}
+CONTEXT_KEYS["directory"] = {
+ **CONTEXT_KEYS["revision"],
+ "revision": PersistentId,
+ "path": bytes,
+}
+CONTEXT_KEYS["content"] = {**CONTEXT_KEYS["directory"], "directory": PersistentId}
+
+
+def check_extrinsic_metadata_context(object_type: str, context: Dict[str, Any]):
+ key_types = CONTEXT_KEYS[object_type]
+
+ extra_keys = set(context) - set(key_types)
+ if extra_keys:
+ raise StorageArgumentException(f"Unknown context keys: {', '.join(extra_keys)}")
+
+ for (key, value) in context.items():
+ expected_type = key_types[key]
+ expected_type_str = str(expected_type) # for display
+
+ # If an SWHID is expected and a string is given, parse it
+ if expected_type is PersistentId and isinstance(value, str):
+ value = parse_persistent_identifier(value)
+ expected_type_str = "PersistentId or str"
+
+ # Check the type of the context value
+ if not isinstance(value, expected_type):
+ raise StorageArgumentException(
+ f"Context key {key} must have type {expected_type_str}, "
+ f"but is {value!r}"
+ )
+
+ # If it is an SWHID, check it is also a core SWHID.
+ if expected_type is PersistentId:
+ value = cast(PersistentId, value)
+ if value.metadata != {}:
+ raise StorageArgumentException(
+ f"Context key {key} must be a core SWHID, "
+ f"but it has qualifiers {', '.join(value.metadata)}."
+ )
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,6 +25,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
import attr
@@ -49,9 +50,9 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.utils import now
-from .exc import StorageArgumentException, HashCollision
-
from .converters import origin_url_to_sha1
+from .exc import StorageArgumentException, HashCollision
+from .extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .utils import get_partition_bounds_bytes
from .writer import JournalWriter
@@ -1018,6 +1019,7 @@
def content_metadata_add(
self,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1025,7 +1027,14 @@
metadata: bytes,
) -> None:
self._object_metadata_add(
- "content", id, discovery_date, authority, fetcher, format, metadata,
+ "content",
+ id,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_add(
@@ -1041,8 +1050,18 @@
raise StorageArgumentException(
"origin_url must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def _object_metadata_add(
@@ -1054,7 +1073,9 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
if not isinstance(metadata, bytes):
raise StorageArgumentException(
"metadata must be bytes, not %r" % (metadata,)
@@ -1077,6 +1098,9 @@
"metadata": metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ object_metadata["context"] = context
+
for existing_object_metadata in object_metadata_list:
if (
existing_object_metadata["fetcher"] == fetcher_key
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -436,7 +436,16 @@
-- metadata itself
format text not null,
- metadata bytea not null
+ metadata bytea not null,
+
+ -- context
+ origin text,
+ visit bigint,
+ snapshot swhid,
+ release swhid,
+ revision swhid,
+ path bytea,
+ directory swhid
);
comment on table object_metadata is 'keeps all metadata found concerning an object';
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -10,7 +10,7 @@
from collections import defaultdict
from contextlib import contextmanager
from deprecated import deprecated
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
import attr
import psycopg2
@@ -36,6 +36,10 @@
from swh.storage.utils import now
from . import converters
+from .extrinsic_metadata import (
+ check_extrinsic_metadata_context,
+ CONTEXT_KEYS,
+)
from .common import db_transaction_generator, db_transaction
from .db import Db
from .exc import StorageArgumentException, StorageDBError, HashCollision
@@ -1162,9 +1166,12 @@
db=None,
cur=None,
) -> None:
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
"origin",
origin_url,
+ context,
discovery_date,
authority,
fetcher,
@@ -1178,6 +1185,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1186,6 +1194,8 @@
db,
cur,
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
authority_id = self._get_authority_id(authority, db, cur)
fetcher_id = self._get_fetcher_id(fetcher, db, cur)
if not isinstance(metadata, bytes):
@@ -1196,6 +1206,7 @@
db.object_metadata_add(
object_type,
id,
+ context,
discovery_date,
authority_id,
fetcher_id,
@@ -1270,7 +1281,14 @@
for row in rows:
row = row.copy()
row.pop("metadata_fetcher.id")
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = row[key]
+ if value is not None:
+ context[key] = value
+
result = {
+ "id": row["id"],
"authority": {
"type": row.pop("metadata_authority.type"),
"url": row.pop("metadata_authority.url"),
@@ -1279,9 +1297,14 @@
"name": row.pop("metadata_fetcher.name"),
"version": row.pop("metadata_fetcher.version"),
},
- **row,
+ "discovery_date": row["discovery_date"],
+ "format": row["format"],
+ "metadata": row["metadata"],
}
+ if CONTEXT_KEYS[object_type]:
+ result["context"] = context
+
results.append(result)
if len(results) > limit:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 7:49 PM (2 d, 16 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220639
Attached To
D3356: Add context columns to object_metadata table and object_metadata_{add,get}.
Event Timeline
Log In to Comment