Page MenuHomeSoftware Heritage

D3356.id11905.diff
No OneTemporary

D3356.id11905.diff

diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -17,6 +17,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
from cassandra import CoordinationFailure
@@ -806,6 +807,13 @@
"fetcher_version",
"format",
"metadata",
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
]
@_prepared_statement(
@@ -823,6 +831,7 @@
fetcher_version,
format,
metadata,
+ context: Dict[str, Union[str, bytes, int]],
*,
statement,
):
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -219,6 +219,15 @@
format ascii,
metadata blob,
+ -- context
+ origin text,
+ visit bigint,
+ snapshot text,
+ release text,
+ revision text,
+ path blob,
+ directory text,
+
PRIMARY KEY ((id), authority_type, authority_url, discovery_date,
fetcher_name, fetcher_version)
);
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -8,7 +8,7 @@
import json
import random
import re
-from typing import Any, Dict, List, Iterable, Optional
+from typing import Any, Dict, List, Iterable, Optional, Union
import attr
from deprecated import deprecated
@@ -32,6 +32,7 @@
from swh.storage.utils import now
from ..exc import StorageArgumentException, HashCollision
+from ..extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db,
@@ -1021,8 +1022,18 @@
raise StorageArgumentException(
"origin_id must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_get(
@@ -1053,7 +1064,10 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
if not self._cql_runner.metadata_authority_get(**authority):
raise StorageArgumentException(f"Unknown authority {authority}")
if not self._cql_runner.metadata_fetcher_get(**fetcher):
@@ -1070,6 +1084,7 @@
fetcher["version"],
format,
metadata,
+ context,
)
except TypeError as e:
raise StorageArgumentException(*e.args)
@@ -1130,6 +1145,14 @@
"metadata": entry.metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = getattr(entry, key)
+ if value is not None:
+ context[key] = value
+ result["context"] = context
+
results.append(result)
if len(results) > limit:
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -6,7 +6,7 @@
import datetime
import random
import select
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
@@ -1082,6 +1082,17 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
+ _object_metadata_context_cols = [
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
+ ]
+ """The list of context columns for all artifact types."""
+
_object_metadata_insert_cols = [
"type",
"id",
@@ -1090,6 +1101,7 @@
"discovery_date",
"format",
"metadata",
+ *_object_metadata_context_cols,
]
"""List of columns of the object_metadata table, used when writing
metadata."""
@@ -1112,6 +1124,7 @@
"metadata_fetcher.id",
"metadata_fetcher.name",
"metadata_fetcher.version",
+ *_object_metadata_context_cols,
"format",
"metadata",
]
@@ -1134,6 +1147,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority_id: int,
fetcher_id: int,
@@ -1151,6 +1165,9 @@
format=format,
metadata=metadata,
)
+ for col in self._object_metadata_context_cols:
+ args[col] = context.get(col)
+
params = [args[col] for col in self._object_metadata_insert_cols]
cur.execute(query, params)
diff --git a/swh/storage/extrinsic_metadata.py b/swh/storage/extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/extrinsic_metadata.py
@@ -0,0 +1,82 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, cast, Dict
+
+from swh.model.identifiers import PersistentId, parse_persistent_identifier
+
+from .exc import StorageArgumentException
+
+# For each object type, the context keys it accepts, mapped to the Python
+# type expected for the value. Each entry extends the previous one.
+CONTEXT_KEYS: Dict[str, Dict[str, type]] = {}
+CONTEXT_KEYS["origin"] = {}
+CONTEXT_KEYS["snapshot"] = {"origin": str, "visit": int}
+CONTEXT_KEYS["release"] = {**CONTEXT_KEYS["snapshot"], "snapshot": PersistentId}
+CONTEXT_KEYS["revision"] = {**CONTEXT_KEYS["release"], "release": PersistentId}
+CONTEXT_KEYS["directory"] = {
+    **CONTEXT_KEYS["revision"],
+    "revision": PersistentId,
+    "path": bytes,
+}
+CONTEXT_KEYS["content"] = {**CONTEXT_KEYS["directory"], "directory": PersistentId}
+
+# "content" accepts every key, so its keys are the full set of context keys.
+ALL_CONTEXT_KEYS = set(CONTEXT_KEYS["content"])
+
+
+def check_extrinsic_metadata_context(object_type: str, context: Dict[str, Any]):
+    """Check that ``context`` is a valid context for ``object_type``.
+
+    Every key of ``context`` must be listed in ``CONTEXT_KEYS[object_type]``
+    and its value must be an instance of the type listed there. Values for
+    SWHID keys may also be given as ``str``: they are parsed for the check
+    only (``context`` itself is left untouched) and must be core SWHIDs,
+    i.e. carry no qualifiers.
+
+    Raises:
+        StorageArgumentException: on an unknown key, a value of the wrong
+            type, or an SWHID with qualifiers.
+        KeyError: if ``object_type`` is not a key of ``CONTEXT_KEYS``.
+
+    Whatever ``parse_persistent_identifier`` raises on a malformed string
+    propagates unchanged.
+    """
+    key_types = CONTEXT_KEYS[object_type]
+
+    extra_keys = set(context) - set(key_types)
+    if extra_keys:
+        # Sort and stringify so the message is deterministic and building it
+        # cannot itself fail on a non-str key.
+        extra_keys_str = ", ".join(sorted(map(str, extra_keys)))
+        raise StorageArgumentException(f"Unknown context keys: {extra_keys_str}")
+
+    for (key, value) in context.items():
+        expected_type = key_types[key]
+
+        if expected_type is PersistentId:
+            # SWHIDs are accepted either parsed or as a string, so the type
+            # error below must advertise both.
+            expected_type_str = "PersistentId or str"
+            if isinstance(value, str):
+                value = parse_persistent_identifier(value)
+        else:
+            expected_type_str = str(expected_type)  # for display
+
+        # Check the type of the context value
+        if not isinstance(value, expected_type):
+            raise StorageArgumentException(
+                f"Context key {key} must have type {expected_type_str}, "
+                f"but is {value!r}"
+            )
+
+        # If it is an SWHID, check it is also a core SWHID.
+        if expected_type is PersistentId:
+            value = cast(PersistentId, value)
+            if value.metadata != {}:
+                raise StorageArgumentException(
+                    f"Context key {key} must be a core SWHID, "
+                    f"but it has qualifiers {', '.join(value.metadata)}."
+                )
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,6 +25,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
import attr
@@ -49,9 +50,9 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.utils import now
-from .exc import StorageArgumentException, HashCollision
-
from .converters import origin_url_to_sha1
+from .exc import StorageArgumentException, HashCollision
+from .extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .utils import get_partition_bounds_bytes
from .writer import JournalWriter
@@ -1009,6 +1010,7 @@
def content_metadata_add(
self,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1016,7 +1018,14 @@
metadata: bytes,
) -> None:
self._object_metadata_add(
- "content", id, discovery_date, authority, fetcher, format, metadata,
+ "content",
+ id,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_add(
@@ -1032,8 +1041,18 @@
raise StorageArgumentException(
"origin_id must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def _object_metadata_add(
@@ -1045,7 +1064,9 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
if not isinstance(metadata, bytes):
raise StorageArgumentException(
"metadata must be bytes, not %r" % (metadata,)
@@ -1068,6 +1089,9 @@
"metadata": metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ object_metadata["context"] = context
+
for existing_object_metadata in object_metadata_list:
if (
existing_object_metadata["fetcher"] == fetcher_key
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -436,7 +436,16 @@
-- metadata itself
format text not null,
- metadata bytea not null
+ metadata bytea not null,
+
+ -- context
+ origin text,
+ visit bigint,
+ snapshot swhid,
+ release swhid,
+ revision swhid,
+ path bytea,
+ directory swhid
);
comment on table object_metadata is 'keeps all metadata found concerning an object';
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -10,7 +10,7 @@
from collections import defaultdict
from contextlib import contextmanager
from deprecated import deprecated
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
import attr
import psycopg2
@@ -36,6 +36,11 @@
from swh.storage.utils import now
from . import converters
+from .extrinsic_metadata import (
+ check_extrinsic_metadata_context,
+ CONTEXT_KEYS,
+ ALL_CONTEXT_KEYS,
+)
from .common import db_transaction_generator, db_transaction
from .db import Db
from .exc import StorageArgumentException, StorageDBError, HashCollision
@@ -1164,9 +1169,12 @@
if origin_id is None:
raise StorageArgumentException(f"Unknown origin {origin_url}")
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
"origin",
origin_url,
+ context,
discovery_date,
authority,
fetcher,
@@ -1180,6 +1188,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1188,6 +1197,8 @@
db,
cur,
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
authority_id = self._get_authority_id(authority, db, cur)
fetcher_id = self._get_fetcher_id(fetcher, db, cur)
if not isinstance(metadata, bytes):
@@ -1198,6 +1209,7 @@
db.object_metadata_add(
object_type,
id,
+ context,
discovery_date,
authority_id,
fetcher_id,
@@ -1276,6 +1288,13 @@
for row in rows:
row = row.copy()
row.pop("metadata_fetcher.id")
+ context = {}
+ for key in ALL_CONTEXT_KEYS:
+ value = row.pop(key)
+ if key in CONTEXT_KEYS[object_type]:
+ if value is not None:
+ context[key] = value
+
result = {
"authority": {
"type": row.pop("metadata_authority.type"),
@@ -1288,6 +1307,9 @@
**row,
}
+ if CONTEXT_KEYS[object_type]:
+ result["context"] = context
+
results.append(result)
if len(results) > limit:

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 7:54 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221890

Event Timeline