Page MenuHomeSoftware Heritage

D3356.diff
No OneTemporary

D3356.diff

diff --git a/sql/upgrades/157.sql b/sql/upgrades/157.sql
--- a/sql/upgrades/157.sql
+++ b/sql/upgrades/157.sql
@@ -44,3 +44,20 @@
create unique index object_metadata_content_authority_date_fetcher
on object_metadata(id, authority_id, discovery_date, fetcher_id);
+
+
+-- Add context columns
+alter table object_metadata
+ add column origin text;
+alter table object_metadata
+ add column visit bigint;
+alter table object_metadata
+ add column snapshot swhid;
+alter table object_metadata
+ add column release swhid;
+alter table object_metadata
+ add column revision swhid;
+alter table object_metadata
+ add column path bytea;
+alter table object_metadata
+ add column directory swhid;
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -18,6 +18,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
from cassandra import CoordinationFailure
@@ -880,6 +881,13 @@
"fetcher_version",
"format",
"metadata",
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
]
@_prepared_statement(
@@ -897,6 +905,7 @@
fetcher_version,
format,
metadata,
+ context: Dict[str, Union[str, bytes, int]],
*,
statement,
):
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -219,6 +219,15 @@
format ascii,
metadata blob,
+ -- context
+ origin text,
+ visit bigint,
+ snapshot text,
+ release text,
+ revision text,
+ path blob,
+ directory text,
+
PRIMARY KEY ((id), authority_type, authority_url, discovery_date,
fetcher_name, fetcher_version)
);
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -8,7 +8,7 @@
import json
import random
import re
-from typing import Any, Dict, List, Iterable, Optional
+from typing import Any, Dict, List, Iterable, Optional, Union
import attr
from deprecated import deprecated
@@ -32,6 +32,7 @@
from swh.storage.utils import now
from ..exc import StorageArgumentException, HashCollision
+from ..extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .common import TOKEN_BEGIN, TOKEN_END
from .converters import (
revision_to_db,
@@ -1030,8 +1031,18 @@
raise StorageArgumentException(
"origin_url must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_get(
@@ -1062,7 +1073,10 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
if not self._cql_runner.metadata_authority_get(**authority):
raise StorageArgumentException(f"Unknown authority {authority}")
if not self._cql_runner.metadata_fetcher_get(**fetcher):
@@ -1079,6 +1093,7 @@
fetcher["version"],
format,
metadata,
+ context,
)
except TypeError as e:
raise StorageArgumentException(*e.args)
@@ -1139,6 +1154,14 @@
"metadata": entry.metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = getattr(entry, key)
+ if value is not None:
+ context[key] = value
+ result["context"] = context
+
results.append(result)
if len(results) > limit:
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -6,7 +6,7 @@
import datetime
import random
import select
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
@@ -1092,6 +1092,17 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
+ _object_metadata_context_cols = [
+ "origin",
+ "visit",
+ "snapshot",
+ "release",
+ "revision",
+ "path",
+ "directory",
+ ]
+ """The list of context columns for all artifact types."""
+
_object_metadata_insert_cols = [
"type",
"id",
@@ -1100,6 +1111,7 @@
"discovery_date",
"format",
"metadata",
+ *_object_metadata_context_cols,
]
"""List of columns of the object_metadata table, used when writing
metadata."""
@@ -1122,6 +1134,7 @@
"metadata_fetcher.id",
"metadata_fetcher.name",
"metadata_fetcher.version",
+ *_object_metadata_context_cols,
"format",
"metadata",
]
@@ -1144,6 +1157,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority_id: int,
fetcher_id: int,
@@ -1161,6 +1175,9 @@
format=format,
metadata=metadata,
)
+ for col in self._object_metadata_context_cols:
+ args[col] = context.get(col)
+
params = [args[col] for col in self._object_metadata_insert_cols]
cur.execute(query, params)
diff --git a/swh/storage/extrinsic_metadata.py b/swh/storage/extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/extrinsic_metadata.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, cast, Dict
+
+from swh.model.identifiers import PersistentId, parse_persistent_identifier
+
+from .exc import StorageArgumentException
+
+CONTEXT_KEYS: Dict[str, Dict[str, type]] = {}
+CONTEXT_KEYS["origin"] = {}
+CONTEXT_KEYS["snapshot"] = {"origin": str, "visit": int}
+CONTEXT_KEYS["release"] = {**CONTEXT_KEYS["snapshot"], "snapshot": PersistentId}
+CONTEXT_KEYS["revision"] = {**CONTEXT_KEYS["release"], "release": PersistentId}
+CONTEXT_KEYS["directory"] = {
+ **CONTEXT_KEYS["revision"],
+ "revision": PersistentId,
+ "path": bytes,
+}
+CONTEXT_KEYS["content"] = {**CONTEXT_KEYS["directory"], "directory": PersistentId}
+
+
+def check_extrinsic_metadata_context(object_type: str, context: Dict[str, Any]):
+ key_types = CONTEXT_KEYS[object_type]
+
+ extra_keys = set(context) - set(key_types)
+ if extra_keys:
+ raise StorageArgumentException(f"Unknown context keys: {', '.join(extra_keys)}")
+
+ for (key, value) in context.items():
+ expected_type = key_types[key]
+ expected_type_str = str(expected_type) # for display
+
+ # If an SWHID is expected and a string is given, parse it
+ if expected_type is PersistentId and isinstance(value, str):
+ value = parse_persistent_identifier(value)
+ expected_type_str = "PersistentId or str"
+
+ # Check the type of the context value
+ if not isinstance(value, expected_type):
+ raise StorageArgumentException(
+ f"Context key {key} must have type {expected_type_str}, "
+ f"but is {value!r}"
+ )
+
+ # If it is an SWHID, check it is also a core SWHID.
+ if expected_type is PersistentId:
+ value = cast(PersistentId, value)
+ if value.metadata != {}:
+ raise StorageArgumentException(
+ f"Context key {key} must be a core SWHID, "
+ f"but it has qualifiers {', '.join(value.metadata)}."
+ )
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -25,6 +25,7 @@
Optional,
Tuple,
TypeVar,
+ Union,
)
import attr
@@ -49,9 +50,9 @@
from swh.storage.objstorage import ObjStorage
from swh.storage.utils import now
-from .exc import StorageArgumentException, HashCollision
-
from .converters import origin_url_to_sha1
+from .exc import StorageArgumentException, HashCollision
+from .extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .utils import get_partition_bounds_bytes
from .writer import JournalWriter
@@ -1018,6 +1019,7 @@
def content_metadata_add(
self,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1025,7 +1027,14 @@
metadata: bytes,
) -> None:
self._object_metadata_add(
- "content", id, discovery_date, authority, fetcher, format, metadata,
+ "content",
+ id,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def origin_metadata_add(
@@ -1041,8 +1050,18 @@
raise StorageArgumentException(
"origin_url must be str, not %r" % (origin_url,)
)
+
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
- "origin", origin_url, discovery_date, authority, fetcher, format, metadata,
+ "origin",
+ origin_url,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ context,
)
def _object_metadata_add(
@@ -1054,7 +1073,9 @@
fetcher: Dict[str, Any],
format: str,
metadata: bytes,
+ context: Dict[str, Union[str, bytes, int]],
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
if not isinstance(metadata, bytes):
raise StorageArgumentException(
"metadata must be bytes, not %r" % (metadata,)
@@ -1077,6 +1098,9 @@
"metadata": metadata,
}
+ if CONTEXT_KEYS[object_type]:
+ object_metadata["context"] = context
+
for existing_object_metadata in object_metadata_list:
if (
existing_object_metadata["fetcher"] == fetcher_key
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -436,7 +436,16 @@
-- metadata itself
format text not null,
- metadata bytea not null
+ metadata bytea not null,
+
+ -- context
+ origin text,
+ visit bigint,
+ snapshot swhid,
+ release swhid,
+ revision swhid,
+ path bytea,
+ directory swhid
);
comment on table object_metadata is 'keeps all metadata found concerning an object';
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -10,7 +10,7 @@
from collections import defaultdict
from contextlib import contextmanager
from deprecated import deprecated
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
import attr
import psycopg2
@@ -36,6 +36,10 @@
from swh.storage.utils import now
from . import converters
+from .extrinsic_metadata import (
+ check_extrinsic_metadata_context,
+ CONTEXT_KEYS,
+)
from .common import db_transaction_generator, db_transaction
from .db import Db
from .exc import StorageArgumentException, StorageDBError, HashCollision
@@ -1162,9 +1166,12 @@
db=None,
cur=None,
) -> None:
+ context: Dict[str, Union[str, bytes, int]] = {} # origins have no context
+
self._object_metadata_add(
"origin",
origin_url,
+ context,
discovery_date,
authority,
fetcher,
@@ -1178,6 +1185,7 @@
self,
object_type: str,
id: str,
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
authority: Dict[str, Any],
fetcher: Dict[str, Any],
@@ -1186,6 +1194,8 @@
db,
cur,
) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
authority_id = self._get_authority_id(authority, db, cur)
fetcher_id = self._get_fetcher_id(fetcher, db, cur)
if not isinstance(metadata, bytes):
@@ -1196,6 +1206,7 @@
db.object_metadata_add(
object_type,
id,
+ context,
discovery_date,
authority_id,
fetcher_id,
@@ -1270,7 +1281,14 @@
for row in rows:
row = row.copy()
row.pop("metadata_fetcher.id")
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = row[key]
+ if value is not None:
+ context[key] = value
+
result = {
+ "id": row["id"],
"authority": {
"type": row.pop("metadata_authority.type"),
"url": row.pop("metadata_authority.url"),
@@ -1279,9 +1297,14 @@
"name": row.pop("metadata_fetcher.name"),
"version": row.pop("metadata_fetcher.version"),
},
- **row,
+ "discovery_date": row["discovery_date"],
+ "format": row["format"],
+ "metadata": row["metadata"],
}
+ if CONTEXT_KEYS[object_type]:
+ result["context"] = context
+
results.append(result)
if len(results) > limit:

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 7:49 PM (2 d, 16 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220639

Event Timeline