Software Heritage: D3247 [WIP] Add content_metadata_{add,get}
File: D3247.id11506.diff (31 KB)
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -6,13 +6,15 @@
 import datetime
 import random
 import select
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 from swh.core.db import BaseDb
 from swh.core.db.db_utils import stored_procedure, jsonize
 from swh.core.db.db_utils import execute_values_generator
 from swh.model.model import OriginVisit, OriginVisitStatus, SHA1_SIZE

+from . import extrinsic_metadata
+

 class Db(BaseDb):
    """Proxy to the SWH DB, with wrappers around stored procedures
@@ -1071,74 +1073,110 @@
    def release_get_random(self, cur=None):
        return self._get_random_row_from_table("release", ["id"], "id", cur)
-    origin_metadata_get_cols = [
-        "origin.url",
-        "discovery_date",
-        "metadata_authority.type",
-        "metadata_authority.url",
-        "metadata_fetcher.id",
-        "metadata_fetcher.name",
-        "metadata_fetcher.version",
-        "format",
-        "metadata",
-    ]
+    _object_metadata_context_cols = {
+        object_type: sorted(context_keys)
+        for (object_type, context_keys) in extrinsic_metadata.CONTEXT_KEYS.items()
+    }
+    """The list of context columns for each object type."""
+
+    _object_metadata_insert_cols = {
+        object_type: [
+            "id",
+            "authority_id",
+            "fetcher_id",
+            "discovery_date",
+            "format",
+            "metadata",
+        ]
+        + cols
+        for (object_type, cols) in _object_metadata_context_cols.items()
+    }
+    """List of columns of the {object_type}_metadata table, used when writing
+    metadata."""
+
+    _object_metadata_insert_queries = {
+        object_type: f"""
+            INSERT INTO {object_type}_metadata ({', '.join(cols)})
+            VALUES ({', '.join(['%s']*len(cols))})
+            ON CONFLICT (id, authority_id, discovery_date, fetcher_id)
+            DO UPDATE SET
+                format=EXCLUDED.format,
+                metadata=EXCLUDED.metadata
+        """
+        for (object_type, cols) in _object_metadata_insert_cols.items()
+    }
+
+    object_metadata_get_cols = {
+        object_type: ["id",]
+        + cols
+        + [
+            "discovery_date",
+            "metadata_authority.type",
+            "metadata_authority.url",
+            "metadata_fetcher.id",
+            "metadata_fetcher.name",
+            "metadata_fetcher.version",
+            "format",
+            "metadata",
+        ]
+        for (object_type, cols) in _object_metadata_context_cols.items()
+    }
+    """List of columns of the {object_type}_metadata, metadata_authority,
+    and metadata_fetcher tables, used when reading object metadata."""
+
+    _object_metadata_select_queries = {
+        object_type: f"""
+            SELECT {object_type}_metadata.id AS id, {', '.join(cols[1:-1])},
+                   {object_type}_metadata.metadata AS metadata
+            FROM {object_type}_metadata
+            INNER JOIN metadata_authority
+                ON (metadata_authority.id=authority_id)
+            INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id)
+            WHERE {object_type}_metadata.id=%s AND authority_id=%s
+        """
+        for (object_type, cols) in object_metadata_get_cols.items()
+    }
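
For illustration, here is what these comprehensions produce for object_type="snapshot", whose sorted context columns (from extrinsic_metadata.CONTEXT_KEYS, introduced below) are ["origin", "visit"]; a hand-rendered sketch, not code from the diff:

    # _object_metadata_insert_cols["snapshot"]
    cols = ["id", "authority_id", "fetcher_id", "discovery_date",
            "format", "metadata", "origin", "visit"]

    # _object_metadata_insert_queries["snapshot"] renders as:
    #   INSERT INTO snapshot_metadata (id, authority_id, fetcher_id,
    #       discovery_date, format, metadata, origin, visit)
    #   VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    #   ON CONFLICT (id, authority_id, discovery_date, fetcher_id)
    #   DO UPDATE SET format=EXCLUDED.format, metadata=EXCLUDED.metadata
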
-    def origin_metadata_add(
+    def object_metadata_add(
         self,
-        origin: str,
+        object_type: str,
+        id: Union[int, str],
+        context: Dict[str, Union[str, bytes, int]],
         discovery_date: datetime.datetime,
-        authority: int,
-        fetcher: int,
+        authority_id: int,
+        fetcher_id: int,
         format: str,
         metadata: bytes,
-        cur=None,
-    ) -> None:
-        """ Add an origin_metadata for the origin at ts with provider, tool and
-        metadata.
-
-        Args:
-            origin: the origin's id for which the metadata is added
-            discovery_date: time when the metadata was found
-            authority: the metadata provider identifier
-            fetcher: the tool's identifier used to extract metadata
-            format: the format of the metadata
-            metadata: the metadata retrieved at the time and location
-        """
-        cur = self._cursor(cur)
-        insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
-                    authority_id, fetcher_id, format, metadata)
-                    SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s
-                    ON CONFLICT (origin_id, authority_id, discovery_date, fetcher_id)
-                    DO UPDATE SET
-                        format=EXCLUDED.format,
-                        metadata=EXCLUDED.metadata
-                    """
-        cur.execute(
-            insert, (discovery_date, authority, fetcher, format, metadata, origin),
+        cur,
+    ):
+        query = self._object_metadata_insert_queries[object_type]
+        args = dict(
+            id=id,
+            authority_id=authority_id,
+            fetcher_id=fetcher_id,
+            discovery_date=discovery_date,
+            format=format,
+            metadata=metadata,
         )
+        for col in self._object_metadata_context_cols[object_type]:
+            args[col] = context.get(col)

-    def origin_metadata_get(
+        params = [args[col] for col in self._object_metadata_insert_cols[object_type]]
+
+        cur.execute(query, params)
+
+    def object_metadata_get(
         self,
-        origin_url: str,
-        authority: int,
+        object_type: str,
+        id: Union[int, str],
+        authority_id: int,
         after_time: Optional[datetime.datetime],
         after_fetcher: Optional[int],
-        limit: Optional[int],
-        cur=None,
+        limit: int,
+        cur,
     ):
-        cur = self._cursor(cur)
-        assert self.origin_metadata_get_cols[-1] == "metadata"
-        query_parts = [
-            f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, "
-            f"       origin_metadata.metadata AS metadata "
-            f"FROM origin_metadata "
-            f"INNER JOIN metadata_authority "
-            f"    ON (metadata_authority.id=authority_id) "
-            f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
-            f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
-            f"WHERE origin.url=%s AND authority_id=%s "
-        ]
-        args = [origin_url, authority]
+        query_parts = [self._object_metadata_select_queries[object_type]]
+        args = [id, authority_id]

         if after_fetcher is not None:
             assert after_time
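
The rest of this hunk is truncated in this view. Judging from the (after_time, after_fetcher) parameters and the removed origin_metadata_get, the remainder presumably extends query_parts with keyset-pagination filters; a sketch of the assumed shape, not text from the diff:

    # Assumed continuation (hypothetical):
    if after_fetcher is not None:
        assert after_time
        query_parts.append("AND (discovery_date, fetcher_id) > (%s, %s)")
        args.extend([after_time, after_fetcher])
    elif after_time is not None:
        query_parts.append("AND discovery_date > %s")
        args.append(after_time)
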
diff --git a/swh/storage/extrinsic_metadata.py b/swh/storage/extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/extrinsic_metadata.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, cast, Dict
+
+from swh.model.identifiers import PersistentId, parse_persistent_identifier
+
+from .exc import StorageArgumentException
+
+CONTEXT_KEYS = {}
+CONTEXT_KEYS["origin"] = {}
+CONTEXT_KEYS["snapshot"] = {"origin": str, "visit": int}
+CONTEXT_KEYS["release"] = {**CONTEXT_KEYS["snapshot"], "snapshot": PersistentId}
+CONTEXT_KEYS["revision"] = {**CONTEXT_KEYS["release"], "release": PersistentId}
+CONTEXT_KEYS["directory"] = {
+ **CONTEXT_KEYS["revision"],
+ "revision": PersistentId,
+ "path": bytes,
+}
+CONTEXT_KEYS["content"] = {**CONTEXT_KEYS["directory"], "directory": PersistentId}
+
+
+def check_extrinsic_metadata_context(object_type: str, context: Dict[str, Any]):
+    key_types = CONTEXT_KEYS[object_type]
+
+    extra_keys = set(context) - set(key_types)
+    if extra_keys:
+        raise StorageArgumentException(f"Unknown context keys: {', '.join(extra_keys)}")
+
+    for (key, value) in context.items():
+        expected_type = key_types[key]
+        expected_type_str = str(expected_type)  # for display
+
+        # If an SWHID is expected and a string is given, parse it
+        if expected_type is PersistentId and isinstance(value, str):
+            value = parse_persistent_identifier(value)
+            expected_type_str = "PersistentId or str"
+
+        # Check the type of the context value
+        if not isinstance(value, expected_type):
+            raise StorageArgumentException(
+                f"Context key {key} must have type {expected_type_str}, "
+                f"but is {value!r}"
+            )
+
+        # If it is an SWHID, check it is also a core SWHID.
+        if expected_type is PersistentId:
+            value = cast(PersistentId, value)
+            if value.metadata != {}:
+                raise StorageArgumentException(
+                    f"Context key {key} must be a core SWHID, "
+                    f"but it has qualifiers {', '.join(value.metadata)}."
+                )
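
For illustration (hypothetical values, not part of the diff), the validator behaves as follows:

    check_extrinsic_metadata_context(
        "snapshot", {"origin": "https://example.org/repo.git", "visit": 42}
    )  # ok

    check_extrinsic_metadata_context("origin", {"visit": 42})
    # raises StorageArgumentException: origins accept no context keys

    check_extrinsic_metadata_context(
        "release", {"snapshot": "swh:1:snp:" + "00" * 20}
    )  # ok: the string form is parsed into a PersistentId

    check_extrinsic_metadata_context(
        "release", {"snapshot": "swh:1:snp:" + "00" * 20 + ";origin=x"}
    )  # raises StorageArgumentException: qualified SWHIDs are not core SWHIDs
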
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -37,6 +37,9 @@
-- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
create domain file_perms as int;
+-- an SWHID
+create domain swhid as text check (value ~ '^swh:[0-9]+:.*');
+
-- Checksums about actual file content. Note that the content itself is not
-- stored in the DB, but on external (key-value) storage. A single checksum is
@@ -66,6 +69,32 @@
comment on column content.object_id is 'Content identifier';
+-- Extrinsic metadata on a content object.
+create table content_metadata
+(
+  -- content identifier
+  id             swhid not null,
+
+  -- metadata source
+  authority_id   bigint not null,
+  fetcher_id     bigint not null,
+  discovery_date timestamptz not null,
+
+  -- metadata itself
+  format         text not null,
+  metadata       bytea not null,
+
+  -- context
+  origin         text,
+  visit          bigint,
+  snapshot       swhid,
+  release        swhid,
+  revision       swhid,
+  path           bytea,
+  directory      swhid
+);
+
+
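
The context dict passed to content_metadata_add maps one-to-one onto the nullable context columns above; keys that are absent leave their column NULL (Db.object_metadata_add fills the parameters with context.get(col)). A hypothetical example:

    context = {
        "origin": "https://example.org/repo.git",  # -> origin    (text)
        "visit": 42,                               # -> visit     (bigint)
        "snapshot": "swh:1:snp:" + "00" * 20,      # -> snapshot  (swhid)
        "path": b"/foo/bar",                       # -> path      (bytea)
    }
    # "release", "revision" and "directory" are omitted here, so those
    # columns would be NULL for this row.
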
-- An origin is a place, identified by an URL, where software source code
-- artifacts can be found. We support different kinds of origins, e.g., git and
-- other VCS repositories, web pages that list tarballs URLs (e.g.,
@@ -431,8 +460,7 @@
-- also provides a translation to a defined json schema using a translation tool (tool_id)
 create table origin_metadata
 (
-  id             bigserial not null,   -- PK internal object identifier
-  origin_id      bigint not null,      -- references origin(id)
+  id             bigint not null,      -- references origin(id)
  discovery_date timestamptz not null,  -- when it was extracted
  authority_id   bigint not null,
  fetcher_id     bigint not null,
@@ -441,8 +469,7 @@
 );
comment on table origin_metadata is 'keeps all metadata found concerning an origin';
-comment on column origin_metadata.id is 'the origin_metadata object''s id';
-comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata.id is 'the origin id for which the metadata was found';
comment on column origin_metadata.discovery_date is 'the date of retrieval';
comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -167,13 +167,19 @@
create unique index metadata_authority_type_url on metadata_authority(type, url);
--- origin_metadata
-create unique index concurrently origin_metadata_pkey on origin_metadata(id);
-alter table origin_metadata add primary key using index origin_metadata_pkey;
+-- content_metadata
+create unique index concurrently content_metadata_content_authority_date_fetcher on content_metadata(id, authority_id, discovery_date, fetcher_id);
+
+alter table content_metadata add constraint content_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+alter table content_metadata validate constraint content_metadata_authority_fkey;
-create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id);
+alter table content_metadata add constraint content_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+alter table content_metadata validate constraint content_metadata_fetcher_fkey;
+
+-- origin_metadata
+create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(id, authority_id, discovery_date, fetcher_id);
-alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
+alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -36,6 +36,7 @@
from swh.storage.utils import now
from . import converters
+from .extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .common import db_transaction_generator, db_transaction
from .db import Db
from .exc import StorageArgumentException, StorageDBError, HashCollision
@@ -360,6 +361,49 @@
        )
        return [dict(zip(db.content_find_cols, content)) for content in contents]
+    @timed
+    @db_transaction()
+    def content_metadata_add(
+        self,
+        id: str,
+        context: Dict[str, Union[str, bytes, int]],
+        discovery_date: datetime.datetime,
+        authority: Dict[str, Any],
+        fetcher: Dict[str, Any],
+        format: str,
+        metadata: bytes,
+        db=None,
+        cur=None,
+    ):
+        self._object_metadata_add(
+            "content",
+            id,
+            context,
+            discovery_date,
+            authority,
+            fetcher,
+            format,
+            metadata,
+            db,
+            cur,
+        )
+
+    @timed
+    @db_transaction()
+    def content_metadata_get(
+        self,
+        id: str,
+        authority: Dict[str, str],
+        after: Optional[datetime.datetime] = None,
+        page_token: Optional[bytes] = None,
+        limit: int = 1000,
+        db=None,
+        cur=None,
+    ):
+        return self._object_metadata_get(
+            "content", id, authority, after, page_token, limit, db, cur
+        )
+
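
Together, the two public methods are used like this (a sketch with made-up values; the authority and fetcher must have been registered first, otherwise _object_metadata_add raises StorageArgumentException):

    swhid = "swh:1:cnt:" + "00" * 20  # hypothetical content SWHID
    swh_storage.content_metadata_add(
        id=swhid,
        context={"origin": "https://example.org/repo.git"},
        discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
        authority={"type": "deposit", "url": "https://example.org/"},
        fetcher={"name": "example-loader", "version": "1.0.0"},
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    page = swh_storage.content_metadata_get(
        swhid, {"type": "deposit", "url": "https://example.org/"}
    )
    # page == {"next_page_token": None, "results": [{...}]}
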
    @timed
    @db_transaction()
    def content_get_random(self, db=None, cur=None):
@@ -1263,29 +1307,64 @@
         db=None,
         cur=None,
     ) -> None:
-        authority_id = db.metadata_authority_get_id(
-            authority["type"], authority["url"], cur
-        )
-        if not authority_id:
-            raise StorageArgumentException(f"Unknown authority {authority}")
-        fetcher_id = db.metadata_fetcher_get_id(
-            fetcher["name"], fetcher["version"], cur
+        origin_id = next(iter(list(db.origin_id_get_by_url([origin_url], cur))), None)
+        if origin_id is None:
+            raise StorageArgumentException(f"Unknown origin {origin_url}")
+
+        context = {}  # origins have no context
+
+        self._object_metadata_add(
+            "origin",
+            origin_id,
+            context,
+            discovery_date,
+            authority,
+            fetcher,
+            format,
+            metadata,
+            db,
+            cur,
         )
-        if not fetcher_id:
-            raise StorageArgumentException(f"Unknown fetcher {fetcher}")
-        try:
-            db.origin_metadata_add(
-                origin_url,
-                discovery_date,
-                authority_id,
-                fetcher_id,
-                format,
-                metadata,
-                cur,
+
+    def _object_metadata_add(
+        self,
+        object_type: str,
+        id: Union[int, str],
+        context: Dict[str, Union[str, bytes, int]],
+        discovery_date: datetime.datetime,
+        authority: Dict[str, Any],
+        fetcher: Dict[str, Any],
+        format: str,
+        metadata: bytes,
+        db,
+        cur,
+    ) -> None:
+        check_extrinsic_metadata_context(object_type, context)
+
+        authority_id = self._get_authority_id(authority, db, cur)
+        fetcher_id = self._get_fetcher_id(fetcher, db, cur)
+        if not isinstance(metadata, bytes):
+            raise StorageArgumentException(
+                "metadata must be bytes, not %r" % (metadata,)
             )
-        except psycopg2.ProgrammingError as e:
-            raise StorageArgumentException(*e.args)
-        send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add")
+
+        db.object_metadata_add(
+            object_type,
+            id,
+            context,
+            discovery_date,
+            authority_id,
+            fetcher_id,
+            format,
+            metadata,
+            cur,
+        )
+
+        send_metric(
+            f"{object_type}_metadata:add",
+            count=1,
+            method_name=f"{object_type}_metadata_add",
+        )
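
For object_type="content", the f-strings above expand to the same pattern the removed origin-only code used, i.e. the equivalent of:

    send_metric(
        "content_metadata:add", count=1, method_name="content_metadata_add"
    )
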
     @timed
     @db_transaction(statement_timeout=500)
@@ -1298,6 +1377,31 @@
         limit: int = 1000,
         db=None,
         cur=None,
+    ) -> Dict[str, Any]:
+        origin_id = next(iter(list(db.origin_id_get_by_url([origin_url], cur))), None)
+        if origin_id is None:
+            raise StorageArgumentException(f"Unknown origin {origin_url}")
+
+        result = self._object_metadata_get(
+            "origin", origin_id, authority, after, page_token, limit, db, cur
+        )
+
+        for res in result["results"]:
+            res.pop("id")
+            res["origin_url"] = origin_url
+
+        return result
+
+    def _object_metadata_get(
+        self,
+        object_type: str,
+        id: Union[str, int],
+        authority: Dict[str, str],
+        after: Optional[datetime.datetime],
+        page_token: Optional[bytes],
+        limit: int,
+        db,
+        cur,
     ) -> Dict[str, Any]:
         if page_token:
             (after_time, after_fetcher) = msgpack_loads(page_token)
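
As the unpacking above shows, a page token is an opaque msgpack-serialized (after_time, after_fetcher) pair. A rough equivalent with the plain msgpack library (an illustration; the code uses swh's own msgpack helpers, which also serialize datetime objects):

    import msgpack

    # A float timestamp stands in for the datetime swh's helpers handle.
    token = msgpack.packb((1577836800.0, 17), use_bin_type=True)
    after_time, after_fetcher = msgpack.unpackb(token, raw=False)
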
@@ -1318,28 +1422,41 @@
                "results": [],
            }

-        rows = db.origin_metadata_get(
-            origin_url, authority_id, after_time, after_fetcher, limit + 1, cur
+        rows = db.object_metadata_get(
+            object_type, id, authority_id, after_time, after_fetcher, limit + 1, cur
         )
-        rows = [dict(zip(db.origin_metadata_get_cols, row)) for row in rows]
+        rows = [
+            dict(zip(db.object_metadata_get_cols[object_type], row)) for row in rows
+        ]
         results = []
         for row in rows:
             row = row.copy()
             row.pop("metadata_fetcher.id")
-            results.append(
-                {
-                    "origin_url": row.pop("origin.url"),
-                    "authority": {
-                        "type": row.pop("metadata_authority.type"),
-                        "url": row.pop("metadata_authority.url"),
-                    },
-                    "fetcher": {
-                        "name": row.pop("metadata_fetcher.name"),
-                        "version": row.pop("metadata_fetcher.version"),
-                    },
-                    **row,
-                }
-            )
+            if CONTEXT_KEYS[object_type]:
+                context = {}
+                for key in CONTEXT_KEYS[object_type]:
+                    value = row.pop(key)
+                    if value is not None:
+                        context[key] = value
+            else:
+                context = None
+
+            result = {
+                "authority": {
+                    "type": row.pop("metadata_authority.type"),
+                    "url": row.pop("metadata_authority.url"),
+                },
+                "fetcher": {
+                    "name": row.pop("metadata_fetcher.name"),
+                    "version": row.pop("metadata_fetcher.version"),
+                },
+                **row,
+            }
+
+            if context:
+                result["context"] = context
+
+            results.append(result)

         if len(results) > limit:
             results.pop()
@@ -1415,3 +1532,19 @@
     def flush(self, object_types: Optional[Iterable[str]] = None) -> Dict:
         return {}
+
+    def _get_authority_id(self, authority: Dict[str, Any], db, cur):
+        authority_id = db.metadata_authority_get_id(
+            authority["type"], authority["url"], cur
+        )
+        if not authority_id:
+            raise StorageArgumentException(f"Unknown authority {authority}")
+        return authority_id
+
+    def _get_fetcher_id(self, fetcher: Dict[str, Any], db, cur):
+        fetcher_id = db.metadata_fetcher_get_id(
+            fetcher["name"], fetcher["version"], cur
+        )
+        if not fetcher_id:
+            raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+        return fetcher_id
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import datetime
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model import from_disk
@@ -466,6 +466,66 @@
    },
 }
+content_metadata = {
+    "id": f"swh:1:cnt:{cont['sha1_git']}",
+    "context": {"origin": origin["url"]},
+    "discovery_date": datetime.datetime(
+        2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
+    ),
+    "authority": {
+        "type": metadata_authority["type"],
+        "url": metadata_authority["url"],
+    },
+    "fetcher": {
+        "name": metadata_fetcher["name"],
+        "version": metadata_fetcher["version"],
+    },
+    "format": "json",
+    "metadata": b'{"foo": "bar"}',
+}
+content_metadata2 = {
+    "id": f"swh:1:cnt:{cont['sha1_git']}",
+    "context": {"origin": origin2["url"]},
+    "discovery_date": datetime.datetime(
+        2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
+    ),
+    "authority": {
+        "type": metadata_authority["type"],
+        "url": metadata_authority["url"],
+    },
+    "fetcher": {
+        "name": metadata_fetcher["name"],
+        "version": metadata_fetcher["version"],
+    },
+    "format": "yaml",
+    "metadata": b"foo: bar",
+}
+content_metadata3 = {
+    "id": f"swh:1:cnt:{cont['sha1_git']}",
+    "context": {
+        "origin": origin["url"],
+        "visit": 42,
+        "snapshot": f"swh:1:snp:{hash_to_hex(snapshot['id'])}",
+        "release": f"swh:1:rel:{hash_to_hex(release['id'])}",
+        "revision": f"swh:1:rev:{hash_to_hex(revision['id'])}",
+        "directory": f"swh:1:dir:{hash_to_hex(dir['id'])}",
+        "path": b"/foo/bar",
+    },
+    "discovery_date": datetime.datetime(
+        2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
+    ),
+    "authority": {
+        "type": metadata_authority2["type"],
+        "url": metadata_authority2["url"],
+    },
+    "fetcher": {
+        "name": metadata_fetcher2["name"],
+        "version": metadata_fetcher2["version"],
+    },
+    "format": "yaml",
+    "metadata": b"foo: bar",
+}
+
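
Note that content_metadata3 exercises every context key a content object accepts, passing SWHIDs as strings; check_extrinsic_metadata_context (in swh/storage/extrinsic_metadata.py above) accepts the string form and parses it, so the fixture validates:

    check_extrinsic_metadata_context("content", content_metadata3["context"])  # ok
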
origin_metadata = {
"origin_url": origin["url"],
"discovery_date": datetime.datetime(
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3206,6 +3206,191 @@
         with pytest.raises(StorageArgumentException):
             swh_storage.content_find({"unknown-sha1": "something"})  # not the right key

+    def test_content_metadata_add(self, swh_storage):
+        content = data.cont
+        fetcher = data.metadata_fetcher
+        authority = data.metadata_authority
+        content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority)
+
+        swh_storage.content_metadata_add(**data.content_metadata)
+        swh_storage.content_metadata_add(**data.content_metadata2)
+
+        result = swh_storage.content_metadata_get(content_swhid, authority)
+        assert result["next_page_token"] is None
+        assert [data.content_metadata, data.content_metadata2] == list(
+            sorted(result["results"], key=lambda x: x["discovery_date"],)
+        )
+
+    def test_content_metadata_add_duplicate(self, swh_storage):
+        """Duplicates should be silently updated."""
+        content = data.cont
+        fetcher = data.metadata_fetcher
+        authority = data.metadata_authority
+        content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+        new_content_metadata2 = {
+            **data.content_metadata2,
+            "format": "new-format",
+            "metadata": b"new-metadata",
+        }
+
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority)
+
+        swh_storage.content_metadata_add(**data.content_metadata)
+        swh_storage.content_metadata_add(**data.content_metadata2)
+        swh_storage.content_metadata_add(**new_content_metadata2)
+
+        result = swh_storage.content_metadata_get(content_swhid, authority)
+        assert result["next_page_token"] is None
+        assert [data.content_metadata, new_content_metadata2] == list(
+            sorted(result["results"], key=lambda x: x["discovery_date"],)
+        )
+
+    def test_content_metadata_add_dict(self, swh_storage):
+        content = data.cont
+        fetcher = data.metadata_fetcher
+        authority = data.metadata_authority
+
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority)
+
+        kwargs = data.content_metadata.copy()
+        kwargs["metadata"] = {"foo": "bar"}
+
+        with pytest.raises(StorageArgumentException):
+            swh_storage.content_metadata_add(**kwargs)
+
+    def test_content_metadata_get(self, swh_storage):
+        authority = data.metadata_authority
+        fetcher = data.metadata_fetcher
+        authority2 = data.metadata_authority2
+        fetcher2 = data.metadata_fetcher2
+        content1_swhid = f"swh:1:cnt:{data.cont['sha1_git']}"
+        content2_swhid = f"swh:1:cnt:{data.cont2['sha1_git']}"
+
+        content1_metadata1 = data.content_metadata
+        content1_metadata2 = data.content_metadata2
+        content1_metadata3 = data.content_metadata3
+        content2_metadata = {**data.content_metadata2, "id": content2_swhid}
+
+        swh_storage.metadata_authority_add(**authority)
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority2)
+        swh_storage.metadata_fetcher_add(**fetcher2)
+
+        swh_storage.content_metadata_add(**content1_metadata1)
+        swh_storage.content_metadata_add(**content1_metadata2)
+        swh_storage.content_metadata_add(**content1_metadata3)
+        swh_storage.content_metadata_add(**content2_metadata)
+
+        result = swh_storage.content_metadata_get(content1_swhid, authority)
+        assert result["next_page_token"] is None
+        assert [content1_metadata1, content1_metadata2] == list(
+            sorted(result["results"], key=lambda x: x["discovery_date"],)
+        )
+
+        result = swh_storage.content_metadata_get(content1_swhid, authority2)
+        assert result["next_page_token"] is None
+        assert [content1_metadata3] == list(
+            sorted(result["results"], key=lambda x: x["discovery_date"],)
+        )
+
+        result = swh_storage.content_metadata_get(content2_swhid, authority)
+        assert result["next_page_token"] is None
+        assert [content2_metadata] == list(result["results"],)
+
+    def test_content_metadata_get_after(self, swh_storage):
+        content = data.cont
+        fetcher = data.metadata_fetcher
+        authority = data.metadata_authority
+        content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority)
+
+        swh_storage.content_metadata_add(**data.content_metadata)
+        swh_storage.content_metadata_add(**data.content_metadata2)
+
+        result = swh_storage.content_metadata_get(
+            content_swhid,
+            authority,
+            after=data.content_metadata["discovery_date"] - timedelta(seconds=1),
+        )
+        assert result["next_page_token"] is None
+        assert [data.content_metadata, data.content_metadata2] == list(
+            sorted(result["results"], key=lambda x: x["discovery_date"],)
+        )
+
+        result = swh_storage.content_metadata_get(
+            content_swhid, authority, after=data.content_metadata["discovery_date"]
+        )
+        assert result["next_page_token"] is None
+        assert [data.content_metadata2] == result["results"]
+
+        result = swh_storage.content_metadata_get(
+            content_swhid, authority, after=data.content_metadata2["discovery_date"]
+        )
+        assert result["next_page_token"] is None
+        assert [] == result["results"]
+
+    def test_content_metadata_get_paginate(self, swh_storage):
+        content = data.cont
+        fetcher = data.metadata_fetcher
+        authority = data.metadata_authority
+        content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+        swh_storage.metadata_fetcher_add(**fetcher)
+        swh_storage.metadata_authority_add(**authority)
+
+        swh_storage.content_metadata_add(**data.content_metadata)
+        swh_storage.content_metadata_add(**data.content_metadata2)
+
+        swh_storage.content_metadata_get(content_swhid, authority)
+
+        result = swh_storage.content_metadata_get(content_swhid, authority, limit=1)
+        assert result["next_page_token"] is not None
+        assert [data.content_metadata] == result["results"]
+
+        result = swh_storage.content_metadata_get(
+            content_swhid, authority, limit=1, page_token=result["next_page_token"]
+        )
+        assert result["next_page_token"] is None
+        assert [data.content_metadata2] == result["results"]
+
+    def test_content_metadata_get_paginate_same_date(self, swh_storage):
+        content = data.cont
+        fetcher1 = data.metadata_fetcher
+        fetcher2 = data.metadata_fetcher2
+        authority = data.metadata_authority
+        content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+        swh_storage.metadata_fetcher_add(**fetcher1)
+        swh_storage.metadata_fetcher_add(**fetcher2)
+        swh_storage.metadata_authority_add(**authority)
+
+        content_metadata2 = {
+            **data.content_metadata2,
+            "discovery_date": data.content_metadata2["discovery_date"],
+            "fetcher": {"name": fetcher2["name"], "version": fetcher2["version"],},
+        }
+
+        swh_storage.content_metadata_add(**data.content_metadata)
+        swh_storage.content_metadata_add(**content_metadata2)
+
+        result = swh_storage.content_metadata_get(content_swhid, authority, limit=1)
+        assert result["next_page_token"] is not None
+        assert [data.content_metadata] == result["results"]
+
+        result = swh_storage.content_metadata_get(
+            content_swhid, authority, limit=1, page_token=result["next_page_token"]
+        )
+        assert result["next_page_token"] is None
+        assert [content_metadata2] == result["results"]
+
     def test_object_find_by_sha1_git(self, swh_storage):
         sha1_gits = [b"00000000000000000000"]
         expected = {