
D3247.id11506.diff

diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -6,13 +6,15 @@
import datetime
import random
import select
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
from swh.core.db.db_utils import execute_values_generator
from swh.model.model import OriginVisit, OriginVisitStatus, SHA1_SIZE
+from . import extrinsic_metadata
+
class Db(BaseDb):
"""Proxy to the SWH DB, with wrappers around stored procedures
@@ -1071,74 +1073,110 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
- origin_metadata_get_cols = [
- "origin.url",
- "discovery_date",
- "metadata_authority.type",
- "metadata_authority.url",
- "metadata_fetcher.id",
- "metadata_fetcher.name",
- "metadata_fetcher.version",
- "format",
- "metadata",
- ]
+ _object_metadata_context_cols = {
+ object_type: sorted(context_keys)
+ for (object_type, context_keys) in extrinsic_metadata.CONTEXT_KEYS.items()
+ }
+ """The list of context columns for each object type."""
+
+ _object_metadata_insert_cols = {
+ object_type: [
+ "id",
+ "authority_id",
+ "fetcher_id",
+ "discovery_date",
+ "format",
+ "metadata",
+ ]
+ + cols
+ for (object_type, cols) in _object_metadata_context_cols.items()
+ }
+ """List of columns of the {object_type}_metadata table, used when writing
+ metadata."""
+
+ _object_metadata_insert_queries = {
+ object_type: f"""
+ INSERT INTO {object_type}_metadata ({', '.join(cols)})
+ VALUES ({', '.join(['%s']*len(cols))})
+ ON CONFLICT (id, authority_id, discovery_date, fetcher_id)
+ DO UPDATE SET
+ format=EXCLUDED.format,
+ metadata=EXCLUDED.metadata
+ """
+ for (object_type, cols) in _object_metadata_insert_cols.items()
+ }
+
+ object_metadata_get_cols = {
+ object_type: ["id",]
+ + cols
+ + [
+ "discovery_date",
+ "metadata_authority.type",
+ "metadata_authority.url",
+ "metadata_fetcher.id",
+ "metadata_fetcher.name",
+ "metadata_fetcher.version",
+ "format",
+ "metadata",
+ ]
+ for (object_type, cols) in _object_metadata_context_cols.items()
+ }
+ """List of columns of the {object_type}_metadata, metadata_authority,
+ and metadata_fetcher tables, used when reading object metadata."""
+
+ _object_metadata_select_queries = {
+ object_type: f"""
+ SELECT {object_type}_metadata.id AS id, {', '.join(cols[1:-1])},
+ {object_type}_metadata.metadata AS metadata
+ FROM {object_type}_metadata
+ INNER JOIN metadata_authority
+ ON (metadata_authority.id=authority_id)
+ INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id)
+ WHERE {object_type}_metadata.id=%s AND authority_id=%s
+ """
+ for (object_type, cols) in object_metadata_get_cols.items()
+ }
- def origin_metadata_add(
+ def object_metadata_add(
self,
- origin: str,
+ object_type: str,
+ id: Union[int, str],
+ context: Dict[str, Union[str, bytes, int]],
discovery_date: datetime.datetime,
- authority: int,
- fetcher: int,
+ authority_id: int,
+ fetcher_id: int,
format: str,
metadata: bytes,
- cur=None,
- ) -> None:
- """ Add an origin_metadata for the origin at ts with provider, tool and
- metadata.
-
- Args:
- origin: the origin's id for which the metadata is added
- discovery_date: time when the metadata was found
- authority: the metadata provider identifier
- fetcher: the tool's identifier used to extract metadata
- format: the format of the metadata
- metadata: the metadata retrieved at the time and location
- """
- cur = self._cursor(cur)
- insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
- authority_id, fetcher_id, format, metadata)
- SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s
- ON CONFLICT (origin_id, authority_id, discovery_date, fetcher_id)
- DO UPDATE SET
- format=EXCLUDED.format,
- metadata=EXCLUDED.metadata
- """
- cur.execute(
- insert, (discovery_date, authority, fetcher, format, metadata, origin),
+ cur,
+ ):
+ query = self._object_metadata_insert_queries[object_type]
+ args = dict(
+ id=id,
+ authority_id=authority_id,
+ fetcher_id=fetcher_id,
+ discovery_date=discovery_date,
+ format=format,
+ metadata=metadata,
)
+ for col in self._object_metadata_context_cols[object_type]:
+ args[col] = context.get(col)
- def origin_metadata_get(
+ params = [args[col] for col in self._object_metadata_insert_cols[object_type]]
+
+ cur.execute(query, params)
+
+ def object_metadata_get(
self,
- origin_url: str,
- authority: int,
+ object_type: str,
+        id: Union[int, str],
+ authority_id: int,
after_time: Optional[datetime.datetime],
after_fetcher: Optional[int],
- limit: Optional[int],
- cur=None,
+ limit: int,
+ cur,
):
- cur = self._cursor(cur)
- assert self.origin_metadata_get_cols[-1] == "metadata"
- query_parts = [
- f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, "
- f" origin_metadata.metadata AS metadata "
- f"FROM origin_metadata "
- f"INNER JOIN metadata_authority "
- f" ON (metadata_authority.id=authority_id) "
- f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
- f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
- f"WHERE origin.url=%s AND authority_id=%s "
- ]
- args = [origin_url, authority]
+ query_parts = [self._object_metadata_select_queries[object_type]]
+ args = [id, authority_id]
if after_fetcher is not None:
assert after_time
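
A minimal standalone sketch of how the per-type INSERT query above is
assembled, with CONTEXT_KEYS inlined for the "snapshot" case (assumption:
the real mapping lives in swh/storage/extrinsic_metadata.py, added below):

    # Sketch only: CONTEXT_KEYS["snapshot"] inlined; the real code builds
    # one query per object type from extrinsic_metadata.CONTEXT_KEYS.
    context_cols = sorted({"origin": str, "visit": int})  # -> ['origin', 'visit']
    insert_cols = [
        "id", "authority_id", "fetcher_id", "discovery_date", "format", "metadata",
    ] + context_cols

    query = f"""
        INSERT INTO snapshot_metadata ({', '.join(insert_cols)})
        VALUES ({', '.join(['%s'] * len(insert_cols))})
        ON CONFLICT (id, authority_id, discovery_date, fetcher_id)
        DO UPDATE SET
            format=EXCLUDED.format,
            metadata=EXCLUDED.metadata
    """
    print(query)
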
diff --git a/swh/storage/extrinsic_metadata.py b/swh/storage/extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/extrinsic_metadata.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, cast, Dict
+
+from swh.model.identifiers import PersistentId, parse_persistent_identifier
+
+from .exc import StorageArgumentException
+
+CONTEXT_KEYS = {}
+CONTEXT_KEYS["origin"] = {}
+CONTEXT_KEYS["snapshot"] = {"origin": str, "visit": int}
+CONTEXT_KEYS["release"] = {**CONTEXT_KEYS["snapshot"], "snapshot": PersistentId}
+CONTEXT_KEYS["revision"] = {**CONTEXT_KEYS["release"], "release": PersistentId}
+CONTEXT_KEYS["directory"] = {
+ **CONTEXT_KEYS["revision"],
+ "revision": PersistentId,
+ "path": bytes,
+}
+CONTEXT_KEYS["content"] = {**CONTEXT_KEYS["directory"], "directory": PersistentId}
+
+
+def check_extrinsic_metadata_context(object_type: str, context: Dict[str, Any]):
+ key_types = CONTEXT_KEYS[object_type]
+
+ extra_keys = set(context) - set(key_types)
+ if extra_keys:
+ raise StorageArgumentException(f"Unknown context keys: {', '.join(extra_keys)}")
+
+ for (key, value) in context.items():
+ expected_type = key_types[key]
+ expected_type_str = str(expected_type) # for display
+
+ # If an SWHID is expected and a string is given, parse it
+ if expected_type is PersistentId and isinstance(value, str):
+ value = parse_persistent_identifier(value)
+ expected_type_str = "PersistentId or str"
+
+ # Check the type of the context value
+ if not isinstance(value, expected_type):
+ raise StorageArgumentException(
+ f"Context key {key} must have type {expected_type_str}, "
+ f"but is {value!r}"
+ )
+
+ # If it is an SWHID, check it is also a core SWHID.
+ if expected_type is PersistentId:
+ value = cast(PersistentId, value)
+ if value.metadata != {}:
+ raise StorageArgumentException(
+ f"Context key {key} must be a core SWHID, "
+ f"but it has qualifiers {', '.join(value.metadata)}."
+ )
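
A usage sketch for the validator above (assumes an environment where
swh.model and this new module are importable; the SWHID hash is illustrative):

    from swh.storage.extrinsic_metadata import check_extrinsic_metadata_context
    from swh.storage.exc import StorageArgumentException

    # Core SWHID strings are parsed and accepted for PersistentId-typed keys:
    check_extrinsic_metadata_context(
        "content",
        {
            "origin": "https://example.org/repo.git",
            "directory": "swh:1:dir:" + "00" * 20,  # illustrative hash
        },
    )

    # Unknown keys (and qualified SWHIDs) are rejected:
    try:
        check_extrinsic_metadata_context("content", {"unknown_key": 42})
    except StorageArgumentException as e:
        print(e)  # Unknown context keys: unknown_key
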
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -37,6 +37,9 @@
-- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
create domain file_perms as int;
+-- an SWHID
+create domain swhid as text check (value ~ '^swh:[0-9]+:.*');
+
-- Checksums about actual file content. Note that the content itself is not
-- stored in the DB, but on external (key-value) storage. A single checksum is
@@ -66,6 +69,32 @@
comment on column content.object_id is 'Content identifier';
+-- Extrinsic metadata on a content object.
+create table content_metadata
+(
+ -- content identifier
+ id swhid not null,
+
+ -- metadata source
+ authority_id bigint not null,
+ fetcher_id bigint not null,
+ discovery_date timestamptz not null,
+
+ -- metadata itself
+ format text not null,
+ metadata bytea not null,
+
+ -- context
+ origin text,
+ visit bigint,
+ snapshot swhid,
+ release swhid,
+ revision swhid,
+ path bytea,
+ directory swhid
+);
+
+
-- An origin is a place, identified by an URL, where software source code
-- artifacts can be found. We support different kinds of origins, e.g., git and
-- other VCS repositories, web pages that list tarballs URLs (e.g.,
@@ -431,8 +460,7 @@
-- also provides a translation to a defined json schema using a translation tool (tool_id)
create table origin_metadata
(
- id bigserial not null, -- PK internal object identifier
- origin_id bigint not null, -- references origin(id)
+ id bigint not null, -- references origin(id)
discovery_date timestamptz not null, -- when it was extracted
authority_id bigint not null,
fetcher_id bigint not null,
@@ -441,8 +469,7 @@
);
comment on table origin_metadata is 'keeps all metadata found concerning an origin';
-comment on column origin_metadata.id is 'the origin_metadata object''s id';
-comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata.id is 'the origin id for which the metadata was found';
comment on column origin_metadata.discovery_date is 'the date of retrieval';
comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
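
Note that the new swhid domain only enforces a shallow "swh:<version>:"
prefix check, not full SWHID validation (that happens in
check_extrinsic_metadata_context). The same pattern, mirrored in Python
for illustration:

    import re

    # Same pattern as the domain's CHECK constraint above.
    SWHID_DOMAIN_RE = re.compile(r"^swh:[0-9]+:.*")

    assert SWHID_DOMAIN_RE.match("swh:1:cnt:" + "00" * 20)
    assert SWHID_DOMAIN_RE.match("swh:1:")  # shallow: a bare prefix passes
    assert not SWHID_DOMAIN_RE.match("not-a-swhid")
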
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -167,13 +167,19 @@
create unique index metadata_authority_type_url on metadata_authority(type, url);
--- origin_metadata
-create unique index concurrently origin_metadata_pkey on origin_metadata(id);
-alter table origin_metadata add primary key using index origin_metadata_pkey;
+-- content_metadata
+create unique index concurrently content_metadata_content_authority_date_fetcher on content_metadata(id, authority_id, discovery_date, fetcher_id);
+
+alter table content_metadata add constraint content_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+alter table content_metadata validate constraint content_metadata_authority_fkey;
-create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(origin_id, authority_id, discovery_date, fetcher_id);
+alter table content_metadata add constraint content_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+alter table content_metadata validate constraint content_metadata_fetcher_fkey;
+
+-- origin_metadata
+create unique index concurrently origin_metadata_origin_authority_date_fetcher on origin_metadata(id, authority_id, discovery_date, fetcher_id);
-alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
+alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -36,6 +36,7 @@
from swh.storage.utils import now
from . import converters
+from .extrinsic_metadata import check_extrinsic_metadata_context, CONTEXT_KEYS
from .common import db_transaction_generator, db_transaction
from .db import Db
from .exc import StorageArgumentException, StorageDBError, HashCollision
@@ -360,6 +361,49 @@
)
return [dict(zip(db.content_find_cols, content)) for content in contents]
+ @timed
+ @db_transaction()
+ def content_metadata_add(
+ self,
+ id: str,
+ context: Dict[str, Union[str, bytes, int]],
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ db=None,
+ cur=None,
+ ):
+ self._object_metadata_add(
+ "content",
+ id,
+ context,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ db,
+ cur,
+ )
+
+ @timed
+ @db_transaction()
+ def content_metadata_get(
+ self,
+ id: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ page_token: Optional[bytes] = None,
+ limit: int = 1000,
+ db=None,
+ cur=None,
+ ):
+ return self._object_metadata_get(
+ "content", id, authority, after, page_token, limit, db, cur
+ )
+
@timed
@db_transaction()
def content_get_random(self, db=None, cur=None):
@@ -1263,29 +1307,64 @@
db=None,
cur=None,
) -> None:
- authority_id = db.metadata_authority_get_id(
- authority["type"], authority["url"], cur
- )
- if not authority_id:
- raise StorageArgumentException(f"Unknown authority {authority}")
- fetcher_id = db.metadata_fetcher_get_id(
- fetcher["name"], fetcher["version"], cur
+ origin_id = next(iter(list(db.origin_id_get_by_url([origin_url], cur))), None)
+ if origin_id is None:
+ raise StorageArgumentException(f"Unknown origin {origin_url}")
+
+ context = {} # origins have no context
+
+ self._object_metadata_add(
+ "origin",
+ origin_id,
+ context,
+ discovery_date,
+ authority,
+ fetcher,
+ format,
+ metadata,
+ db,
+ cur,
)
- if not fetcher_id:
- raise StorageArgumentException(f"Unknown fetcher {fetcher}")
- try:
- db.origin_metadata_add(
- origin_url,
- discovery_date,
- authority_id,
- fetcher_id,
- format,
- metadata,
- cur,
+
+ def _object_metadata_add(
+ self,
+ object_type: str,
+ id: Union[int, str],
+ context: Dict[str, Union[str, bytes, int]],
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ db,
+ cur,
+ ) -> None:
+ check_extrinsic_metadata_context(object_type, context)
+
+ authority_id = self._get_authority_id(authority, db, cur)
+ fetcher_id = self._get_fetcher_id(fetcher, db, cur)
+ if not isinstance(metadata, bytes):
+ raise StorageArgumentException(
+ "metadata must be bytes, not %r" % (metadata,)
)
- except psycopg2.ProgrammingError as e:
- raise StorageArgumentException(*e.args)
- send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add")
+
+ db.object_metadata_add(
+ object_type,
+ id,
+ context,
+ discovery_date,
+ authority_id,
+ fetcher_id,
+ format,
+ metadata,
+ cur,
+ )
+
+ send_metric(
+ f"{object_type}_metadata:add",
+ count=1,
+ method_name=f"{object_type}_metadata_add",
+ )
@timed
@db_transaction(statement_timeout=500)
@@ -1298,6 +1377,31 @@
limit: int = 1000,
db=None,
cur=None,
+ ) -> Dict[str, Any]:
+ origin_id = next(iter(list(db.origin_id_get_by_url([origin_url], cur))), None)
+ if origin_id is None:
+ raise StorageArgumentException(f"Unknown origin {origin_url}")
+
+ result = self._object_metadata_get(
+ "origin", origin_id, authority, after, page_token, limit, db, cur
+ )
+
+ for res in result["results"]:
+ res.pop("id")
+ res["origin_url"] = origin_url
+
+ return result
+
+ def _object_metadata_get(
+ self,
+ object_type: str,
+ id: Union[str, int],
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime],
+ page_token: Optional[bytes],
+ limit: int,
+ db,
+ cur,
) -> Dict[str, Any]:
if page_token:
(after_time, after_fetcher) = msgpack_loads(page_token)
@@ -1318,28 +1422,41 @@
"results": [],
}
- rows = db.origin_metadata_get(
- origin_url, authority_id, after_time, after_fetcher, limit + 1, cur
+ rows = db.object_metadata_get(
+ object_type, id, authority_id, after_time, after_fetcher, limit + 1, cur
)
- rows = [dict(zip(db.origin_metadata_get_cols, row)) for row in rows]
+ rows = [
+ dict(zip(db.object_metadata_get_cols[object_type], row)) for row in rows
+ ]
results = []
for row in rows:
row = row.copy()
row.pop("metadata_fetcher.id")
- results.append(
- {
- "origin_url": row.pop("origin.url"),
- "authority": {
- "type": row.pop("metadata_authority.type"),
- "url": row.pop("metadata_authority.url"),
- },
- "fetcher": {
- "name": row.pop("metadata_fetcher.name"),
- "version": row.pop("metadata_fetcher.version"),
- },
- **row,
- }
- )
+ if CONTEXT_KEYS[object_type]:
+ context = {}
+ for key in CONTEXT_KEYS[object_type]:
+ value = row.pop(key)
+ if value is not None:
+ context[key] = value
+ else:
+ context = None
+
+ result = {
+ "authority": {
+ "type": row.pop("metadata_authority.type"),
+ "url": row.pop("metadata_authority.url"),
+ },
+ "fetcher": {
+ "name": row.pop("metadata_fetcher.name"),
+ "version": row.pop("metadata_fetcher.version"),
+ },
+ **row,
+ }
+
+ if context:
+ result["context"] = context
+
+ results.append(result)
if len(results) > limit:
results.pop()
@@ -1415,3 +1532,19 @@
def flush(self, object_types: Optional[Iterable[str]] = None) -> Dict:
return {}
+
+ def _get_authority_id(self, authority: Dict[str, Any], db, cur):
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ return authority_id
+
+ def _get_fetcher_id(self, fetcher: Dict[str, Any], db, cur):
+ fetcher_id = db.metadata_fetcher_get_id(
+ fetcher["name"], fetcher["version"], cur
+ )
+ if not fetcher_id:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+ return fetcher_id
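
A sketch of the page-token round-trip used by _object_metadata_get above:
the token decodes to (after_time, after_fetcher), letting the next call
resume after the last row returned. The real code uses SWH's own msgpack
helpers; plain msgpack is shown here as an assumption:

    import datetime
    import msgpack

    # Encode the position of the last row of a page into an opaque token.
    after_time = datetime.datetime(2017, 1, 1, tzinfo=datetime.timezone.utc)
    after_fetcher = 42
    token = msgpack.packb((after_time.isoformat(), after_fetcher), use_bin_type=True)

    # Decoding on the next call yields the position to resume from.
    decoded_time, decoded_fetcher = msgpack.unpackb(token, raw=False)
    assert decoded_fetcher == after_fetcher
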
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import datetime
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model import from_disk
@@ -466,6 +466,66 @@
},
}
+content_metadata = {
+ "id": f"swh:1:cnt:{cont['sha1_git']}",
+ "context": {"origin": origin["url"]},
+ "discovery_date": datetime.datetime(
+ 2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "json",
+ "metadata": b'{"foo": "bar"}',
+}
+content_metadata2 = {
+ "id": f"swh:1:cnt:{cont['sha1_git']}",
+ "context": {"origin": origin2["url"]},
+ "discovery_date": datetime.datetime(
+ 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
+}
+content_metadata3 = {
+ "id": f"swh:1:cnt:{cont['sha1_git']}",
+ "context": {
+ "origin": origin["url"],
+ "visit": 42,
+ "snapshot": f"swh:1:snp:{hash_to_hex(snapshot['id'])}",
+ "release": f"swh:1:rel:{hash_to_hex(release['id'])}",
+ "revision": f"swh:1:rev:{hash_to_hex(revision['id'])}",
+ "directory": f"swh:1:dir:{hash_to_hex(dir['id'])}",
+ "path": b"/foo/bar",
+ },
+ "discovery_date": datetime.datetime(
+ 2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "authority": {
+ "type": metadata_authority2["type"],
+ "url": metadata_authority2["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher2["name"],
+ "version": metadata_fetcher2["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
+}
+
origin_metadata = {
"origin_url": origin["url"],
"discovery_date": datetime.datetime(
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3206,6 +3206,191 @@
with pytest.raises(StorageArgumentException):
swh_storage.content_find({"unknown-sha1": "something"}) # not the right key
+ def test_content_metadata_add(self, swh_storage):
+ content = data.cont
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+ content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ swh_storage.content_metadata_add(**data.content_metadata)
+ swh_storage.content_metadata_add(**data.content_metadata2)
+
+ result = swh_storage.content_metadata_get(content_swhid, authority)
+ assert result["next_page_token"] is None
+ assert [data.content_metadata, data.content_metadata2] == list(
+ sorted(result["results"], key=lambda x: x["discovery_date"],)
+ )
+
+ def test_content_metadata_add_duplicate(self, swh_storage):
+ """Duplicates should be silently updated."""
+ content = data.cont
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+ content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+ new_content_metadata2 = {
+ **data.content_metadata2,
+ "format": "new-format",
+ "metadata": b"new-metadata",
+ }
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ swh_storage.content_metadata_add(**data.content_metadata)
+ swh_storage.content_metadata_add(**data.content_metadata2)
+ swh_storage.content_metadata_add(**new_content_metadata2)
+
+ result = swh_storage.content_metadata_get(content_swhid, authority)
+ assert result["next_page_token"] is None
+ assert [data.content_metadata, new_content_metadata2] == list(
+ sorted(result["results"], key=lambda x: x["discovery_date"],)
+ )
+
+ def test_content_metadata_add_dict(self, swh_storage):
+ content = data.cont
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ kwargs = data.content_metadata.copy()
+ kwargs["metadata"] = {"foo": "bar"}
+
+ with pytest.raises(StorageArgumentException):
+ swh_storage.content_metadata_add(**kwargs)
+
+ def test_content_metadata_get(self, swh_storage):
+ authority = data.metadata_authority
+ fetcher = data.metadata_fetcher
+ authority2 = data.metadata_authority2
+ fetcher2 = data.metadata_fetcher2
+ content1_swhid = f"swh:1:cnt:{data.cont['sha1_git']}"
+ content2_swhid = f"swh:1:cnt:{data.cont2['sha1_git']}"
+
+ content1_metadata1 = data.content_metadata
+ content1_metadata2 = data.content_metadata2
+ content1_metadata3 = data.content_metadata3
+ content2_metadata = {**data.content_metadata2, "id": content2_swhid}
+
+ swh_storage.metadata_authority_add(**authority)
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority2)
+ swh_storage.metadata_fetcher_add(**fetcher2)
+
+ swh_storage.content_metadata_add(**content1_metadata1)
+ swh_storage.content_metadata_add(**content1_metadata2)
+ swh_storage.content_metadata_add(**content1_metadata3)
+ swh_storage.content_metadata_add(**content2_metadata)
+
+ result = swh_storage.content_metadata_get(content1_swhid, authority)
+ assert result["next_page_token"] is None
+ assert [content1_metadata1, content1_metadata2] == list(
+ sorted(result["results"], key=lambda x: x["discovery_date"],)
+ )
+
+ result = swh_storage.content_metadata_get(content1_swhid, authority2)
+ assert result["next_page_token"] is None
+ assert [content1_metadata3] == list(
+ sorted(result["results"], key=lambda x: x["discovery_date"],)
+ )
+
+ result = swh_storage.content_metadata_get(content2_swhid, authority)
+ assert result["next_page_token"] is None
+ assert [content2_metadata] == list(result["results"],)
+
+ def test_content_metadata_get_after(self, swh_storage):
+ content = data.cont
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+ content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ swh_storage.content_metadata_add(**data.content_metadata)
+ swh_storage.content_metadata_add(**data.content_metadata2)
+
+ result = swh_storage.content_metadata_get(
+ content_swhid,
+ authority,
+ after=data.content_metadata["discovery_date"] - timedelta(seconds=1),
+ )
+ assert result["next_page_token"] is None
+ assert [data.content_metadata, data.content_metadata2] == list(
+ sorted(result["results"], key=lambda x: x["discovery_date"],)
+ )
+
+ result = swh_storage.content_metadata_get(
+ content_swhid, authority, after=data.content_metadata["discovery_date"]
+ )
+ assert result["next_page_token"] is None
+ assert [data.content_metadata2] == result["results"]
+
+ result = swh_storage.content_metadata_get(
+ content_swhid, authority, after=data.content_metadata2["discovery_date"]
+ )
+ assert result["next_page_token"] is None
+ assert [] == result["results"]
+
+ def test_content_metadata_get_paginate(self, swh_storage):
+ content = data.cont
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
+ content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
+
+ swh_storage.content_metadata_add(**data.content_metadata)
+ swh_storage.content_metadata_add(**data.content_metadata2)
+
+ swh_storage.content_metadata_get(content_swhid, authority)
+
+ result = swh_storage.content_metadata_get(content_swhid, authority, limit=1)
+ assert result["next_page_token"] is not None
+ assert [data.content_metadata] == result["results"]
+
+ result = swh_storage.content_metadata_get(
+ content_swhid, authority, limit=1, page_token=result["next_page_token"]
+ )
+ assert result["next_page_token"] is None
+ assert [data.content_metadata2] == result["results"]
+
+ def test_content_metadata_get_paginate_same_date(self, swh_storage):
+ content = data.cont
+ fetcher1 = data.metadata_fetcher
+ fetcher2 = data.metadata_fetcher2
+ authority = data.metadata_authority
+ content_swhid = f"swh:1:cnt:{content['sha1_git']}"
+
+ swh_storage.metadata_fetcher_add(**fetcher1)
+ swh_storage.metadata_fetcher_add(**fetcher2)
+ swh_storage.metadata_authority_add(**authority)
+
+ content_metadata2 = {
+ **data.content_metadata2,
+ "discovery_date": data.content_metadata2["discovery_date"],
+ "fetcher": {"name": fetcher2["name"], "version": fetcher2["version"],},
+ }
+
+ swh_storage.content_metadata_add(**data.content_metadata)
+ swh_storage.content_metadata_add(**content_metadata2)
+
+ result = swh_storage.content_metadata_get(content_swhid, authority, limit=1)
+ assert result["next_page_token"] is not None
+ assert [data.content_metadata] == result["results"]
+
+ result = swh_storage.content_metadata_get(
+ content_swhid, authority, limit=1, page_token=result["next_page_token"]
+ )
+ assert result["next_page_token"] is None
+ assert [content_metadata2] == result["results"]
+
def test_object_find_by_sha1_git(self, swh_storage):
sha1_gits = [b"00000000000000000000"]
expected = {
