Page MenuHomeSoftware Heritage

D2988.id11243.diff
No OneTemporary

D2988.id11243.diff

diff --git a/docs/extrinsic-metadata-specification.rst b/docs/extrinsic-metadata-specification.rst
--- a/docs/extrinsic-metadata-specification.rst
+++ b/docs/extrinsic-metadata-specification.rst
@@ -32,11 +32,11 @@
An authority is uniquely defined by these properties:
* its type, representing the kind of authority, which is one of these values:
- * `deposit`, for metadata pushed to Software Heritage at the same time
- as a software artifact
- * `forge`, for metadata pulled from the same source as the one hosting
- the software artifacts (which includes package managers)
- * `registry`, for metadata pulled from a third-party
+ * `deposit`, for metadata pushed to Software Heritage at the same time
+ as a software artifact
+ * `forge`, for metadata pulled from the same source as the one hosting
+ the software artifacts (which includes package managers)
+ * `registry`, for metadata pulled from a third-party
* its URL, which unambiguously identifies an instance of the authority type.
Examples:
@@ -145,6 +145,7 @@
added from this origin, in the format::
{
+ 'origin_url': ...,
'authority': {'type': ..., 'url': ...},
'fetcher': {'name': ..., 'version': ...},
'discovery_date': ...,
diff --git a/sql/upgrades/149.sql b/sql/upgrades/149.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/149.sql
@@ -0,0 +1,105 @@
+-- SWH DB schema upgrade
+-- from_version: 148
+-- to_version: 149
+-- description: Implement extrinsic origin-metadata specification
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(149, now(), 'Implement extrinsic origin-metadata specification');
+
+-- metadata_fetcher
+
+alter table tool
+ rename to metadata_fetcher;
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+
+alter table metadata_fetcher
+ rename column configuration to metadata;
+
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
+
+alter index tool_pkey
+ rename to metadata_fetcher_pkey;
+create unique index metadata_fetcher_name_version
+ on metadata_fetcher(name, version);
+
+drop index tool_tool_name_tool_version_tool_configuration_idx;
+ -- was an index on (name, version, configuration)
+ -- this is the name in production; in new setups it would be called tool_name_version_configuration_idx
+ -- drop index tool_name_version_configuration_idx;
+
+-- metadata_authority
+
+alter table metadata_provider
+ rename to metadata_authority;
+comment on table metadata_authority is 'Metadata authority information';
+
+drop index metadata_provider_provider_name_provider_url_idx;
+ -- was an index on (provider_name, provider_url)
+
+alter table metadata_authority
+ drop column provider_name;
+alter table metadata_authority
+ rename column provider_type to type;
+alter table metadata_authority
+ rename column provider_url to url;
+
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
+
+alter index metadata_provider_pkey
+ rename to metadata_authority_pkey;
+alter index metadata_provider_type_url
+ rename to metadata_authority_type_url;
+
+-- origin_metadata
+
+alter table origin_metadata
+ rename column provider_id to authority_id;
+alter table origin_metadata
+ rename column tool_id to fetcher_id;
+alter table origin_metadata
+ add column format text default 'sword-v2-atom-codemeta-v2-in-json';
+alter table origin_metadata
+ rename column metadata to metadata_jsonb;
+alter table origin_metadata
+ add column metadata bytea;
+
+-- migrates metadata_jsonb (a jsonb) to metadata (a bytea)
+--update origin_metadata
+-- set metadata=metadata_jsonb::text::bytea;
+update origin_metadata
+ set metadata=convert_to(metadata_jsonb::text, 'utf-8');
+
+create index origin_metadata_origin_authority_date
+ on origin_metadata(origin_id, authority_id, discovery_date);
+
+drop index origin_metadata_origin_id_provider_id_tool_id_idx;
+ -- was an index on (origin_id, provider_id, tool_id)
+
+alter table origin_metadata
+ drop column metadata_jsonb;
+
+comment on column origin_metadata.authority_id is 'the metadata authority: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
+
+
+-- cleanup unused functions
+
+drop function swh_mktemp_tool;
+drop function swh_tool_add;
+
+drop function swh_origin_metadata_get_by_origin(text);
+drop function swh_origin_metadata_get_by_provider_type(text, text);
+
+drop function swh_origin_metadata_get_by_origin(int);
+drop function swh_origin_metadata_get_by_provider_type(int, text);
+
+drop type origin_metadata_signature;
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -794,34 +794,100 @@
yield from self._origin_visit_iter_to(start_token)
##########################
- # 'tool' table
+ # 'metadata_authority' table
##########################
- _tool_keys = ["id", "name", "version", "configuration"]
+ _metadata_authority_keys = ["url", "type", "metadata"]
- @_prepared_insert_statement("tool_by_uuid", _tool_keys)
- def tool_by_uuid_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
+ @_prepared_insert_statement("metadata_authority", _metadata_authority_keys)
+ def metadata_authority_add(self, url, type, metadata, *, statement):
+ return self._execute_with_retries(statement, [url, type, metadata])
- @_prepared_insert_statement("tool", _tool_keys)
- def tool_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
- self._increment_counter("tool", 1)
+ @_prepared_statement("SELECT * from metadata_authority WHERE type = ? AND url = ?")
+ def metadata_authority_get(self, type, url, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [type, url])), None)
+
+ ##########################
+ # 'metadata_fetcher' table
+ ##########################
+
+ _metadata_fetcher_keys = ["name", "version", "metadata"]
+
+ @_prepared_insert_statement("metadata_fetcher", _metadata_fetcher_keys)
+ def metadata_fetcher_add(self, name, version, metadata, *, statement):
+ return self._execute_with_retries(statement, [name, version, metadata])
@_prepared_statement(
- "SELECT id FROM tool " "WHERE name = ? AND version = ? " "AND configuration = ?"
+ "SELECT * from metadata_fetcher WHERE name = ? AND version = ?"
)
- def tool_get_one_uuid(
- self, name: str, version: str, configuration: Dict[str, Any], *, statement
- ) -> Optional[str]:
- rows = list(
- self._execute_with_retries(statement, [name, version, configuration])
+ def metadata_fetcher_get(self, name, version, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [name, version])), None)
+
+ ##########################
+ # 'origin_metadata' table
+ ##########################
+
+ _origin_metadata_keys = [
+ "origin",
+ "authority_type",
+ "authority_url",
+ "discovery_date",
+ "fetcher_name",
+ "fetcher_version",
+ "format",
+ "metadata",
+ ]
+
+ @_prepared_insert_statement("origin_metadata", _origin_metadata_keys)
+ def origin_metadata_add(
+ self,
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ *,
+ statement,
+ ):
+ return self._execute_with_retries(
+ statement,
+ [
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ ],
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND discovery_date>=? "
+ "AND authority_type=?"
+ )
+ def origin_metadata_get_after(
+ self, origin, authority_type, authority_url, after, *, statement
+ ):
+ return self._execute_with_retries(
+ statement, [origin, authority_url, after, authority_type]
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND authority_type=?"
+ )
+ def origin_metadata_get(
+ self, origin, authority_type, authority_url, *, statement
+ ) -> Iterable[Row]:
+ return self._execute_with_retries(
+ statement, [origin, authority_url, authority_type]
)
- if rows:
- assert len(rows) == 1
- return rows[0].id
- else:
- return None
##########################
# Miscellaneous
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -175,21 +175,37 @@
);
-CREATE TABLE IF NOT EXISTS tool_by_uuid (
- id timeuuid PRIMARY KEY,
- name ascii,
- version ascii,
- configuration blob,
+CREATE TABLE IF NOT EXISTS metadata_authority (
+ url text,
+ type ascii,
+ metadata text,
+ PRIMARY KEY ((url), type)
);
-CREATE TABLE IF NOT EXISTS tool (
- id timeuuid,
+CREATE TABLE IF NOT EXISTS metadata_fetcher (
name ascii,
version ascii,
- configuration blob,
- PRIMARY KEY ((name, version, configuration))
-)
+ metadata text,
+ PRIMARY KEY ((name), version)
+);
+
+
+CREATE TABLE IF NOT EXISTS origin_metadata (
+ origin text,
+ authority_type text,
+ authority_url text,
+ discovery_date timestamp,
+ fetcher_name ascii,
+ fetcher_version ascii,
+ format ascii,
+ metadata blob,
+ PRIMARY KEY ((origin), authority_type, authority_url, discovery_date,
+ fetcher_name, fetcher_version)
+ -- for now, authority_url could be in the partition key; but leaving
+ -- it out of the partition key allows listing authorities with metadata
+ -- on an origin if we ever need to do it.
+);
CREATE TABLE IF NOT EXISTS object_count (
@@ -220,8 +236,9 @@
TABLES = (
"skipped_content content revision revision_parent release "
"directory directory_entry snapshot snapshot_branch "
- "origin_visit origin tool_by_uuid tool object_count "
- "origin_visit_status"
+ "origin_visit origin origin_metadata object_count "
+ "origin_visit_status metadata_authority "
+ "metadata_fetcher"
).split()
HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -4,11 +4,11 @@
# See top-level LICENSE file for more information
import datetime
+import itertools
import json
import random
import re
from typing import Any, Dict, List, Iterable, Optional, Union
-import uuid
import attr
import dateutil
@@ -1034,37 +1034,6 @@
else:
return None
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- tool = tool.copy()
- tool_json = tool.copy()
- tool_json["configuration"] = json.dumps(
- tool["configuration"], sort_keys=True
- ).encode()
- id_ = self._cql_runner.tool_get_one_uuid(**tool_json)
- if not id_:
- id_ = uuid.uuid1()
- tool_json["id"] = id_
- self._cql_runner.tool_by_uuid_add_one(tool_json)
- self._cql_runner.tool_add_one(tool_json)
- tool["id"] = id_
- inserted.append(tool)
- return inserted
-
- def tool_get(self, tool):
- id_ = self._cql_runner.tool_get_one_uuid(
- tool["name"],
- tool["version"],
- json.dumps(tool["configuration"], sort_keys=True).encode(),
- )
- if id_:
- tool = tool.copy()
- tool["id"] = id_
- return tool
- else:
- return None
-
def stat_counters(self):
rows = self._cql_runner.stat_counters()
keys = (
@@ -1084,27 +1053,109 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ if not isinstance(origin_url, str):
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ if not self._cql_runner.metadata_authority_get(**authority):
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ if not self._cql_runner.metadata_fetcher_get(**fetcher):
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+
+ self._cql_runner.origin_metadata_add(
+ origin_url,
+ authority["type"],
+ authority["url"],
+ discovery_date,
+ fetcher["name"],
+ fetcher["version"],
+ format,
+ metadata,
+ )
- def origin_metadata_get_by(self, origin_url, provider_type=None):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
+ if not isinstance(origin_url, str):
+ raise TypeError("origin_url must be str, not %r" % (origin_url,))
+
+ if after is None:
+ entries = self._cql_runner.origin_metadata_get(
+ origin_url, authority["type"], authority["url"]
+ )
+ else:
+ entries = self._cql_runner.origin_metadata_get_after(
+ origin_url, authority["type"], authority["url"], after
+ )
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
- def metadata_provider_get(self, provider_id):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ results = []
+ for entry in entries:
+ discovery_date = entry.discovery_date.replace(tzinfo=datetime.timezone.utc)
+ results.append(
+ {
+ "origin_url": entry.origin,
+ "authority": {
+ "type": entry.authority_type,
+ "url": entry.authority_url,
+ },
+ "fetcher": {
+ "name": entry.fetcher_name,
+ "version": entry.fetcher_version,
+ },
+ "discovery_date": discovery_date,
+ "format": entry.format,
+ "metadata": entry.metadata,
+ }
+ )
+ return results
+
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_fetcher_add(name, version, json.dumps(metadata))
+
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ fetcher = self._cql_runner.metadata_fetcher_get(name, version)
+ if fetcher:
+ return {
+ "name": fetcher.name,
+ "version": fetcher.version,
+ "metadata": json.loads(fetcher.metadata),
+ }
+ else:
+ return None
- def metadata_provider_get_by(self, provider):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_authority_add(url, type, json.dumps(metadata))
+
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ authority = self._cql_runner.metadata_authority_get(type, url)
+ if authority:
+ return {
+ "type": authority.type,
+ "url": authority.url,
+ "metadata": json.loads(authority.metadata),
+ }
+ else:
+ return None
def clear_buffers(self, object_types: Optional[Iterable[str]] = None) -> None:
"""Do nothing
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -3,9 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import datetime
import random
import select
-
from typing import Any, Dict, Optional, Tuple
from swh.core.db import BaseDb
@@ -1059,159 +1059,150 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
- def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None):
+ origin_metadata_get_cols = [
+ "origin.url",
+ "discovery_date",
+ "metadata_authority.type",
+ "metadata_authority.url",
+ "metadata_fetcher.name",
+ "metadata_fetcher.version",
+ "format",
+ "metadata",
+ ]
+
+ def origin_metadata_add(
+ self,
+ origin: str,
+ discovery_date: datetime.datetime,
+ authority: int,
+ fetcher: int,
+ format: str,
+ metadata: bytes,
+ cur=None,
+ ) -> None:
""" Add an origin_metadata for the origin at ts with provider, tool and
metadata.
Args:
- origin (int): the origin's id for which the metadata is added
- ts (datetime): time when the metadata was found
- provider (int): the metadata provider identifier
- tool (int): the tool's identifier used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
-
- Returns:
- id (int): the origin_metadata unique id
-
+ origin: the origin's id for which the metadata is added
+ discovery_date: time when the metadata was found
+ authority: the metadata provider identifier
+ fetcher: the tool's identifier used to extract metadata
+ format: the format of the metadata
+ metadata: the metadata retrieved at the time and location
"""
cur = self._cursor(cur)
insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
- provider_id, tool_id, metadata)
- SELECT id, %s, %s, %s, %s FROM origin WHERE url = %s"""
- cur.execute(insert, (ts, provider, tool, jsonize(metadata), origin))
-
- origin_metadata_get_cols = [
- "origin_url",
- "discovery_date",
- "tool_id",
- "metadata",
- "provider_id",
- "provider_name",
- "provider_type",
- "provider_url",
- ]
-
- def origin_metadata_get_by(self, origin_url, provider_type=None, cur=None):
- """Retrieve all origin_metadata entries for one origin_url
+ authority_id, fetcher_id, format, metadata)
+ SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s"""
+ cur.execute(
+ insert,
+ (discovery_date, authority, fetcher, format, jsonize(metadata), origin),
+ )
- """
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: int,
+ after: Optional[datetime.datetime],
+ limit: Optional[int],
+ cur=None,
+ ):
cur = self._cursor(cur)
- if not provider_type:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_origin(
- %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ assert self.origin_metadata_get_cols[-1] == "metadata"
+ query_parts = [
+ f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, "
+ f" origin_metadata.metadata AS metadata "
+ f"FROM origin_metadata "
+ f"INNER JOIN metadata_authority "
+ f" ON (metadata_authority.id=authority_id) "
+ f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
+ f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
+ f"WHERE origin.url=%s AND authority_id=%s"
+ ]
+ args = [origin_url, authority]
- cur.execute(query, (origin_url,))
+ if after:
+ query_parts.append("AND discovery_date >= %s")
+ args.append(after)
- else:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_provider_type(
- %%s, %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ query_parts.append("ORDER BY discovery_date")
- cur.execute(query, (origin_url, provider_type))
+ if limit:
+ query_parts.append("LIMIT %s")
+ args.append(limit)
+ cur.execute(" ".join(query_parts), args)
yield from cur
- tool_cols = ["id", "name", "version", "configuration"]
-
- @stored_procedure("swh_mktemp_tool")
- def mktemp_tool(self, cur=None):
- pass
+ metadata_fetcher_cols = ["name", "version", "metadata"]
- def tool_add_from_temp(self, cur=None):
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: bytes, cur=None
+ ) -> None:
cur = self._cursor(cur)
- cur.execute("SELECT %s from swh_tool_add()" % (",".join(self.tool_cols),))
- yield from cur
+ cur.execute(
+ "INSERT INTO metadata_fetcher (name, version, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (name, version, jsonize(metadata)),
+ )
- def tool_get(self, name, version, configuration, cur=None):
+ def metadata_fetcher_get(self, name: str, version: str, cur=None):
cur = self._cursor(cur)
cur.execute(
- """select %s
- from tool
- where name=%%s and
- version=%%s and
- configuration=%%s"""
- % (",".join(self.tool_cols)),
- (name, version, configuration),
+ f"SELECT {', '.join(self.metadata_fetcher_cols)} "
+ f"FROM metadata_fetcher "
+ f"WHERE name=%s AND version=%s",
+ (name, version),
)
-
return cur.fetchone()
- metadata_provider_cols = [
- "id",
- "provider_name",
- "provider_type",
- "provider_url",
- "metadata",
- ]
-
- def metadata_provider_add(
- self,
- provider_name: str,
- provider_type: str,
- provider_url: str,
- metadata: Dict,
- cur=None,
- ) -> int:
- """Insert a new provider and return the new identifier."""
+ def metadata_fetcher_get_id(
+ self, name: str, version: str, cur=None
+ ) -> Optional[int]:
cur = self._cursor(cur)
- insert = """
- INSERT INTO metadata_provider (provider_name, provider_type,
- provider_url, metadata) values (%s, %s, %s, %s)
- ON CONFLICT(provider_type, provider_url) do nothing
- """
cur.execute(
- insert, (provider_name, provider_type, provider_url, jsonize(metadata))
- )
- row = self.metadata_provider_get_by_composite_key(
- provider_type, provider_url, cur=cur
+ "SELECT id FROM metadata_fetcher WHERE name=%s AND version=%s",
+ (name, version),
)
- return row[0]
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
- def metadata_provider_get_by_composite_key(
- self, provider_type: str, provider_url: str, cur=None
- ) -> Tuple:
- """Retrieve metadata provider by its composite primary key.
+ metadata_authority_cols = ["type", "url", "metadata"]
- """
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: bytes, cur=None
+ ) -> None:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where provider_type=%%s and provider_url=%%s"""
- % (",".join(self.metadata_provider_cols)),
- (provider_type, provider_url,),
+ "INSERT INTO metadata_authority (type, url, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (type, url, jsonize(metadata)),
)
- return cur.fetchone()
- def metadata_provider_get(self, provider_id, cur=None):
+ def metadata_authority_get(self, type: str, url: str, cur=None):
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where id=%%s """
- % (",".join(self.metadata_provider_cols)),
- (provider_id,),
+ f"SELECT {', '.join(self.metadata_authority_cols)} "
+ f"FROM metadata_authority "
+ f"WHERE type=%s AND url=%s",
+ (type, url),
)
-
return cur.fetchone()
- def metadata_provider_get_by(self, provider_name, provider_url, cur=None):
+ def metadata_authority_get_id(self, type: str, url: str, cur=None) -> Optional[int]:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where provider_name=%%s and
- provider_url=%%s"""
- % (",".join(self.metadata_provider_cols)),
- (provider_name, provider_url),
+ "SELECT id FROM metadata_authority WHERE type=%s AND url=%s", (type, url)
)
-
- return cur.fetchone()
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
def _get_random_row_from_table(self, table_name, cols, id_col, cur=None):
random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE))
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -19,6 +19,7 @@
Callable,
Dict,
Generic,
+ Hashable,
Iterable,
Iterator,
List,
@@ -123,11 +124,17 @@
self._origin_visits = {}
self._origin_visit_statuses: Dict[Tuple[str, int], List[OriginVisitStatus]] = {}
self._persons = []
- self._origin_metadata = defaultdict(list)
- self._tools = {}
- self._metadata_providers = {}
- self._objects = defaultdict(list)
+ # {origin_url: {authority: [metadata]}}
+ self._origin_metadata: Dict[
+ str, Dict[Hashable, SortedList[datetime.datetime, Dict[str, Any]]]
+ ] = defaultdict(
+ lambda: defaultdict(lambda: SortedList(key=lambda x: x["discovery_date"]))
+ ) # noqa
+
+ self._metadata_fetchers: Dict[Hashable, Dict[str, Any]] = {}
+ self._metadata_authorities: Dict[Hashable, Dict[str, Any]] = {}
+ self._objects = defaultdict(list)
self._sorted_sha1s = SortedList[bytes, bytes]()
self.objstorage = ObjStorage({"cls": "memory", "args": {}})
@@ -139,7 +146,6 @@
self.journal_writer.content_add(contents)
content_add = 0
- content_add_bytes = 0
if with_data:
summary = self.objstorage.content_add(
c for c in contents if c.status != "absent"
@@ -1040,71 +1046,104 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
if not isinstance(origin_url, str):
- raise TypeError("origin_id must be str, not %r" % (origin_url,))
-
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ authority_key = self._metadata_authority_key(authority)
+ if authority_key not in self._metadata_authorities:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ fetcher_key = self._metadata_fetcher_key(fetcher)
+ if fetcher_key not in self._metadata_fetchers:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
origin_metadata = {
"origin_url": origin_url,
- "discovery_date": ts,
- "tool_id": tool,
+ "discovery_date": discovery_date,
+ "authority": authority_key,
+ "fetcher": fetcher_key,
+ "format": format,
"metadata": metadata,
- "provider_id": provider,
}
- self._origin_metadata[origin_url].append(origin_metadata)
+ self._origin_metadata[origin_url][authority_key].add(origin_metadata)
return None
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
if not isinstance(origin_url, str):
raise TypeError("origin_url must be str, not %r" % (origin_url,))
- metadata = []
- for item in self._origin_metadata[origin_url]:
- item = copy.deepcopy(item)
- provider = self.metadata_provider_get(item["provider_id"])
- for attr_name in ("name", "type", "url"):
- item["provider_" + attr_name] = provider["provider_" + attr_name]
- metadata.append(item)
- return metadata
-
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- key = self._tool_key(tool)
- assert "id" not in tool
- record = copy.deepcopy(tool)
- record["id"] = key # TODO: remove this
- if key not in self._tools:
- self._tools[key] = record
- inserted.append(copy.deepcopy(self._tools[key]))
-
- return inserted
-
- def tool_get(self, tool):
- return self._tools.get(self._tool_key(tool))
-
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- provider = {
- "provider_name": provider_name,
- "provider_type": provider_type,
- "provider_url": provider_url,
+
+ authority_key = self._metadata_authority_key(authority)
+
+ if after is None:
+ entries = iter(self._origin_metadata[origin_url][authority_key])
+ else:
+ entries = self._origin_metadata[origin_url][authority_key].iter_from(after)
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
+
+ results = []
+ for entry in entries:
+ authority = self._metadata_authorities[entry["authority"]]
+ fetcher = self._metadata_fetchers[entry["fetcher"]]
+ results.append(
+ {
+ **entry,
+ "authority": {"type": authority["type"], "url": authority["url"],},
+ "fetcher": {
+ "name": fetcher["name"],
+ "version": fetcher["version"],
+ },
+ }
+ )
+ return results
+
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ fetcher = {
+ "name": name,
+ "version": version,
"metadata": metadata,
}
- key = self._metadata_provider_key(provider)
- provider["id"] = key
- self._metadata_providers[key] = provider
- return key
+ key = self._metadata_fetcher_key(fetcher)
+ if key not in self._metadata_fetchers:
+ self._metadata_fetchers[key] = fetcher
+
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_fetchers.get(
+ self._metadata_fetcher_key({"name": name, "version": version})
+ )
- def metadata_provider_get(self, provider_id):
- return self._metadata_providers.get(provider_id)
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ authority = {
+ "type": type,
+ "url": url,
+ "metadata": metadata,
+ }
+ key = self._metadata_authority_key(authority)
+ self._metadata_authorities[key] = authority
- def metadata_provider_get_by(self, provider):
- key = self._metadata_provider_key(provider)
- return self._metadata_providers.get(key)
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_authorities.get(
+ self._metadata_authority_key({"type": type, "url": url})
+ )
def _get_origin_url(self, origin):
if isinstance(origin, str):
@@ -1131,16 +1170,12 @@
return tuple((key, content.get(key)) for key in sorted(DEFAULT_ALGORITHMS))
@staticmethod
- def _tool_key(tool):
- return "%r %r %r" % (
- tool["name"],
- tool["version"],
- tuple(sorted(tool["configuration"].items())),
- )
+ def _metadata_fetcher_key(fetcher: Dict) -> Hashable:
+ return (fetcher["name"], fetcher["version"])
@staticmethod
- def _metadata_provider_key(provider):
- return "%r %r" % (provider["provider_name"], provider["provider_url"])
+ def _metadata_authority_key(authority: Dict) -> Hashable:
+ return (authority["type"], authority["url"])
def diff_directories(self, from_dir, to_dir, track_renaming=False):
raise NotImplementedError("InMemoryStorage.diff_directories")
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -1132,120 +1132,118 @@
...
@remote_api_endpoint("origin/metadata/add")
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ """Add an origin_metadata for the origin at discovery_date,
+ obtained using the `fetcher` from the `authority`.
+
+ The authority and fetcher must be known to the storage before
+ using this endpoint.
Args:
- origin_url (str): the origin url for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
+ discovery_date: when the metadata was fetched.
+ authority: a dict containing keys `type` and `url`.
+ fetcher: a dict containing keys `name` and `version`.
+ format: text field indicating the format of the metadata content
+ metadata: blob of raw metadata
"""
...
@remote_api_endpoint("origin/metadata/get")
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
"""Retrieve list of all origin_metadata entries for the origin_id
Args:
- origin_url (str): the origin's URL
- provider_type (str): (optional) type of provider
+ origin_url: the origin's URL
+ authority: a dict containing keys `type` and `url`.
+ after: minimum discovery_date for a result to be returned
+ limit: maximum number of results to be returned
Returns:
- list of dicts: the origin_metadata dictionary with the keys:
+ list of dicts in the format:
+
+ .. code-block:: python
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
+ {
+ 'authority': {'type': ..., 'url': ...},
+ 'fetcher': {'name': ..., 'version': ...},
+ 'discovery_date': ...,
+ 'format': '...',
+ 'metadata': b'...'
+ }
"""
...
- @remote_api_endpoint("tool/add")
- def tool_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools (iterable of :class:`dict`): Tool information to add to
- storage. Each tool is a :class:`dict` with the following keys:
+ @remote_api_endpoint("fetcher/add")
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a new metadata fetcher to the storage.
- - name (:class:`str`): name of the tool
- - version (:class:`str`): version of the tool
- - configuration (:class:`dict`): configuration of the tool,
- must be json-encodable
+ `name` and `version` together are a unique identifier of this
+ fetcher; and `metadata` is an arbitrary dict of JSONable data
+ with information about this fetcher.
- Returns:
- :class:`dict`: All the tools inserted in storage
- (including the internal ``id``). The order of the list is not
- guaranteed to match the order of the initial list.
+ Args:
+ name: the name of the fetcher
+ version: version of the fetcher
"""
...
- @remote_api_endpoint("tool/data")
- def tool_get(self, tool):
- """Retrieve tool information.
+ @remote_api_endpoint("fetcher/get")
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about a fetcher
Args:
- tool (dict): Tool information we want to retrieve from storage.
- The dicts have the same keys as those used in :func:`tool_add`.
+ name: the name of the fetcher
+ version: version of the fetcher
Returns:
- dict: The full tool information if it exists (``id`` included),
- None otherwise.
+ dictionary with keys `name`, `version`, and `metadata`; or None
+ if the fetcher is not known
"""
...
- @remote_api_endpoint("provider/add")
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- """Add a metadata provider.
+ @remote_api_endpoint("authority/add")
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a metadata authority
Args:
- provider_name (str): Its name
- provider_type (str): Its type (eg. `'deposit-client'`)
- provider_url (str): Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
metadata: JSON-encodable object
-
- Returns:
- int: an identifier of the provider
- """
- ...
-
- @remote_api_endpoint("provider/get")
- def metadata_provider_get(self, provider_id):
- """Get a metadata provider
-
- Args:
- provider_id: Its identifier, as given by `metadata_provider_add`.
-
- Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
"""
...
- @remote_api_endpoint("provider/getby")
- def metadata_provider_get_by(self, provider):
- """Get a metadata provider
+ @remote_api_endpoint("authority/get")
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about an authority
Args:
- provider (dict): A dictionary with keys:
- * provider_name: Its name
- * provider_url: Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
+ dictionary with keys `type`, `url`, and `metadata`; or None
+ if the authority is not known
"""
...
diff --git a/swh/storage/retry.py b/swh/storage/retry.py
--- a/swh/storage/retry.py
+++ b/swh/storage/retry.py
@@ -7,7 +7,7 @@
import traceback
from datetime import datetime
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, Optional, Union
from tenacity import (
retry,
@@ -127,29 +127,29 @@
)
@swh_retry
- def tool_add(self, tools: Iterable[Dict]) -> List[Dict]:
- tools = list(tools)
- return self.storage.tool_add(tools)
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_fetcher_add(name, version, metadata)
@swh_retry
- def metadata_provider_add(
- self, provider_name: str, provider_type: str, provider_url: str, metadata: Dict
- ) -> Union[str, int]:
- return self.storage.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata
- )
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_authority_add(type, url, metadata)
@swh_retry
def origin_metadata_add(
self,
origin_url: str,
- ts: Union[str, datetime],
- provider_id: int,
- tool_id: int,
- metadata: Dict,
+ discovery_date: datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
) -> None:
return self.storage.origin_metadata_add(
- origin_url, ts, provider_id, tool_id, metadata
+ origin_url, discovery_date, authority, fetcher, format, metadata
)
@swh_retry
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(148, now(), 'Work In Progress');
+ values(149, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -397,35 +397,34 @@
comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp';
-- Tools
-create table tool
+create table metadata_fetcher
(
- id serial not null,
- name text not null,
- version text not null,
- configuration jsonb
+ id serial not null,
+ name text not null,
+ version text not null,
+ metadata jsonb not null
);
-comment on table tool is 'Tool information';
-comment on column tool.id is 'Tool identifier';
-comment on column tool.version is 'Tool name';
-comment on column tool.version is 'Tool version';
-comment on column tool.configuration is 'Tool configuration: command line, flags, etc...';
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
-create table metadata_provider
+create table metadata_authority
(
- id serial not null,
- provider_name text not null,
- provider_type text not null,
- provider_url text,
- metadata jsonb
+ id serial not null,
+ type text not null,
+ url text not null,
+ metadata jsonb not null
);
-comment on table metadata_provider is 'Metadata provider information';
-comment on column metadata_provider.id is 'Provider''s identifier';
-comment on column metadata_provider.provider_name is 'Provider''s name';
-comment on column metadata_provider.provider_url is 'Provider''s url';
-comment on column metadata_provider.metadata is 'Other metadata about provider';
+comment on table metadata_authority is 'Metadata authority information';
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
-- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin
@@ -435,18 +434,20 @@
id bigserial not null, -- PK internal object identifier
origin_id bigint not null, -- references origin(id)
discovery_date timestamptz not null, -- when it was extracted
- provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github'
- tool_id bigint not null,
- metadata jsonb not null
+ authority_id bigint not null,
+ fetcher_id bigint not null,
+ format text not null,
+ metadata bytea not null
);
comment on table origin_metadata is 'keeps all metadata found concerning an origin';
comment on column origin_metadata.id is 'the origin_metadata object''s id';
comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
comment on column origin_metadata.discovery_date is 'the date of retrieval';
-comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.';
-comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.';
-comment on column origin_metadata.metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
-- Keep a cache of object counts
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -101,17 +101,6 @@
) on commit delete rows;
$$;
--- create a temporary table for the tools
-create or replace function swh_mktemp_tool()
- returns void
- language sql
-as $$
- create temporary table if not exists tmp_tool (
- like tool including defaults
- ) on commit delete rows;
- alter table tmp_tool drop column if exists id;
-$$;
-
-- a content signature is a set of cryptographic checksums that we use to
-- uniquely identify content, for the purpose of verifying if we already have
-- some content or not during content injection
@@ -920,76 +909,6 @@
$$;
--- end revision_metadata functions
--- origin_metadata functions
-create type origin_metadata_signature as (
- id bigint,
- origin_url text,
- discovery_date timestamptz,
- tool_id bigint,
- metadata jsonb,
- provider_id integer,
- provider_name text,
- provider_type text,
- provider_url text
-);
-create or replace function swh_origin_metadata_get_by_origin(
- origin text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin
- order by discovery_date desc;
-$$;
-
-create or replace function swh_origin_metadata_get_by_provider_type(
- origin_url text,
- provider_type text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin_url
- and mp.provider_type = provider_type
- order by discovery_date desc;
-$$;
--- end origin_metadata functions
-
--- add tmp_tool entries to tool,
--- skipping duplicates if any.
---
--- operates in bulk: 0. create temporary tmp_tool, 1. COPY to
--- it, 2. call this function to insert and filtering out duplicates
-create or replace function swh_tool_add()
- returns setof tool
- language plpgsql
-as $$
-begin
- insert into tool(name, version, configuration)
- select name, version, configuration from tmp_tool tmp
- on conflict(name, version, configuration) do nothing;
-
- return query
- select id, name, version, configuration
- from tmp_tool join tool
- using(name, version, configuration);
-
- return;
-end
-$$;
-
-
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -155,35 +155,32 @@
alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid;
alter table release validate constraint release_author_date_check;
--- tool
-create unique index tool_pkey on tool(id);
-alter table tool add primary key using index tool_pkey;
+-- metadata_fetcher
+create unique index metadata_fetcher_pkey on metadata_fetcher(id);
+alter table metadata_fetcher add primary key using index metadata_fetcher_pkey;
-create unique index on tool(name, version, configuration);
+create unique index metadata_fetcher_name_version on metadata_fetcher(name, version);
--- metadata_provider
-create unique index concurrently metadata_provider_pkey on metadata_provider(id);
-alter table metadata_provider add primary key using index metadata_provider_pkey;
+-- metadata_authority
+create unique index concurrently metadata_authority_pkey on metadata_authority(id);
+alter table metadata_authority add primary key using index metadata_authority_pkey;
-create index concurrently on metadata_provider(provider_name, provider_url);
-create unique index metadata_provider_type_url
- on metadata_provider(provider_type, provider_url);
+create unique index metadata_authority_type_url on metadata_authority(type, url);
-- origin_metadata
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
-
-create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
+create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
-alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_provider_fkey;
+alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_authority_fkey;
-alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references tool(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_tool_fkey;
+alter table origin_metadata add constraint origin_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_fetcher_fkey;
-- object_counts
create unique index concurrently object_counts_pkey on object_counts(object_type);
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -6,7 +6,6 @@
import contextlib
import datetime
import itertools
-import json
from collections import defaultdict
from contextlib import contextmanager
@@ -1236,72 +1235,101 @@
@timed
@db_transaction()
def origin_metadata_add(
- self, origin_url, ts, provider, tool, metadata, db=None, cur=None
- ):
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
-
- db.origin_metadata_add(origin_url, ts, provider, tool, metadata, cur)
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ db=None,
+ cur=None,
+ ) -> None:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ fetcher_id = db.metadata_fetcher_get_id(
+ fetcher["name"], fetcher["version"], cur
+ )
+ if not fetcher_id:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+ db.origin_metadata_add(
+ origin_url, discovery_date, authority_id, fetcher_id, format, metadata, cur
+ )
send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add")
@timed
- @db_transaction_generator(statement_timeout=500)
- def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, cur=None):
- for line in db.origin_metadata_get_by(origin_url, provider_type, cur):
- yield dict(zip(db.origin_metadata_get_cols, line))
+ @db_transaction(statement_timeout=500)
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ db=None,
+ cur=None,
+ ) -> List[Dict[str, Any]]:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ return []
+ results = []
+ for line in db.origin_metadata_get(origin_url, authority_id, after, limit, cur):
+ row = dict(zip(db.origin_metadata_get_cols, line))
+ results.append(
+ {
+ "origin_url": row.pop("origin.url"),
+ "authority": {
+ "type": row.pop("metadata_authority.type"),
+ "url": row.pop("metadata_authority.url"),
+ },
+ "fetcher": {
+ "name": row.pop("metadata_fetcher.name"),
+ "version": row.pop("metadata_fetcher.version"),
+ },
+ **row,
+ }
+ )
+ return results
@timed
@db_transaction()
- def tool_add(self, tools, db=None, cur=None):
- db.mktemp_tool(cur)
- with convert_validation_exceptions():
- db.copy_to(tools, "tmp_tool", ["name", "version", "configuration"], cur)
- tools = db.tool_add_from_temp(cur)
-
- results = [dict(zip(db.tool_cols, line)) for line in tools]
- send_metric("tool:add", count=len(results), method_name="tool_add")
- return results
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_fetcher_add(name, version, metadata, cur)
+ send_metric("metadata_fetcher:add", count=1, method_name="metadata_fetcher")
@timed
@db_transaction(statement_timeout=500)
- def tool_get(self, tool, db=None, cur=None):
- tool_conf = tool["configuration"]
- if isinstance(tool_conf, dict):
- tool_conf = json.dumps(tool_conf)
-
- idx = db.tool_get(tool["name"], tool["version"], tool_conf)
- if not idx:
+ def metadata_fetcher_get(
+ self, name: str, version: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_fetcher_get(name, version, cur=cur)
+ if not row:
return None
- return dict(zip(db.tool_cols, idx))
-
- @timed
- @db_transaction()
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, db=None, cur=None
- ):
- result = db.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata, cur
- )
- send_metric("metadata_provider:add", count=1, method_name="metadata_provider")
- return result
+ return dict(zip(db.metadata_fetcher_cols, row))
@timed
@db_transaction()
- def metadata_provider_get(self, provider_id, db=None, cur=None):
- result = db.metadata_provider_get(provider_id)
- if not result:
- return None
- return dict(zip(db.metadata_provider_cols, result))
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_authority_add(type, url, metadata, cur)
+ send_metric("metadata_authority:add", count=1, method_name="metadata_authority")
@timed
@db_transaction()
- def metadata_provider_get_by(self, provider, db=None, cur=None):
- result = db.metadata_provider_get_by(
- provider["provider_name"], provider["provider_url"]
- )
- if not result:
+ def metadata_authority_get(
+ self, type: str, url: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_authority_get(type, url, cur=cur)
+ if not row:
return None
- return dict(zip(db.metadata_provider_cols, result))
+ return dict(zip(db.metadata_authority_cols, row))
@timed
def diff_directories(self, from_dir, to_dir, track_renaming=False):
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -246,7 +246,7 @@
"release": [data.release, data.release2, data.release3],
"snapshot": [data.snapshot],
"origin": [data.origin, data.origin2],
- "tool": [data.metadata_tool],
- "provider": [data.provider],
+ "fetcher": [data.metadata_fetcher],
+ "authority": [data.metadata_authority],
"origin_metadata": [data.origin_metadata, data.origin_metadata2],
}
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -326,17 +326,26 @@
origins = (origin, origin2)
-provider = {
- "name": "hal",
- "type": "deposit-client",
- "url": "http:///hal/inria",
+metadata_authority = {
+ "type": "deposit",
+ "url": "http://hal.inria.example.com/",
"metadata": {"location": "France"},
}
+metadata_authority2 = {
+ "type": "registry",
+ "url": "http://wikidata.example.com/",
+ "metadata": {},
+}
-metadata_tool = {
+metadata_fetcher = {
"name": "swh-deposit",
"version": "0.0.1",
- "configuration": {"sword_version": "2"},
+ "metadata": {"sword_version": "2"},
+}
+metadata_fetcher2 = {
+ "name": "swh-example",
+ "version": "0.0.1",
+ "metadata": {},
}
date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
@@ -456,22 +465,52 @@
}
origin_metadata = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "json",
+ "metadata": b'{"foo": "bar"}',
}
origin_metadata2 = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
+}
+origin_metadata3 = {
+ "origin_url": origin["url"],
+ "discovery_date": datetime.datetime(
+ 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "authority": {
+ "type": metadata_authority2["type"],
+ "url": metadata_authority2["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher2["name"],
+ "version": metadata_fetcher2["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
}
person = {
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -335,34 +335,6 @@
def test_person_get(self):
pass
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_add_idempotent(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get_by(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get_by_provider_type(self):
- pass
-
@pytest.mark.skip("Not supported by Cassandra")
def test_origin_count(self):
pass
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Dict
from unittest.mock import call
import psycopg2
@@ -421,156 +420,152 @@
)
-def test_retrying_proxy_storage_tool_add(swh_storage, sample_data):
- """Standard tool_add works as before
+def test_retrying_proxy_storage_metadata_fetcher_add(swh_storage, sample_data):
+ """Standard metadata_fetcher_add works as before
"""
- sample_tool = sample_data["tool"][0]
+ fetcher = sample_data["fetcher"][0]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ metadata_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not metadata_fetcher
- tools = swh_storage.tool_add([sample_tool])
- assert tools
- tool = tools[0]
- tool.pop("id")
- assert tool == sample_tool
+ swh_storage.metadata_fetcher_add(**fetcher)
- tool = swh_storage.tool_get(sample_tool)
- tool.pop("id")
- assert tool == sample_tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert actual_fetcher == fetcher
-def test_retrying_proxy_storage_tool_add_with_retry(
+def test_retrying_proxy_storage_metadata_fetcher_add_with_retry(
monkeypatch_sleep, swh_storage, sample_data, mocker, fake_hash_collision
):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
- sample_tool = sample_data["tool"][0]
- mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add")
+ fetcher = sample_data["fetcher"][0]
+ mock_memory = mocker.patch(
+ "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add"
+ )
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("tool already inserted"),
+ psycopg2.IntegrityError("metadata_fetcher already inserted"),
# ok then!
- [sample_tool],
+ [fetcher],
]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not actual_fetcher
- tools = swh_storage.tool_add([sample_tool])
- assert tools == [sample_tool]
+ swh_storage.metadata_fetcher_add(**fetcher)
mock_memory.assert_has_calls(
- [call([sample_tool]), call([sample_tool]), call([sample_tool]),]
+ [
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ ]
)
-def test_retrying_proxy_swh_storage_tool_add_failure(swh_storage, sample_data, mocker):
+def test_retrying_proxy_swh_storage_metadata_fetcher_add_failure(
+ swh_storage, sample_data, mocker
+):
"""Unfiltered errors are raising without retry
"""
- mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add")
- mock_memory.side_effect = StorageArgumentException("Refuse to add tool always!")
+ mock_memory = mocker.patch(
+ "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add"
+ )
+ mock_memory.side_effect = StorageArgumentException(
+ "Refuse to add metadata_fetcher always!"
+ )
- sample_tool = sample_data["tool"][0]
+ fetcher = sample_data["fetcher"][0]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not actual_fetcher
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.tool_add([sample_tool])
+ swh_storage.metadata_fetcher_add(**fetcher)
assert mock_memory.call_count == 1
-def to_provider(provider: Dict) -> Dict:
- return {
- "provider_name": provider["name"],
- "provider_url": provider["url"],
- "provider_type": provider["type"],
- "metadata": provider["metadata"],
- }
-
-
-def test_retrying_proxy_storage_metadata_provider_add(swh_storage, sample_data):
- """Standard metadata_provider_add works as before
+def test_retrying_proxy_storage_metadata_authority_add(swh_storage, sample_data):
+ """Standard metadata_authority_add works as before
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id
+ swh_storage.metadata_authority_add(**authority)
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- assert actual_provider
- actual_provider_id = actual_provider.pop("id")
- assert actual_provider_id == provider_id
- assert actual_provider == provider_get
+ actual_authority = swh_storage.metadata_authority_get(
+ authority["type"], authority["url"]
+ )
+ assert actual_authority == authority
-def test_retrying_proxy_storage_metadata_provider_add_with_retry(
+def test_retrying_proxy_storage_metadata_authority_add_with_retry(
monkeypatch_sleep, swh_storage, sample_data, mocker, fake_hash_collision
):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
- "provider_id",
+ None,
]
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id == "provider_id"
+ swh_storage.metadata_authority_add(**authority)
- provider_arg_names = ("provider_name", "provider_type", "provider_url", "metadata")
- provider_args = [provider_get[key] for key in provider_arg_names]
+ authority_arg_names = ("type", "url", "metadata")
+ authority_args = [authority[key] for key in authority_arg_names]
mock_memory.assert_has_calls(
- [call(*provider_args), call(*provider_args), call(*provider_args),]
+ [call(*authority_args), call(*authority_args), call(*authority_args),]
)
-def test_retrying_proxy_swh_storage_metadata_provider_add_failure(
+def test_retrying_proxy_swh_storage_metadata_authority_add_failure(
swh_storage, sample_data, mocker
):
"""Unfiltered errors are raising without retry
"""
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = StorageArgumentException(
- "Refuse to add provider_id always!"
+ "Refuse to add authority_id always!"
)
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider_id = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider_id
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.metadata_authority_add(**authority)
assert mock_memory.call_count == 1
@@ -580,23 +575,20 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert not origin_metadata
- swh_storage.origin_metadata_add(
- origin["url"],
- ori_meta["discovery_date"],
- provider_id,
- ori_meta["tool"],
- ori_meta["metadata"],
- )
+ swh_storage.origin_metadata_add(**ori_meta)
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert origin_metadata
@@ -607,10 +599,9 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
mock_memory = mocker.patch(
"swh.storage.in_memory.InMemoryStorage.origin_metadata_add"
)
@@ -619,24 +610,40 @@
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
None,
]
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
-
# No exception raised as insertion finally came through
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
mock_memory.assert_has_calls(
[ # 3 calls, as long as error raised
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
]
)
@@ -653,17 +660,10 @@
mock_memory.side_effect = StorageArgumentException("Refuse to add always!")
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
-
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- provider_id = "provider_id"
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
assert mock_memory.call_count == 1
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3174,374 +3174,99 @@
assert expected == ret
- def test_tool_add(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
- assert actual_tool is None # does not exist
-
- # add it
- actual_tools = swh_storage.tool_add([tool])
-
- assert len(actual_tools) == 1
- actual_tool = actual_tools[0]
- assert actual_tool is not None # now it exists
- new_id = actual_tool.pop("id")
- assert actual_tool == tool
-
- actual_tools2 = swh_storage.tool_add([tool])
- actual_tool2 = actual_tools2[0]
- assert actual_tool2 is not None # now it exists
- new_id2 = actual_tool2.pop("id")
-
- assert new_id == new_id2
- assert actual_tool == actual_tool2
-
- def test_tool_add_multiple(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tools = list(swh_storage.tool_add([tool]))
- assert len(actual_tools) == 1
-
- new_tools = [
- tool,
- {"name": "yet-another-tool", "version": "version", "configuration": {},},
- ]
-
- actual_tools = swh_storage.tool_add(new_tools)
- assert len(actual_tools) == 2
-
- # order not guaranteed, so we iterate over results to check
- for tool in actual_tools:
- _id = tool.pop("id")
- assert _id is not None
- assert tool in new_tools
-
- def test_tool_get_missing(self, swh_storage):
- tool = {
- "name": "unknown-tool",
- "version": "3.1.0rc2-31-ga2cbb8c",
- "configuration": {"command_line": "nomossa <filepath>"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
-
- def test_tool_metadata_get_missing_context(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"context": "unknown-context"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
+ def test_metadata_fetcher_add_get(self, swh_storage):
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
+ assert actual_fetcher is None # does not exist
- def test_tool_metadata_get(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"type": "local", "context": "npm"},
- }
- expected_tool = swh_storage.tool_add([tool])[0]
+ swh_storage.metadata_fetcher_add(**data.metadata_fetcher)
- # when
- actual_tool = swh_storage.tool_get(tool)
+ res = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
- # then
- assert expected_tool == actual_tool
+ assert res is not data.metadata_fetcher
+ assert res == data.metadata_fetcher
- def test_metadata_provider_get(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get(6459456445615)
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ def test_metadata_authority_add_get(self, swh_storage):
+ actual_authority = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
+ assert actual_authority is None # does not exist
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- expected_provider = {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- # then
- del actual_provider["id"]
- assert actual_provider, expected_provider
+ swh_storage.metadata_authority_add(**data.metadata_authority)
- def test_metadata_provider_get_by(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ res = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
- actual_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- # then
- assert provider_id, actual_provider["id"]
+ assert res is not data.metadata_authority
+ assert res == data.metadata_authority
def test_origin_metadata_add(self, swh_storage):
- # given
origin = data.origin
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
swh_storage.origin_add([origin])[0]
- tools = swh_storage.tool_add([data.metadata_tool])
- tool = tools[0]
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
+ swh_storage.origin_metadata_add(**data.origin_metadata)
+ swh_storage.origin_metadata_add(**data.origin_metadata2)
- # when adding for the same origin 2 metadatas
- n_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- swh_storage.origin_metadata_add(
- origin["url"],
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin["url"],
- "2015-01-01 23:00:00+00",
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
+ swh_storage.origin_metadata_get(origin["url"], authority)
+
+ assert [data.origin_metadata, data.origin_metadata2] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin["url"], authority),
+ key=lambda x: x["discovery_date"],
+ )
)
- n_actual_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- # then
- assert n_actual_om == n_om + 2
def test_origin_metadata_get(self, swh_storage):
- # given
- origin_url = data.origin["url"]
+ authority = data.metadata_authority
+ fetcher = data.metadata_fetcher
+ authority2 = data.metadata_authority2
+ fetcher2 = data.metadata_fetcher2
+ origin_url1 = data.origin["url"]
origin_url2 = data.origin2["url"]
swh_storage.origin_add([data.origin])
swh_storage.origin_add([data.origin2])
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- tool = swh_storage.tool_add([data.metadata_tool])[0]
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- all_metadatas = list(
+ origin1_metadata1 = data.origin_metadata
+ origin1_metadata2 = data.origin_metadata2
+ origin1_metadata3 = data.origin_metadata3
+ origin2_metadata = {**data.origin_metadata2, "origin_url": origin_url2}
+
+ swh_storage.metadata_authority_add(**authority)
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority2)
+ swh_storage.metadata_fetcher_add(**fetcher2)
+
+ swh_storage.origin_metadata_add(**origin1_metadata1)
+ swh_storage.origin_metadata_add(**origin1_metadata2)
+ swh_storage.origin_metadata_add(**origin1_metadata3)
+ swh_storage.origin_metadata_add(**origin2_metadata)
+
+ assert [origin1_metadata1, origin1_metadata2] == list(
sorted(
- swh_storage.origin_metadata_get_by(origin_url),
+ swh_storage.origin_metadata_get(origin_url1, authority),
key=lambda x: x["discovery_date"],
)
)
- metadatas_for_origin2 = list(swh_storage.origin_metadata_get_by(origin_url2))
- expected_results = [
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2015, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- ]
-
- # then
- assert len(all_metadatas) == 2
- assert len(metadatas_for_origin2) == 1
- assert all_metadatas == expected_results
-
- def test_metadata_provider_add(self, swh_storage):
- provider = {
- "provider_name": "swMATH",
- "provider_type": "registry",
- "provider_url": "http://www.swmath.org/",
- "metadata": {
- "email": "contact@swmath.org",
- "license": "All rights reserved",
- },
- }
- provider["id"] = provider_id = swh_storage.metadata_provider_add(**provider)
- assert provider == swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert provider == swh_storage.metadata_provider_get(provider_id)
-
- def test_metadata_provider_add_idempotent(self, swh_storage):
- provider = {
- "provider_name": "swMATH",
- "provider_type": "registry",
- "provider_url": "http://www.swmath.org/",
- "metadata": {
- "email": "contact@swmath.org",
- "license": "All rights reserved",
- },
- }
- provider_id = swh_storage.metadata_provider_add(**provider)
- expected_provider = {**provider, "id": provider_id}
- assert expected_provider == swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert expected_provider == swh_storage.metadata_provider_get(provider_id)
-
- provider_id2 = swh_storage.metadata_provider_add(**provider)
- assert provider_id2 == provider_id
-
- def test_origin_metadata_get_by_provider_type(self, swh_storage):
- # given
- origin_url = data.origin["url"]
- origin_url2 = data.origin2["url"]
- swh_storage.origin_add([data.origin])
- swh_storage.origin_add([data.origin2])
- provider1_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider1 = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
+ assert [origin1_metadata3] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin_url1, authority2),
+ key=lambda x: x["discovery_date"],
+ )
)
- assert provider1 == swh_storage.metadata_provider_get(provider1_id)
- provider2_id = swh_storage.metadata_provider_add(
- "swMATH",
- "registry",
- "http://www.swmath.org/",
- {"email": "contact@swmath.org", "license": "All rights reserved"},
- )
- provider2 = swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
+ assert [origin2_metadata] == list(
+ swh_storage.origin_metadata_get(origin_url2, authority)
)
- assert provider2 == swh_storage.metadata_provider_get(provider2_id)
-
- # using the only tool now inserted in the data.sql, but for this
- # provider should be a crawler tool (not yet implemented)
- tool = swh_storage.tool_add([data.metadata_tool])[0]
-
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider1["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider2["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- provider_type = "registry"
- m_by_provider = list(
- swh_storage.origin_metadata_get_by(origin_url2, provider_type)
- )
- for item in m_by_provider:
- if "id" in item:
- del item["id"]
- expected_results = [
- {
- "origin_url": origin_url2,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider2["id"],
- "provider_name": "swMATH",
- "provider_type": provider_type,
- "provider_url": "http://www.swmath.org/",
- "tool_id": tool["id"],
- }
- ]
- # then
-
- assert len(m_by_provider) == 1
- assert m_by_provider == expected_results
class TestStorageGeneratedData:

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 9:21 AM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224921

Event Timeline