Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066443
D2988.id11243.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
93 KB
Subscribers
None
D2988.id11243.diff
View Options
diff --git a/docs/extrinsic-metadata-specification.rst b/docs/extrinsic-metadata-specification.rst
--- a/docs/extrinsic-metadata-specification.rst
+++ b/docs/extrinsic-metadata-specification.rst
@@ -32,11 +32,11 @@
An authority is uniquely defined by these properties:
* its type, representing the kind of authority, which is one of these values:
- * `deposit`, for metadata pushed to Software Heritage at the same time
- as a software artifact
- * `forge`, for metadata pulled from the same source as the one hosting
- the software artifacts (which includes package managers)
- * `registry`, for metadata pulled from a third-party
+ * `deposit`, for metadata pushed to Software Heritage at the same time
+ as a software artifact
+ * `forge`, for metadata pulled from the same source as the one hosting
+ the software artifacts (which includes package managers)
+ * `registry`, for metadata pulled from a third-party
* its URL, which unambiguously identifies an instance of the authority type.
Examples:
@@ -145,6 +145,7 @@
added from this origin, in the format::
{
+ 'origin_url': ...,
'authority': {'type': ..., 'url': ...},
'fetcher': {'name': ..., 'version': ...},
'discovery_date': ...,
diff --git a/sql/upgrades/149.sql b/sql/upgrades/149.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/149.sql
@@ -0,0 +1,105 @@
+-- SWH DB schema upgrade
+-- from_version: 148
+-- to_version: 149
+-- description: Implement extrinsic origin-metadata specification
+
+-- latest schema version
+insert into dbversion(version, release, description)
+     values(149, now(), 'Implement extrinsic origin-metadata specification');
+
+-- metadata_fetcher
+
+alter table tool
+ rename to metadata_fetcher;
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+
+alter table metadata_fetcher
+ rename column configuration to metadata;
+
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
+
+alter index tool_pkey
+ rename to metadata_fetcher_pkey;
+create unique index metadata_fetcher_name_version
+ on metadata_fetcher(name, version);
+
+drop index tool_tool_name_tool_version_tool_configuration_idx;
+ -- was an index on (name, version, configuration)
+ -- this is the name in production; in new setups it would be called tool_name_version_configuration_idx
+ -- drop index tool_name_version_configuration_idx;
+
+-- metadata_authority
+
+alter table metadata_provider
+ rename to metadata_authority;
+comment on table metadata_authority is 'Metadata authority information';
+
+drop index metadata_provider_provider_name_provider_url_idx;
+ -- was an index on (provider_name, provider_url)
+
+alter table metadata_authority
+ drop column provider_name;
+alter table metadata_authority
+ rename column provider_type to type;
+alter table metadata_authority
+ rename column provider_url to url;
+
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
+
+alter index metadata_provider_pkey
+ rename to metadata_authority_pkey;
+alter index metadata_provider_type_url
+ rename to metadata_authority_type_url;
+
+-- origin_metadata
+
+alter table origin_metadata
+ rename column provider_id to authority_id;
+alter table origin_metadata
+ rename column tool_id to fetcher_id;
+alter table origin_metadata
+ add column format text default 'sword-v2-atom-codemeta-v2-in-json';
+alter table origin_metadata
+ rename column metadata to metadata_jsonb;
+alter table origin_metadata
+ add column metadata bytea;
+
+-- migrates metadata_jsonb (a jsonb) to metadata (a bytea)
+--update origin_metadata
+-- set metadata=metadata_jsonb::text::bytea;
+update origin_metadata
+ set metadata=convert_to(metadata_jsonb::text, 'utf-8');
+
+create index origin_metadata_origin_authority_date
+ on origin_metadata(origin_id, authority_id, discovery_date);
+
+drop index origin_metadata_origin_id_provider_id_tool_id_idx;
+ -- was an index on (origin_id, provider_id, tool_id)
+
+alter table origin_metadata
+ drop column metadata_jsonb;
+
+comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
+
+
+-- cleanup unused functions
+
+drop function swh_mktemp_tool;
+drop function swh_tool_add;
+
+drop function swh_origin_metadata_get_by_origin(text);
+drop function swh_origin_metadata_get_by_provider_type(text, text);
+
+drop function swh_origin_metadata_get_by_origin(int);
+drop function swh_origin_metadata_get_by_provider_type(int, text);
+
+drop type origin_metadata_signature;
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -794,34 +794,100 @@
yield from self._origin_visit_iter_to(start_token)
##########################
- # 'tool' table
+ # 'metadata_authority' table
##########################
- _tool_keys = ["id", "name", "version", "configuration"]
+ _metadata_authority_keys = ["url", "type", "metadata"]
- @_prepared_insert_statement("tool_by_uuid", _tool_keys)
- def tool_by_uuid_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
+ @_prepared_insert_statement("metadata_authority", _metadata_authority_keys)
+ def metadata_authority_add(self, url, type, metadata, *, statement):
+ return self._execute_with_retries(statement, [url, type, metadata])
- @_prepared_insert_statement("tool", _tool_keys)
- def tool_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
- self._increment_counter("tool", 1)
+ @_prepared_statement("SELECT * from metadata_authority WHERE type = ? AND url = ?")
+ def metadata_authority_get(self, type, url, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [type, url])), None)
+
+ ##########################
+ # 'metadata_fetcher' table
+ ##########################
+
+ _metadata_fetcher_keys = ["name", "version", "metadata"]
+
+ @_prepared_insert_statement("metadata_fetcher", _metadata_fetcher_keys)
+ def metadata_fetcher_add(self, name, version, metadata, *, statement):
+ return self._execute_with_retries(statement, [name, version, metadata])
@_prepared_statement(
- "SELECT id FROM tool " "WHERE name = ? AND version = ? " "AND configuration = ?"
+ "SELECT * from metadata_fetcher WHERE name = ? AND version = ?"
)
- def tool_get_one_uuid(
- self, name: str, version: str, configuration: Dict[str, Any], *, statement
- ) -> Optional[str]:
- rows = list(
- self._execute_with_retries(statement, [name, version, configuration])
+ def metadata_fetcher_get(self, name, version, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [name, version])), None)
+
+ ##########################
+ # 'origin_metadata' table
+ ##########################
+
+ _origin_metadata_keys = [
+ "origin",
+ "authority_type",
+ "authority_url",
+ "discovery_date",
+ "fetcher_name",
+ "fetcher_version",
+ "format",
+ "metadata",
+ ]
+
+ @_prepared_insert_statement("origin_metadata", _origin_metadata_keys)
+ def origin_metadata_add(
+ self,
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ *,
+ statement,
+ ):
+ return self._execute_with_retries(
+ statement,
+ [
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ ],
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND discovery_date>=? "
+ "AND authority_type=?"
+ )
+ def origin_metadata_get_after(
+ self, origin, authority_type, authority_url, after, *, statement
+ ):
+ return self._execute_with_retries(
+ statement, [origin, authority_url, after, authority_type]
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND authority_type=?"
+ )
+ def origin_metadata_get(
+ self, origin, authority_type, authority_url, *, statement
+ ) -> Iterable[Row]:
+ return self._execute_with_retries(
+ statement, [origin, authority_url, authority_type]
)
- if rows:
- assert len(rows) == 1
- return rows[0].id
- else:
- return None
##########################
# Miscellaneous
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -175,21 +175,37 @@
);
-CREATE TABLE IF NOT EXISTS tool_by_uuid (
- id timeuuid PRIMARY KEY,
- name ascii,
- version ascii,
- configuration blob,
+CREATE TABLE IF NOT EXISTS metadata_authority (
+ url text,
+ type ascii,
+ metadata text,
+ PRIMARY KEY ((url), type)
);
-CREATE TABLE IF NOT EXISTS tool (
- id timeuuid,
+CREATE TABLE IF NOT EXISTS metadata_fetcher (
name ascii,
version ascii,
- configuration blob,
- PRIMARY KEY ((name, version, configuration))
-)
+ metadata text,
+ PRIMARY KEY ((name), version)
+);
+
+
+CREATE TABLE IF NOT EXISTS origin_metadata (
+ origin text,
+ authority_type text,
+ authority_url text,
+ discovery_date timestamp,
+ fetcher_name ascii,
+ fetcher_version ascii,
+ format ascii,
+ metadata blob,
+ PRIMARY KEY ((origin), authority_type, authority_url, discovery_date,
+                 fetcher_name, fetcher_version)
+    -- for now, authority_url could be in the partition key; but leaving
+    -- it in the clustering key allows listing authorities with metadata on
+    -- an origin if we ever need to do it.
+);
CREATE TABLE IF NOT EXISTS object_count (
@@ -220,8 +236,9 @@
TABLES = (
"skipped_content content revision revision_parent release "
"directory directory_entry snapshot snapshot_branch "
- "origin_visit origin tool_by_uuid tool object_count "
- "origin_visit_status"
+ "origin_visit origin origin_metadata object_count "
+ "origin_visit_status metadata_authority "
+ "metadata_fetcher"
).split()
HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -4,11 +4,11 @@
# See top-level LICENSE file for more information
import datetime
+import itertools
import json
import random
import re
from typing import Any, Dict, List, Iterable, Optional, Union
-import uuid
import attr
import dateutil
@@ -1034,37 +1034,6 @@
else:
return None
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- tool = tool.copy()
- tool_json = tool.copy()
- tool_json["configuration"] = json.dumps(
- tool["configuration"], sort_keys=True
- ).encode()
- id_ = self._cql_runner.tool_get_one_uuid(**tool_json)
- if not id_:
- id_ = uuid.uuid1()
- tool_json["id"] = id_
- self._cql_runner.tool_by_uuid_add_one(tool_json)
- self._cql_runner.tool_add_one(tool_json)
- tool["id"] = id_
- inserted.append(tool)
- return inserted
-
- def tool_get(self, tool):
- id_ = self._cql_runner.tool_get_one_uuid(
- tool["name"],
- tool["version"],
- json.dumps(tool["configuration"], sort_keys=True).encode(),
- )
- if id_:
- tool = tool.copy()
- tool["id"] = id_
- return tool
- else:
- return None
-
def stat_counters(self):
rows = self._cql_runner.stat_counters()
keys = (
@@ -1084,27 +1053,109 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ if not isinstance(origin_url, str):
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ if not self._cql_runner.metadata_authority_get(**authority):
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ if not self._cql_runner.metadata_fetcher_get(**fetcher):
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+
+ self._cql_runner.origin_metadata_add(
+ origin_url,
+ authority["type"],
+ authority["url"],
+ discovery_date,
+ fetcher["name"],
+ fetcher["version"],
+ format,
+ metadata,
+ )
- def origin_metadata_get_by(self, origin_url, provider_type=None):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
+ if not isinstance(origin_url, str):
+ raise TypeError("origin_url must be str, not %r" % (origin_url,))
+
+ if after is None:
+ entries = self._cql_runner.origin_metadata_get(
+ origin_url, authority["type"], authority["url"]
+ )
+ else:
+ entries = self._cql_runner.origin_metadata_get_after(
+ origin_url, authority["type"], authority["url"], after
+ )
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
- def metadata_provider_get(self, provider_id):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ results = []
+ for entry in entries:
+ discovery_date = entry.discovery_date.replace(tzinfo=datetime.timezone.utc)
+ results.append(
+ {
+ "origin_url": entry.origin,
+ "authority": {
+ "type": entry.authority_type,
+ "url": entry.authority_url,
+ },
+ "fetcher": {
+ "name": entry.fetcher_name,
+ "version": entry.fetcher_version,
+ },
+ "discovery_date": discovery_date,
+ "format": entry.format,
+ "metadata": entry.metadata,
+ }
+ )
+ return results
+
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_fetcher_add(name, version, json.dumps(metadata))
+
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ fetcher = self._cql_runner.metadata_fetcher_get(name, version)
+ if fetcher:
+ return {
+ "name": fetcher.name,
+ "version": fetcher.version,
+ "metadata": json.loads(fetcher.metadata),
+ }
+ else:
+ return None
- def metadata_provider_get_by(self, provider):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_authority_add(url, type, json.dumps(metadata))
+
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ authority = self._cql_runner.metadata_authority_get(type, url)
+ if authority:
+ return {
+ "type": authority.type,
+ "url": authority.url,
+ "metadata": json.loads(authority.metadata),
+ }
+ else:
+ return None
def clear_buffers(self, object_types: Optional[Iterable[str]] = None) -> None:
"""Do nothing
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -3,9 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import datetime
import random
import select
-
from typing import Any, Dict, Optional, Tuple
from swh.core.db import BaseDb
@@ -1059,159 +1059,150 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
- def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None):
+ origin_metadata_get_cols = [
+ "origin.url",
+ "discovery_date",
+ "metadata_authority.type",
+ "metadata_authority.url",
+ "metadata_fetcher.name",
+ "metadata_fetcher.version",
+ "format",
+ "metadata",
+ ]
+
+ def origin_metadata_add(
+ self,
+ origin: str,
+ discovery_date: datetime.datetime,
+ authority: int,
+ fetcher: int,
+ format: str,
+ metadata: bytes,
+ cur=None,
+ ) -> None:
""" Add an origin_metadata for the origin at ts with provider, tool and
metadata.
Args:
- origin (int): the origin's id for which the metadata is added
- ts (datetime): time when the metadata was found
- provider (int): the metadata provider identifier
- tool (int): the tool's identifier used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
-
- Returns:
- id (int): the origin_metadata unique id
-
+ origin: the origin's id for which the metadata is added
+ discovery_date: time when the metadata was found
+ authority: the metadata provider identifier
+ fetcher: the tool's identifier used to extract metadata
+ format: the format of the metadata
+ metadata: the metadata retrieved at the time and location
"""
cur = self._cursor(cur)
insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
- provider_id, tool_id, metadata)
- SELECT id, %s, %s, %s, %s FROM origin WHERE url = %s"""
- cur.execute(insert, (ts, provider, tool, jsonize(metadata), origin))
-
- origin_metadata_get_cols = [
- "origin_url",
- "discovery_date",
- "tool_id",
- "metadata",
- "provider_id",
- "provider_name",
- "provider_type",
- "provider_url",
- ]
-
- def origin_metadata_get_by(self, origin_url, provider_type=None, cur=None):
- """Retrieve all origin_metadata entries for one origin_url
+ authority_id, fetcher_id, format, metadata)
+ SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s"""
+ cur.execute(
+ insert,
+ (discovery_date, authority, fetcher, format, jsonize(metadata), origin),
+ )
- """
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: int,
+ after: Optional[datetime.datetime],
+ limit: Optional[int],
+ cur=None,
+ ):
cur = self._cursor(cur)
- if not provider_type:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_origin(
- %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ assert self.origin_metadata_get_cols[-1] == "metadata"
+ query_parts = [
+ f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, "
+ f" origin_metadata.metadata AS metadata "
+ f"FROM origin_metadata "
+ f"INNER JOIN metadata_authority "
+ f" ON (metadata_authority.id=authority_id) "
+ f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
+ f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
+ f"WHERE origin.url=%s AND authority_id=%s"
+ ]
+ args = [origin_url, authority]
- cur.execute(query, (origin_url,))
+ if after:
+ query_parts.append("AND discovery_date >= %s")
+ args.append(after)
- else:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_provider_type(
- %%s, %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ query_parts.append("ORDER BY discovery_date")
- cur.execute(query, (origin_url, provider_type))
+ if limit:
+ query_parts.append("LIMIT %s")
+ args.append(limit)
+ cur.execute(" ".join(query_parts), args)
yield from cur
- tool_cols = ["id", "name", "version", "configuration"]
-
- @stored_procedure("swh_mktemp_tool")
- def mktemp_tool(self, cur=None):
- pass
+ metadata_fetcher_cols = ["name", "version", "metadata"]
- def tool_add_from_temp(self, cur=None):
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: bytes, cur=None
+ ) -> None:
cur = self._cursor(cur)
- cur.execute("SELECT %s from swh_tool_add()" % (",".join(self.tool_cols),))
- yield from cur
+ cur.execute(
+ "INSERT INTO metadata_fetcher (name, version, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (name, version, jsonize(metadata)),
+ )
- def tool_get(self, name, version, configuration, cur=None):
+ def metadata_fetcher_get(self, name: str, version: str, cur=None):
cur = self._cursor(cur)
cur.execute(
- """select %s
- from tool
- where name=%%s and
- version=%%s and
- configuration=%%s"""
- % (",".join(self.tool_cols)),
- (name, version, configuration),
+ f"SELECT {', '.join(self.metadata_fetcher_cols)} "
+ f"FROM metadata_fetcher "
+ f"WHERE name=%s AND version=%s",
+ (name, version),
)
-
return cur.fetchone()
- metadata_provider_cols = [
- "id",
- "provider_name",
- "provider_type",
- "provider_url",
- "metadata",
- ]
-
- def metadata_provider_add(
- self,
- provider_name: str,
- provider_type: str,
- provider_url: str,
- metadata: Dict,
- cur=None,
- ) -> int:
- """Insert a new provider and return the new identifier."""
+ def metadata_fetcher_get_id(
+ self, name: str, version: str, cur=None
+ ) -> Optional[int]:
cur = self._cursor(cur)
- insert = """
- INSERT INTO metadata_provider (provider_name, provider_type,
- provider_url, metadata) values (%s, %s, %s, %s)
- ON CONFLICT(provider_type, provider_url) do nothing
- """
cur.execute(
- insert, (provider_name, provider_type, provider_url, jsonize(metadata))
- )
- row = self.metadata_provider_get_by_composite_key(
- provider_type, provider_url, cur=cur
+ "SELECT id FROM metadata_fetcher WHERE name=%s AND version=%s",
+ (name, version),
)
- return row[0]
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
- def metadata_provider_get_by_composite_key(
- self, provider_type: str, provider_url: str, cur=None
- ) -> Tuple:
- """Retrieve metadata provider by its composite primary key.
+ metadata_authority_cols = ["type", "url", "metadata"]
- """
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: bytes, cur=None
+ ) -> None:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where provider_type=%%s and provider_url=%%s"""
- % (",".join(self.metadata_provider_cols)),
- (provider_type, provider_url,),
+ "INSERT INTO metadata_authority (type, url, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (type, url, jsonize(metadata)),
)
- return cur.fetchone()
- def metadata_provider_get(self, provider_id, cur=None):
+ def metadata_authority_get(self, type: str, url: str, cur=None):
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where id=%%s """
- % (",".join(self.metadata_provider_cols)),
- (provider_id,),
+ f"SELECT {', '.join(self.metadata_authority_cols)} "
+ f"FROM metadata_authority "
+ f"WHERE type=%s AND url=%s",
+ (type, url),
)
-
return cur.fetchone()
- def metadata_provider_get_by(self, provider_name, provider_url, cur=None):
+ def metadata_authority_get_id(self, type: str, url: str, cur=None) -> Optional[int]:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where provider_name=%%s and
- provider_url=%%s"""
- % (",".join(self.metadata_provider_cols)),
- (provider_name, provider_url),
+ "SELECT id FROM metadata_authority WHERE type=%s AND url=%s", (type, url)
)
-
- return cur.fetchone()
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
def _get_random_row_from_table(self, table_name, cols, id_col, cur=None):
random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE))
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -19,6 +19,7 @@
Callable,
Dict,
Generic,
+ Hashable,
Iterable,
Iterator,
List,
@@ -123,11 +124,17 @@
self._origin_visits = {}
self._origin_visit_statuses: Dict[Tuple[str, int], List[OriginVisitStatus]] = {}
self._persons = []
- self._origin_metadata = defaultdict(list)
- self._tools = {}
- self._metadata_providers = {}
- self._objects = defaultdict(list)
+ # {origin_url: {authority: [metadata]}}
+ self._origin_metadata: Dict[
+ str, Dict[Hashable, SortedList[datetime.datetime, Dict[str, Any]]]
+ ] = defaultdict(
+ lambda: defaultdict(lambda: SortedList(key=lambda x: x["discovery_date"]))
+ ) # noqa
+
+ self._metadata_fetchers: Dict[Hashable, Dict[str, Any]] = {}
+ self._metadata_authorities: Dict[Hashable, Dict[str, Any]] = {}
+ self._objects = defaultdict(list)
self._sorted_sha1s = SortedList[bytes, bytes]()
self.objstorage = ObjStorage({"cls": "memory", "args": {}})
@@ -139,7 +146,6 @@
self.journal_writer.content_add(contents)
content_add = 0
- content_add_bytes = 0
if with_data:
summary = self.objstorage.content_add(
c for c in contents if c.status != "absent"
@@ -1040,71 +1046,104 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
if not isinstance(origin_url, str):
- raise TypeError("origin_id must be str, not %r" % (origin_url,))
-
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ authority_key = self._metadata_authority_key(authority)
+ if authority_key not in self._metadata_authorities:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ fetcher_key = self._metadata_fetcher_key(fetcher)
+ if fetcher_key not in self._metadata_fetchers:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
origin_metadata = {
"origin_url": origin_url,
- "discovery_date": ts,
- "tool_id": tool,
+ "discovery_date": discovery_date,
+ "authority": authority_key,
+ "fetcher": fetcher_key,
+ "format": format,
"metadata": metadata,
- "provider_id": provider,
}
- self._origin_metadata[origin_url].append(origin_metadata)
+ self._origin_metadata[origin_url][authority_key].add(origin_metadata)
return None
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
if not isinstance(origin_url, str):
raise TypeError("origin_url must be str, not %r" % (origin_url,))
- metadata = []
- for item in self._origin_metadata[origin_url]:
- item = copy.deepcopy(item)
- provider = self.metadata_provider_get(item["provider_id"])
- for attr_name in ("name", "type", "url"):
- item["provider_" + attr_name] = provider["provider_" + attr_name]
- metadata.append(item)
- return metadata
-
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- key = self._tool_key(tool)
- assert "id" not in tool
- record = copy.deepcopy(tool)
- record["id"] = key # TODO: remove this
- if key not in self._tools:
- self._tools[key] = record
- inserted.append(copy.deepcopy(self._tools[key]))
-
- return inserted
-
- def tool_get(self, tool):
- return self._tools.get(self._tool_key(tool))
-
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- provider = {
- "provider_name": provider_name,
- "provider_type": provider_type,
- "provider_url": provider_url,
+
+ authority_key = self._metadata_authority_key(authority)
+
+ if after is None:
+ entries = iter(self._origin_metadata[origin_url][authority_key])
+ else:
+ entries = self._origin_metadata[origin_url][authority_key].iter_from(after)
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
+
+ results = []
+ for entry in entries:
+ authority = self._metadata_authorities[entry["authority"]]
+ fetcher = self._metadata_fetchers[entry["fetcher"]]
+ results.append(
+ {
+ **entry,
+ "authority": {"type": authority["type"], "url": authority["url"],},
+ "fetcher": {
+ "name": fetcher["name"],
+ "version": fetcher["version"],
+ },
+ }
+ )
+ return results
+
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ fetcher = {
+ "name": name,
+ "version": version,
"metadata": metadata,
}
- key = self._metadata_provider_key(provider)
- provider["id"] = key
- self._metadata_providers[key] = provider
- return key
+ key = self._metadata_fetcher_key(fetcher)
+ if key not in self._metadata_fetchers:
+ self._metadata_fetchers[key] = fetcher
+
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_fetchers.get(
+ self._metadata_fetcher_key({"name": name, "version": version})
+ )
- def metadata_provider_get(self, provider_id):
- return self._metadata_providers.get(provider_id)
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ authority = {
+ "type": type,
+ "url": url,
+ "metadata": metadata,
+ }
+ key = self._metadata_authority_key(authority)
+ self._metadata_authorities[key] = authority
- def metadata_provider_get_by(self, provider):
- key = self._metadata_provider_key(provider)
- return self._metadata_providers.get(key)
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_authorities.get(
+ self._metadata_authority_key({"type": type, "url": url})
+ )
def _get_origin_url(self, origin):
if isinstance(origin, str):
@@ -1131,16 +1170,12 @@
return tuple((key, content.get(key)) for key in sorted(DEFAULT_ALGORITHMS))
@staticmethod
- def _tool_key(tool):
- return "%r %r %r" % (
- tool["name"],
- tool["version"],
- tuple(sorted(tool["configuration"].items())),
- )
+ def _metadata_fetcher_key(fetcher: Dict) -> Hashable:
+ return (fetcher["name"], fetcher["version"])
@staticmethod
- def _metadata_provider_key(provider):
- return "%r %r" % (provider["provider_name"], provider["provider_url"])
+ def _metadata_authority_key(authority: Dict) -> Hashable:
+ return (authority["type"], authority["url"])
def diff_directories(self, from_dir, to_dir, track_renaming=False):
raise NotImplementedError("InMemoryStorage.diff_directories")
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -1132,120 +1132,118 @@
...
@remote_api_endpoint("origin/metadata/add")
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ """Add an origin_metadata for the origin at discovery_date,
+ obtained using the `fetcher` from the `authority`.
+
+ The authority and fetcher must be known to the storage before
+ using this endpoint.
Args:
- origin_url (str): the origin url for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
+ discovery_date: when the metadata was fetched.
+ authority: a dict containing keys `type` and `url`.
+ fetcher: a dict containing keys `name` and `version`.
+            format: text field indicating the format of the metadata
+ metadata: blob of raw metadata
"""
...
@remote_api_endpoint("origin/metadata/get")
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
"""Retrieve list of all origin_metadata entries for the origin_id
Args:
- origin_url (str): the origin's URL
- provider_type (str): (optional) type of provider
+ origin_url: the origin's URL
+ authority: a dict containing keys `type` and `url`.
+ after: minimum discovery_date for a result to be returned
+ limit: maximum number of results to be returned
Returns:
- list of dicts: the origin_metadata dictionary with the keys:
+ list of dicts in the format:
+
+ .. code-block:: python
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
+ {
+ 'authority': {'type': ..., 'url': ...},
+ 'fetcher': {'name': ..., 'version': ...},
+ 'discovery_date': ...,
+ 'format': '...',
+ 'metadata': b'...'
+ }
"""
...
- @remote_api_endpoint("tool/add")
- def tool_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools (iterable of :class:`dict`): Tool information to add to
- storage. Each tool is a :class:`dict` with the following keys:
+ @remote_api_endpoint("fetcher/add")
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a new metadata fetcher to the storage.
- - name (:class:`str`): name of the tool
- - version (:class:`str`): version of the tool
- - configuration (:class:`dict`): configuration of the tool,
- must be json-encodable
+ `name` and `version` together are a unique identifier of this
+ fetcher; and `metadata` is an arbitrary dict of JSONable data
+ with information about this fetcher.
- Returns:
- :class:`dict`: All the tools inserted in storage
- (including the internal ``id``). The order of the list is not
- guaranteed to match the order of the initial list.
+ Args:
+ name: the name of the fetcher
+ version: version of the fetcher
"""
...
- @remote_api_endpoint("tool/data")
- def tool_get(self, tool):
- """Retrieve tool information.
+ @remote_api_endpoint("fetcher/get")
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about a fetcher
Args:
- tool (dict): Tool information we want to retrieve from storage.
- The dicts have the same keys as those used in :func:`tool_add`.
+ name: the name of the fetcher
+ version: version of the fetcher
Returns:
- dict: The full tool information if it exists (``id`` included),
- None otherwise.
+ dictionary with keys `name`, `version`, and `metadata`; or None
+ if the fetcher is not known
"""
...
- @remote_api_endpoint("provider/add")
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- """Add a metadata provider.
+ @remote_api_endpoint("authority/add")
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a metadata authority
Args:
- provider_name (str): Its name
- provider_type (str): Its type (eg. `'deposit-client'`)
- provider_url (str): Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
metadata: JSON-encodable object
-
- Returns:
- int: an identifier of the provider
- """
- ...
-
- @remote_api_endpoint("provider/get")
- def metadata_provider_get(self, provider_id):
- """Get a metadata provider
-
- Args:
- provider_id: Its identifier, as given by `metadata_provider_add`.
-
- Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
"""
...
- @remote_api_endpoint("provider/getby")
- def metadata_provider_get_by(self, provider):
- """Get a metadata provider
+ @remote_api_endpoint("authority/get")
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about an authority
Args:
- provider (dict): A dictionary with keys:
- * provider_name: Its name
- * provider_url: Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
+ dictionary with keys `type`, `url`, and `metadata`; or None
+ if the authority is not known
"""
...
diff --git a/swh/storage/retry.py b/swh/storage/retry.py
--- a/swh/storage/retry.py
+++ b/swh/storage/retry.py
@@ -7,7 +7,7 @@
import traceback
from datetime import datetime
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, Optional, Union
from tenacity import (
retry,
@@ -127,29 +127,29 @@
)
@swh_retry
- def tool_add(self, tools: Iterable[Dict]) -> List[Dict]:
- tools = list(tools)
- return self.storage.tool_add(tools)
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_fetcher_add(name, version, metadata)
@swh_retry
- def metadata_provider_add(
- self, provider_name: str, provider_type: str, provider_url: str, metadata: Dict
- ) -> Union[str, int]:
- return self.storage.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata
- )
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_authority_add(type, url, metadata)
@swh_retry
def origin_metadata_add(
self,
origin_url: str,
- ts: Union[str, datetime],
- provider_id: int,
- tool_id: int,
- metadata: Dict,
+ discovery_date: datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
) -> None:
return self.storage.origin_metadata_add(
- origin_url, ts, provider_id, tool_id, metadata
+ origin_url, discovery_date, authority, fetcher, format, metadata
)
@swh_retry
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(148, now(), 'Work In Progress');
+ values(149, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -397,35 +397,34 @@
comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp';
-- Tools
-create table tool
+create table metadata_fetcher
(
- id serial not null,
- name text not null,
- version text not null,
- configuration jsonb
+ id serial not null,
+ name text not null,
+ version text not null,
+ metadata jsonb not null
);
-comment on table tool is 'Tool information';
-comment on column tool.id is 'Tool identifier';
-comment on column tool.version is 'Tool name';
-comment on column tool.version is 'Tool version';
-comment on column tool.configuration is 'Tool configuration: command line, flags, etc...';
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
-create table metadata_provider
+create table metadata_authority
(
- id serial not null,
- provider_name text not null,
- provider_type text not null,
- provider_url text,
- metadata jsonb
+ id serial not null,
+ type text not null,
+ url text not null,
+ metadata jsonb not null
);
-comment on table metadata_provider is 'Metadata provider information';
-comment on column metadata_provider.id is 'Provider''s identifier';
-comment on column metadata_provider.provider_name is 'Provider''s name';
-comment on column metadata_provider.provider_url is 'Provider''s url';
-comment on column metadata_provider.metadata is 'Other metadata about provider';
+comment on table metadata_authority is 'Metadata authority information';
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
-- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin
@@ -435,18 +434,20 @@
id bigserial not null, -- PK internal object identifier
origin_id bigint not null, -- references origin(id)
discovery_date timestamptz not null, -- when it was extracted
- provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github'
- tool_id bigint not null,
- metadata jsonb not null
+ authority_id bigint not null,
+ fetcher_id bigint not null,
+ format text not null,
+ metadata bytea not null
);
comment on table origin_metadata is 'keeps all metadata found concerning an origin';
comment on column origin_metadata.id is 'the origin_metadata object''s id';
comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
comment on column origin_metadata.discovery_date is 'the date of retrieval';
-comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.';
-comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.';
-comment on column origin_metadata.metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
-- Keep a cache of object counts
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -101,17 +101,6 @@
) on commit delete rows;
$$;
--- create a temporary table for the tools
-create or replace function swh_mktemp_tool()
- returns void
- language sql
-as $$
- create temporary table if not exists tmp_tool (
- like tool including defaults
- ) on commit delete rows;
- alter table tmp_tool drop column if exists id;
-$$;
-
-- a content signature is a set of cryptographic checksums that we use to
-- uniquely identify content, for the purpose of verifying if we already have
-- some content or not during content injection
@@ -920,76 +909,6 @@
$$;
--- end revision_metadata functions
--- origin_metadata functions
-create type origin_metadata_signature as (
- id bigint,
- origin_url text,
- discovery_date timestamptz,
- tool_id bigint,
- metadata jsonb,
- provider_id integer,
- provider_name text,
- provider_type text,
- provider_url text
-);
-create or replace function swh_origin_metadata_get_by_origin(
- origin text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin
- order by discovery_date desc;
-$$;
-
-create or replace function swh_origin_metadata_get_by_provider_type(
- origin_url text,
- provider_type text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin_url
- and mp.provider_type = provider_type
- order by discovery_date desc;
-$$;
--- end origin_metadata functions
-
--- add tmp_tool entries to tool,
--- skipping duplicates if any.
---
--- operates in bulk: 0. create temporary tmp_tool, 1. COPY to
--- it, 2. call this function to insert and filtering out duplicates
-create or replace function swh_tool_add()
- returns setof tool
- language plpgsql
-as $$
-begin
- insert into tool(name, version, configuration)
- select name, version, configuration from tmp_tool tmp
- on conflict(name, version, configuration) do nothing;
-
- return query
- select id, name, version, configuration
- from tmp_tool join tool
- using(name, version, configuration);
-
- return;
-end
-$$;
-
-
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -155,35 +155,32 @@
alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid;
alter table release validate constraint release_author_date_check;
--- tool
-create unique index tool_pkey on tool(id);
-alter table tool add primary key using index tool_pkey;
+-- metadata_fetcher
+create unique index metadata_fetcher_pkey on metadata_fetcher(id);
+alter table metadata_fetcher add primary key using index metadata_fetcher_pkey;
-create unique index on tool(name, version, configuration);
+create unique index metadata_fetcher_name_version on metadata_fetcher(name, version);
--- metadata_provider
-create unique index concurrently metadata_provider_pkey on metadata_provider(id);
-alter table metadata_provider add primary key using index metadata_provider_pkey;
+-- metadata_authority
+create unique index concurrently metadata_authority_pkey on metadata_authority(id);
+alter table metadata_authority add primary key using index metadata_authority_pkey;
-create index concurrently on metadata_provider(provider_name, provider_url);
-create unique index metadata_provider_type_url
- on metadata_provider(provider_type, provider_url);
+create unique index metadata_authority_type_url on metadata_authority(type, url);
-- origin_metadata
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
-
-create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
+create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
-alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_provider_fkey;
+alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_authority_fkey;
-alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references tool(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_tool_fkey;
+alter table origin_metadata add constraint origin_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_fetcher_fkey;
-- object_counts
create unique index concurrently object_counts_pkey on object_counts(object_type);
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -6,7 +6,6 @@
import contextlib
import datetime
import itertools
-import json
from collections import defaultdict
from contextlib import contextmanager
@@ -1236,72 +1235,101 @@
@timed
@db_transaction()
def origin_metadata_add(
- self, origin_url, ts, provider, tool, metadata, db=None, cur=None
- ):
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
-
- db.origin_metadata_add(origin_url, ts, provider, tool, metadata, cur)
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ db=None,
+ cur=None,
+ ) -> None:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ fetcher_id = db.metadata_fetcher_get_id(
+ fetcher["name"], fetcher["version"], cur
+ )
+ if not fetcher_id:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+ db.origin_metadata_add(
+ origin_url, discovery_date, authority_id, fetcher_id, format, metadata, cur
+ )
send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add")
@timed
- @db_transaction_generator(statement_timeout=500)
- def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, cur=None):
- for line in db.origin_metadata_get_by(origin_url, provider_type, cur):
- yield dict(zip(db.origin_metadata_get_cols, line))
+ @db_transaction(statement_timeout=500)
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ db=None,
+ cur=None,
+ ) -> List[Dict[str, Any]]:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ return []
+ results = []
+ for line in db.origin_metadata_get(origin_url, authority_id, after, limit, cur):
+ row = dict(zip(db.origin_metadata_get_cols, line))
+ results.append(
+ {
+ "origin_url": row.pop("origin.url"),
+ "authority": {
+ "type": row.pop("metadata_authority.type"),
+ "url": row.pop("metadata_authority.url"),
+ },
+ "fetcher": {
+ "name": row.pop("metadata_fetcher.name"),
+ "version": row.pop("metadata_fetcher.version"),
+ },
+ **row,
+ }
+ )
+ return results
@timed
@db_transaction()
- def tool_add(self, tools, db=None, cur=None):
- db.mktemp_tool(cur)
- with convert_validation_exceptions():
- db.copy_to(tools, "tmp_tool", ["name", "version", "configuration"], cur)
- tools = db.tool_add_from_temp(cur)
-
- results = [dict(zip(db.tool_cols, line)) for line in tools]
- send_metric("tool:add", count=len(results), method_name="tool_add")
- return results
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_fetcher_add(name, version, metadata, cur)
+ send_metric("metadata_fetcher:add", count=1, method_name="metadata_fetcher")
@timed
@db_transaction(statement_timeout=500)
- def tool_get(self, tool, db=None, cur=None):
- tool_conf = tool["configuration"]
- if isinstance(tool_conf, dict):
- tool_conf = json.dumps(tool_conf)
-
- idx = db.tool_get(tool["name"], tool["version"], tool_conf)
- if not idx:
+ def metadata_fetcher_get(
+ self, name: str, version: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_fetcher_get(name, version, cur=cur)
+ if not row:
return None
- return dict(zip(db.tool_cols, idx))
-
- @timed
- @db_transaction()
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, db=None, cur=None
- ):
- result = db.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata, cur
- )
- send_metric("metadata_provider:add", count=1, method_name="metadata_provider")
- return result
+ return dict(zip(db.metadata_fetcher_cols, row))
@timed
@db_transaction()
- def metadata_provider_get(self, provider_id, db=None, cur=None):
- result = db.metadata_provider_get(provider_id)
- if not result:
- return None
- return dict(zip(db.metadata_provider_cols, result))
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_authority_add(type, url, metadata, cur)
+ send_metric("metadata_authority:add", count=1, method_name="metadata_authority")
@timed
@db_transaction()
- def metadata_provider_get_by(self, provider, db=None, cur=None):
- result = db.metadata_provider_get_by(
- provider["provider_name"], provider["provider_url"]
- )
- if not result:
+ def metadata_authority_get(
+ self, type: str, url: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_authority_get(type, url, cur=cur)
+ if not row:
return None
- return dict(zip(db.metadata_provider_cols, result))
+ return dict(zip(db.metadata_authority_cols, row))
@timed
def diff_directories(self, from_dir, to_dir, track_renaming=False):
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -246,7 +246,7 @@
"release": [data.release, data.release2, data.release3],
"snapshot": [data.snapshot],
"origin": [data.origin, data.origin2],
- "tool": [data.metadata_tool],
- "provider": [data.provider],
+ "fetcher": [data.metadata_fetcher],
+ "authority": [data.metadata_authority],
"origin_metadata": [data.origin_metadata, data.origin_metadata2],
}
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -326,17 +326,26 @@
origins = (origin, origin2)
-provider = {
- "name": "hal",
- "type": "deposit-client",
- "url": "http:///hal/inria",
+metadata_authority = {
+ "type": "deposit",
+ "url": "http://hal.inria.example.com/",
"metadata": {"location": "France"},
}
+metadata_authority2 = {
+ "type": "registry",
+ "url": "http://wikidata.example.com/",
+ "metadata": {},
+}
-metadata_tool = {
+metadata_fetcher = {
"name": "swh-deposit",
"version": "0.0.1",
- "configuration": {"sword_version": "2"},
+ "metadata": {"sword_version": "2"},
+}
+metadata_fetcher2 = {
+ "name": "swh-example",
+ "version": "0.0.1",
+ "metadata": {},
}
date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
@@ -456,22 +465,52 @@
}
origin_metadata = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "json",
+ "metadata": b'{"foo": "bar"}',
}
origin_metadata2 = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
+}
+origin_metadata3 = {
+ "origin_url": origin["url"],
+ "discovery_date": datetime.datetime(
+ 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "authority": {
+ "type": metadata_authority2["type"],
+ "url": metadata_authority2["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher2["name"],
+ "version": metadata_fetcher2["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
}
person = {
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -335,34 +335,6 @@
def test_person_get(self):
pass
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_add_idempotent(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get_by(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get_by_provider_type(self):
- pass
-
@pytest.mark.skip("Not supported by Cassandra")
def test_origin_count(self):
pass
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Dict
from unittest.mock import call
import psycopg2
@@ -421,156 +420,152 @@
)
-def test_retrying_proxy_storage_tool_add(swh_storage, sample_data):
- """Standard tool_add works as before
+def test_retrying_proxy_storage_metadata_fetcher_add(swh_storage, sample_data):
+ """Standard metadata_fetcher_add works as before
"""
- sample_tool = sample_data["tool"][0]
+ fetcher = sample_data["fetcher"][0]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ metadata_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not metadata_fetcher
- tools = swh_storage.tool_add([sample_tool])
- assert tools
- tool = tools[0]
- tool.pop("id")
- assert tool == sample_tool
+ swh_storage.metadata_fetcher_add(**fetcher)
- tool = swh_storage.tool_get(sample_tool)
- tool.pop("id")
- assert tool == sample_tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert actual_fetcher == fetcher
-def test_retrying_proxy_storage_tool_add_with_retry(
+def test_retrying_proxy_storage_metadata_fetcher_add_with_retry(
monkeypatch_sleep, swh_storage, sample_data, mocker, fake_hash_collision
):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
- sample_tool = sample_data["tool"][0]
- mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add")
+ fetcher = sample_data["fetcher"][0]
+ mock_memory = mocker.patch(
+ "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add"
+ )
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("tool already inserted"),
+ psycopg2.IntegrityError("metadata_fetcher already inserted"),
# ok then!
- [sample_tool],
+ [fetcher],
]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not actual_fetcher
- tools = swh_storage.tool_add([sample_tool])
- assert tools == [sample_tool]
+ swh_storage.metadata_fetcher_add(**fetcher)
mock_memory.assert_has_calls(
- [call([sample_tool]), call([sample_tool]), call([sample_tool]),]
+ [
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ call(fetcher["name"], fetcher["version"], fetcher["metadata"]),
+ ]
)
-def test_retrying_proxy_swh_storage_tool_add_failure(swh_storage, sample_data, mocker):
+def test_retrying_proxy_swh_storage_metadata_fetcher_add_failure(
+ swh_storage, sample_data, mocker
+):
"""Unfiltered errors are raising without retry
"""
- mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add")
- mock_memory.side_effect = StorageArgumentException("Refuse to add tool always!")
+ mock_memory = mocker.patch(
+ "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add"
+ )
+ mock_memory.side_effect = StorageArgumentException(
+ "Refuse to add metadata_fetcher always!"
+ )
- sample_tool = sample_data["tool"][0]
+ fetcher = sample_data["fetcher"][0]
- tool = swh_storage.tool_get(sample_tool)
- assert not tool
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ fetcher["name"], fetcher["version"]
+ )
+ assert not actual_fetcher
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.tool_add([sample_tool])
+ swh_storage.metadata_fetcher_add(**fetcher)
assert mock_memory.call_count == 1
-def to_provider(provider: Dict) -> Dict:
- return {
- "provider_name": provider["name"],
- "provider_url": provider["url"],
- "provider_type": provider["type"],
- "metadata": provider["metadata"],
- }
-
-
-def test_retrying_proxy_storage_metadata_provider_add(swh_storage, sample_data):
- """Standard metadata_provider_add works as before
+def test_retrying_proxy_storage_metadata_authority_add(swh_storage, sample_data):
+ """Standard metadata_authority_add works as before
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id
+ swh_storage.metadata_authority_add(**authority)
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- assert actual_provider
- actual_provider_id = actual_provider.pop("id")
- assert actual_provider_id == provider_id
- assert actual_provider == provider_get
+ actual_authority = swh_storage.metadata_authority_get(
+ authority["type"], authority["url"]
+ )
+ assert actual_authority == authority
-def test_retrying_proxy_storage_metadata_provider_add_with_retry(
+def test_retrying_proxy_storage_metadata_authority_add_with_retry(
monkeypatch_sleep, swh_storage, sample_data, mocker, fake_hash_collision
):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
- "provider_id",
+ None,
]
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id == "provider_id"
+ swh_storage.metadata_authority_add(**authority)
- provider_arg_names = ("provider_name", "provider_type", "provider_url", "metadata")
- provider_args = [provider_get[key] for key in provider_arg_names]
+ authority_arg_names = ("type", "url", "metadata")
+ authority_args = [authority[key] for key in authority_arg_names]
mock_memory.assert_has_calls(
- [call(*provider_args), call(*provider_args), call(*provider_args),]
+ [call(*authority_args), call(*authority_args), call(*authority_args),]
)
-def test_retrying_proxy_swh_storage_metadata_provider_add_failure(
+def test_retrying_proxy_swh_storage_metadata_authority_add_failure(
swh_storage, sample_data, mocker
):
"""Unfiltered errors are raising without retry
"""
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = StorageArgumentException(
- "Refuse to add provider_id always!"
+ "Refuse to add authority_id always!"
)
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider_id = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider_id
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.metadata_authority_add(**authority)
assert mock_memory.call_count == 1
@@ -580,23 +575,20 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert not origin_metadata
- swh_storage.origin_metadata_add(
- origin["url"],
- ori_meta["discovery_date"],
- provider_id,
- ori_meta["tool"],
- ori_meta["metadata"],
- )
+ swh_storage.origin_metadata_add(**ori_meta)
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert origin_metadata
@@ -607,10 +599,9 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
mock_memory = mocker.patch(
"swh.storage.in_memory.InMemoryStorage.origin_metadata_add"
)
@@ -619,24 +610,40 @@
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
None,
]
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
-
# No exception raised as insertion finally came through
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
mock_memory.assert_has_calls(
[ # 3 calls, as long as error raised
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
]
)
@@ -653,17 +660,10 @@
mock_memory.side_effect = StorageArgumentException("Refuse to add always!")
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
-
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- provider_id = "provider_id"
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
assert mock_memory.call_count == 1
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3174,374 +3174,99 @@
assert expected == ret
- def test_tool_add(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
- assert actual_tool is None # does not exist
-
- # add it
- actual_tools = swh_storage.tool_add([tool])
-
- assert len(actual_tools) == 1
- actual_tool = actual_tools[0]
- assert actual_tool is not None # now it exists
- new_id = actual_tool.pop("id")
- assert actual_tool == tool
-
- actual_tools2 = swh_storage.tool_add([tool])
- actual_tool2 = actual_tools2[0]
- assert actual_tool2 is not None # now it exists
- new_id2 = actual_tool2.pop("id")
-
- assert new_id == new_id2
- assert actual_tool == actual_tool2
-
- def test_tool_add_multiple(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tools = list(swh_storage.tool_add([tool]))
- assert len(actual_tools) == 1
-
- new_tools = [
- tool,
- {"name": "yet-another-tool", "version": "version", "configuration": {},},
- ]
-
- actual_tools = swh_storage.tool_add(new_tools)
- assert len(actual_tools) == 2
-
- # order not guaranteed, so we iterate over results to check
- for tool in actual_tools:
- _id = tool.pop("id")
- assert _id is not None
- assert tool in new_tools
-
- def test_tool_get_missing(self, swh_storage):
- tool = {
- "name": "unknown-tool",
- "version": "3.1.0rc2-31-ga2cbb8c",
- "configuration": {"command_line": "nomossa <filepath>"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
-
- def test_tool_metadata_get_missing_context(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"context": "unknown-context"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
+ def test_metadata_fetcher_add_get(self, swh_storage):
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
+ assert actual_fetcher is None # does not exist
- def test_tool_metadata_get(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"type": "local", "context": "npm"},
- }
- expected_tool = swh_storage.tool_add([tool])[0]
+ swh_storage.metadata_fetcher_add(**data.metadata_fetcher)
- # when
- actual_tool = swh_storage.tool_get(tool)
+ res = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
- # then
- assert expected_tool == actual_tool
+ assert res is not data.metadata_fetcher
+ assert res == data.metadata_fetcher
- def test_metadata_provider_get(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get(6459456445615)
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ def test_metadata_authority_add_get(self, swh_storage):
+ actual_authority = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
+ assert actual_authority is None # does not exist
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- expected_provider = {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- # then
- del actual_provider["id"]
- assert actual_provider, expected_provider
+ swh_storage.metadata_authority_add(**data.metadata_authority)
- def test_metadata_provider_get_by(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ res = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
- actual_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- # then
- assert provider_id, actual_provider["id"]
+ assert res is not data.metadata_authority
+ assert res == data.metadata_authority
def test_origin_metadata_add(self, swh_storage):
- # given
origin = data.origin
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
swh_storage.origin_add([origin])[0]
- tools = swh_storage.tool_add([data.metadata_tool])
- tool = tools[0]
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
+ swh_storage.origin_metadata_add(**data.origin_metadata)
+ swh_storage.origin_metadata_add(**data.origin_metadata2)
- # when adding for the same origin 2 metadatas
- n_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- swh_storage.origin_metadata_add(
- origin["url"],
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin["url"],
- "2015-01-01 23:00:00+00",
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
+ swh_storage.origin_metadata_get(origin["url"], authority)
+
+ assert [data.origin_metadata, data.origin_metadata2] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin["url"], authority),
+ key=lambda x: x["discovery_date"],
+ )
)
- n_actual_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- # then
- assert n_actual_om == n_om + 2
def test_origin_metadata_get(self, swh_storage):
- # given
- origin_url = data.origin["url"]
+ authority = data.metadata_authority
+ fetcher = data.metadata_fetcher
+ authority2 = data.metadata_authority2
+ fetcher2 = data.metadata_fetcher2
+ origin_url1 = data.origin["url"]
origin_url2 = data.origin2["url"]
swh_storage.origin_add([data.origin])
swh_storage.origin_add([data.origin2])
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- tool = swh_storage.tool_add([data.metadata_tool])[0]
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- all_metadatas = list(
+ origin1_metadata1 = data.origin_metadata
+ origin1_metadata2 = data.origin_metadata2
+ origin1_metadata3 = data.origin_metadata3
+ origin2_metadata = {**data.origin_metadata2, "origin_url": origin_url2}
+
+ swh_storage.metadata_authority_add(**authority)
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority2)
+ swh_storage.metadata_fetcher_add(**fetcher2)
+
+ swh_storage.origin_metadata_add(**origin1_metadata1)
+ swh_storage.origin_metadata_add(**origin1_metadata2)
+ swh_storage.origin_metadata_add(**origin1_metadata3)
+ swh_storage.origin_metadata_add(**origin2_metadata)
+
+ assert [origin1_metadata1, origin1_metadata2] == list(
sorted(
- swh_storage.origin_metadata_get_by(origin_url),
+ swh_storage.origin_metadata_get(origin_url1, authority),
key=lambda x: x["discovery_date"],
)
)
- metadatas_for_origin2 = list(swh_storage.origin_metadata_get_by(origin_url2))
- expected_results = [
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2015, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- ]
-
- # then
- assert len(all_metadatas) == 2
- assert len(metadatas_for_origin2) == 1
- assert all_metadatas == expected_results
-
- def test_metadata_provider_add(self, swh_storage):
- provider = {
- "provider_name": "swMATH",
- "provider_type": "registry",
- "provider_url": "http://www.swmath.org/",
- "metadata": {
- "email": "contact@swmath.org",
- "license": "All rights reserved",
- },
- }
- provider["id"] = provider_id = swh_storage.metadata_provider_add(**provider)
- assert provider == swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert provider == swh_storage.metadata_provider_get(provider_id)
-
- def test_metadata_provider_add_idempotent(self, swh_storage):
- provider = {
- "provider_name": "swMATH",
- "provider_type": "registry",
- "provider_url": "http://www.swmath.org/",
- "metadata": {
- "email": "contact@swmath.org",
- "license": "All rights reserved",
- },
- }
- provider_id = swh_storage.metadata_provider_add(**provider)
- expected_provider = {**provider, "id": provider_id}
- assert expected_provider == swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert expected_provider == swh_storage.metadata_provider_get(provider_id)
-
- provider_id2 = swh_storage.metadata_provider_add(**provider)
- assert provider_id2 == provider_id
-
- def test_origin_metadata_get_by_provider_type(self, swh_storage):
- # given
- origin_url = data.origin["url"]
- origin_url2 = data.origin2["url"]
- swh_storage.origin_add([data.origin])
- swh_storage.origin_add([data.origin2])
- provider1_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider1 = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
+ assert [origin1_metadata3] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin_url1, authority2),
+ key=lambda x: x["discovery_date"],
+ )
)
- assert provider1 == swh_storage.metadata_provider_get(provider1_id)
- provider2_id = swh_storage.metadata_provider_add(
- "swMATH",
- "registry",
- "http://www.swmath.org/",
- {"email": "contact@swmath.org", "license": "All rights reserved"},
- )
- provider2 = swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
+ assert [origin2_metadata] == list(
+ swh_storage.origin_metadata_get(origin_url2, authority)
)
- assert provider2 == swh_storage.metadata_provider_get(provider2_id)
-
- # using the only tool now inserted in the data.sql, but for this
- # provider should be a crawler tool (not yet implemented)
- tool = swh_storage.tool_add([data.metadata_tool])[0]
-
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider1["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider2["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- provider_type = "registry"
- m_by_provider = list(
- swh_storage.origin_metadata_get_by(origin_url2, provider_type)
- )
- for item in m_by_provider:
- if "id" in item:
- del item["id"]
- expected_results = [
- {
- "origin_url": origin_url2,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider2["id"],
- "provider_name": "swMATH",
- "provider_type": provider_type,
- "provider_url": "http://www.swmath.org/",
- "tool_id": tool["id"],
- }
- ]
- # then
-
- assert len(m_by_provider) == 1
- assert m_by_provider == expected_results
class TestStorageGeneratedData:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 9:21 AM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224921
Attached To
D2988: Implement extrinsic origin metadata specification.
Event Timeline
Log In to Comment