D2988.id10797.diff

diff --git a/docs/extrinsic-metadata-specification.rst b/docs/extrinsic-metadata-specification.rst
--- a/docs/extrinsic-metadata-specification.rst
+++ b/docs/extrinsic-metadata-specification.rst
@@ -32,11 +32,11 @@
An authority is uniquely defined by these properties:
* its type, representing the kind of authority, which is one of these values:
- * `deposit`, for metadata pushed to Software Heritage at the same time
- as a software artifact
- * `forge`, for metadata pulled from the same source as the one hosting
- the software artifacts (which includes package managers)
- * `registry`, for metadata pulled from a third-party
+ * `deposit`, for metadata pushed to Software Heritage at the same time
+ as a software artifact
+ * `forge`, for metadata pulled from the same source as the one hosting
+ the software artifacts (which includes package managers)
+ * `registry`, for metadata pulled from a third-party
* its URL, which unambiguously identifies an instance of the authority type.
Examples:
@@ -145,6 +145,7 @@
added from this origin, in the format::
{
+ 'origin_url': ...,
'authority': {'type': ..., 'url': ...},
'fetcher': {'name': ..., 'version': ...},
'discovery_date': ...,
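(Editorial sketch, not part of the diff: combining the keys in this hunk with
the `format` and `metadata` keys documented in swh/storage/interface.py further
down, a complete entry returned by `origin_metadata_get` could look like the
following hypothetical Python value.)

    import datetime

    entry = {
        "origin_url": "https://example.org/repo.git",  # illustrative origin
        "authority": {"type": "forge", "url": "https://example.org/"},
        "fetcher": {"name": "swh.loader.git", "version": "0.0.1"},
        "discovery_date": datetime.datetime(
            2020, 5, 4, tzinfo=datetime.timezone.utc
        ),
        "format": "application/json",  # free-form label naming the format
        "metadata": b'{"description": "raw metadata blob"}',
    }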
diff --git a/sql/upgrades/148.sql b/sql/upgrades/148.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/148.sql
@@ -0,0 +1,93 @@
+-- SWH DB schema upgrade
+-- from_version: 147
+-- to_version: 148
+-- description: Implement extrinsic origin-metadata specification
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(148, now(), 'Work In Progress');
+
+-- metadata_fetcher
+
+alter table tool rename to metadata_fetcher;
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+
+alter table metadata_fetcher
+ rename column configuration to metadata;
+
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
+
+alter index tool_pkey
+ rename to metadata_fetcher_pkey;
+create unique index metadata_fetcher_name_version
+ on metadata_fetcher(name, version);
+
+drop index tool_tool_name_tool_version_tool_configuration_idx;
+ -- was an index on (name, version, configuration)
+
+-- metadata_authority
+
+alter table metadata_provider rename to metadata_authority;
+comment on table metadata_authority is 'Metadata authority information';
+
+alter table metadata_authority
+ drop column provider_name;
+alter table metadata_authority
+ rename column provider_type to type;
+alter table metadata_authority
+ rename column provider_url to url;
+
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
+
+alter index metadata_provider_pkey
+ rename to metadata_authority_pkey;
+create unique index metadata_authority_type_url
+ on metadata_authority(type, url);
+
+drop index metadata_provider_provider_name_provider_url_idx;
+ -- was an index on (provider_name, provider_url)
+
+-- origin_metadata
+
+alter table origin_metadata
+ rename column provider_id to authority_id;
+alter table origin_metadata
+ rename column tool_id to fetcher_id;
+alter table origin_metadata
+ add column format text default 'sword-json';
+alter table origin_metadata
+ rename column metadata to metadata_jsonb;
+alter table origin_metadata
+ add column metadata bytea;
+
+create index concurrently origin_metadata_origin_authority_date
+ on origin_metadata(origin_id, authority_id, discovery_date);
+
+drop index origin_metadata_origin_id_provider_id_tool_id_idx;
+ -- was an index on (origin_id, provider_id, tool_id)
+
+-- migrate metadata_jsonb (a jsonb) to metadata (a bytea) with an external process
+
+alter table origin_metadata
+ drop column metadata_jsonb;
+
+comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
+
+
+-- cleanup unused functions
+
+drop function swh_mktemp_tool;
+drop function swh_tool_add;
+
+drop function swh_origin_metadata_get_by_origin;
+drop function swh_origin_metadata_get_by_provider_type;
+drop type origin_metadata_signature;
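(Editorial note: the upgrade script above intentionally delegates the
jsonb-to-bytea copy to an external process, run between the `add column
metadata` and `drop column metadata_jsonb` statements. A minimal sketch of
such a process follows; the psycopg2 DSN and the single-UPDATE approach are
assumptions, not part of this diff.)

    import psycopg2

    # Copy each row's jsonb metadata into the new bytea column as
    # UTF-8-encoded JSON, before metadata_jsonb is dropped.
    with psycopg2.connect("dbname=softwareheritage") as conn:  # hypothetical DSN
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE origin_metadata "
                "SET metadata = convert_to(metadata_jsonb::text, 'UTF8') "
                "WHERE metadata IS NULL"
            )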
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -785,34 +785,100 @@
yield from self._origin_visit_iter_to(start_token)
##########################
- # 'tool' table
+ # 'metadata_authority' table
##########################
- _tool_keys = ["id", "name", "version", "configuration"]
+ _metadata_authority_keys = ["url", "type", "metadata"]
- @_prepared_insert_statement("tool_by_uuid", _tool_keys)
- def tool_by_uuid_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
+ @_prepared_insert_statement("metadata_authority", _metadata_authority_keys)
+ def metadata_authority_add(self, url, type, metadata, *, statement):
+ return self._execute_with_retries(statement, [url, type, metadata])
- @_prepared_insert_statement("tool", _tool_keys)
- def tool_add_one(self, tool: Dict[str, Any], *, statement) -> None:
- self._execute_with_retries(statement, [tool[key] for key in self._tool_keys])
- self._increment_counter("tool", 1)
+ @_prepared_statement("SELECT * from metadata_authority WHERE type = ? AND url = ?")
+ def metadata_authority_get(self, type, url, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [type, url])), None)
+
+ ##########################
+ # 'metadata_fetcher' table
+ ##########################
+
+ _metadata_fetcher_keys = ["name", "version", "metadata"]
+
+ @_prepared_insert_statement("metadata_fetcher", _metadata_fetcher_keys)
+ def metadata_fetcher_add(self, name, version, metadata, *, statement):
+ return self._execute_with_retries(statement, [name, version, metadata])
@_prepared_statement(
- "SELECT id FROM tool " "WHERE name = ? AND version = ? " "AND configuration = ?"
+ "SELECT * from metadata_fetcher WHERE name = ? AND version = ?"
)
- def tool_get_one_uuid(
- self, name: str, version: str, configuration: Dict[str, Any], *, statement
- ) -> Optional[str]:
- rows = list(
- self._execute_with_retries(statement, [name, version, configuration])
+ def metadata_fetcher_get(self, name, version, *, statement) -> Optional[Row]:
+ return next(iter(self._execute_with_retries(statement, [name, version])), None)
+
+ ##########################
+ # 'origin_metadata' table
+ ##########################
+
+ _origin_metadata_keys = [
+ "origin",
+ "authority_type",
+ "authority_url",
+ "discovery_date",
+ "fetcher_name",
+ "fetcher_version",
+ "format",
+ "metadata",
+ ]
+
+ @_prepared_insert_statement("origin_metadata", _origin_metadata_keys)
+ def origin_metadata_add(
+ self,
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ *,
+ statement,
+ ):
+ return self._execute_with_retries(
+ statement,
+ [
+ origin,
+ authority_type,
+ authority_url,
+ discovery_date,
+ fetcher_name,
+ fetcher_version,
+ format,
+ metadata,
+ ],
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND discovery_date>=? "
+ "AND authority_type=?"
+ )
+ def origin_metadata_get_after(
+ self, origin, authority_type, authority_url, after, *, statement
+ ):
+ return self._execute_with_retries(
+ statement, [origin, authority_url, after, authority_type]
+ )
+
+ @_prepared_statement(
+ "SELECT * from origin_metadata "
+ "WHERE origin=? AND authority_url=? AND authority_type=?"
+ )
+ def origin_metadata_get(
+ self, origin, authority_type, authority_url, *, statement
+ ) -> Iterable[Row]:
+ return self._execute_with_retries(
+ statement, [origin, authority_url, authority_type]
)
- if rows:
- assert len(rows) == 1
- return rows[0].id
- else:
- return None
##########################
# Miscellaneous
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -166,21 +166,37 @@
);
-CREATE TABLE IF NOT EXISTS tool_by_uuid (
- id timeuuid PRIMARY KEY,
- name ascii,
- version ascii,
- configuration blob,
+CREATE TABLE IF NOT EXISTS metadata_authority (
+ url text,
+ type ascii,
+ metadata text,
+ PRIMARY KEY ((url), type)
);
-CREATE TABLE IF NOT EXISTS tool (
- id timeuuid,
+CREATE TABLE IF NOT EXISTS metadata_fetcher (
name ascii,
version ascii,
- configuration blob,
- PRIMARY KEY ((name, version, configuration))
-)
+ metadata text,
+ PRIMARY KEY ((name), version)
+);
+
+
+CREATE TABLE IF NOT EXISTS origin_metadata (
+ origin text,
+ authority_type text,
+ authority_url text,
+ discovery_date timestamp,
+ fetcher_name ascii,
+ fetcher_version ascii,
+ format ascii,
+ metadata blob,
+ PRIMARY KEY ((origin), authority_type, authority_url, discovery_date,
+ fetcher_name, fetcher_version)
+ -- for now, authority_type/authority_url could be part of the partition
+ -- key; keeping them out of it allows listing all authorities with
+ -- metadata on an origin, if we ever need to do it.
+);
CREATE TABLE IF NOT EXISTS object_count (
@@ -211,7 +227,7 @@
TABLES = (
"skipped_content content revision revision_parent release "
"directory directory_entry snapshot snapshot_branch "
- "origin_visit origin tool_by_uuid tool object_count"
+ "origin_visit origin metadata_authority metadata_fetcher"
).split()
HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"]
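(Editorial illustration of the access pattern the `origin_metadata` primary
key preserves: with only `origin` in the partition key, restricting the
partition key alone lists every authority holding metadata on that origin.
The cassandra-driver session and keyspace name below are assumptions.)

    from cassandra.cluster import Cluster

    session = Cluster(["127.0.0.1"]).connect("swh")  # hypothetical keyspace

    rows = session.execute(
        "SELECT authority_type, authority_url, discovery_date "
        "FROM origin_metadata WHERE origin = %s",
        ["https://example.org/repo.git"],
    )
    for row in rows:
        print(row.authority_type, row.authority_url, row.discovery_date)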
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -4,11 +4,11 @@
# See top-level LICENSE file for more information
import datetime
+import itertools
import json
import random
import re
from typing import Any, Dict, List, Iterable, Optional, Union
-import uuid
import attr
import dateutil
@@ -931,37 +931,6 @@
else:
return None
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- tool = tool.copy()
- tool_json = tool.copy()
- tool_json["configuration"] = json.dumps(
- tool["configuration"], sort_keys=True
- ).encode()
- id_ = self._cql_runner.tool_get_one_uuid(**tool_json)
- if not id_:
- id_ = uuid.uuid1()
- tool_json["id"] = id_
- self._cql_runner.tool_by_uuid_add_one(tool_json)
- self._cql_runner.tool_add_one(tool_json)
- tool["id"] = id_
- inserted.append(tool)
- return inserted
-
- def tool_get(self, tool):
- id_ = self._cql_runner.tool_get_one_uuid(
- tool["name"],
- tool["version"],
- json.dumps(tool["configuration"], sort_keys=True).encode(),
- )
- if id_:
- tool = tool.copy()
- tool["id"] = id_
- return tool
- else:
- return None
-
def stat_counters(self):
rows = self._cql_runner.stat_counters()
keys = (
@@ -981,33 +950,109 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ if not isinstance(origin_url, str):
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ if not self._cql_runner.metadata_authority_get(**authority):
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ if not self._cql_runner.metadata_fetcher_get(**fetcher):
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+
+ self._cql_runner.origin_metadata_add(
+ origin_url,
+ authority["type"],
+ authority["url"],
+ discovery_date,
+ fetcher["name"],
+ fetcher["version"],
+ format,
+ metadata,
+ )
+
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
+ if not isinstance(origin_url, str):
+ raise TypeError("origin_url must be str, not %r" % (origin_url,))
+
+ if after is None:
+ entries = self._cql_runner.origin_metadata_get(
+ origin_url, authority["type"], authority["url"]
+ )
+ else:
+ entries = self._cql_runner.origin_metadata_get_after(
+ origin_url, authority["type"], authority["url"], after
+ )
- def origin_metadata_get_by(self, origin_url, provider_type=None):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ results = []
+ for entry in entries:
+ discovery_date = entry.discovery_date.replace(tzinfo=datetime.timezone.utc)
+ results.append(
+ {
+ "origin_url": entry.origin,
+ "authority": {
+ "type": entry.authority_type,
+ "url": entry.authority_url,
+ },
+ "fetcher": {
+ "name": entry.fetcher_name,
+ "version": entry.fetcher_version,
+ },
+ "discovery_date": discovery_date,
+ "format": entry.format,
+ "metadata": entry.metadata,
+ }
+ )
+ return results
- def metadata_provider_get(self, provider_id):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_fetcher_add(name, version, json.dumps(metadata))
- def metadata_provider_get_by(self, provider):
- # TODO
- raise NotImplementedError("not yet supported for Cassandra")
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ fetcher = self._cql_runner.metadata_fetcher_get(name, version)
+ if fetcher:
+ return {
+ "name": fetcher.name,
+ "version": fetcher.version,
+ "metadata": json.loads(fetcher.metadata),
+ }
+ else:
+ return None
- def clear_buffers(self, object_types: Optional[Iterable[str]] = None) -> None:
- """Do nothing
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ self._cql_runner.metadata_authority_add(url, type, json.dumps(metadata))
- """
- return None
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ authority = self._cql_runner.metadata_authority_get(type, url)
+ if authority:
+ return {
+ "type": authority.type,
+ "url": authority.url,
+ "metadata": json.loads(authority.metadata),
+ }
+ else:
+ return None
def flush(self, object_types: Optional[Iterable[str]] = None) -> Dict:
return {}
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -3,8 +3,10 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import datetime
import random
import select
+from typing import Any, Dict, Optional
from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
@@ -997,134 +999,150 @@
def release_get_random(self, cur=None):
return self._get_random_row_from_table("release", ["id"], "id", cur)
- def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None):
+ origin_metadata_get_cols = [
+ "origin.url",
+ "discovery_date",
+ "metadata_authority.type",
+ "metadata_authority.url",
+ "metadata_fetcher.name",
+ "metadata_fetcher.version",
+ "format",
+ "metadata",
+ ]
+
+ def origin_metadata_add(
+ self,
+ origin: str,
+ discovery_date: datetime.datetime,
+ authority: int,
+ fetcher: int,
+ format: str,
+ metadata: bytes,
+ cur=None,
+ ) -> None:
""" Add an origin_metadata for the origin at ts with provider, tool and
metadata.
Args:
- origin (int): the origin's id for which the metadata is added
- ts (datetime): time when the metadata was found
- provider (int): the metadata provider identifier
- tool (int): the tool's identifier used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
-
- Returns:
- id (int): the origin_metadata unique id
-
+ origin: the origin's URL for which the metadata is added
+ discovery_date: time when the metadata was found
+ authority: the internal identifier of the metadata authority
+ fetcher: the internal identifier of the metadata fetcher
+ format: the format of the metadata
+ metadata: the metadata retrieved at the time and location
"""
cur = self._cursor(cur)
insert = """INSERT INTO origin_metadata (origin_id, discovery_date,
- provider_id, tool_id, metadata)
- SELECT id, %s, %s, %s, %s FROM origin WHERE url = %s"""
- cur.execute(insert, (ts, provider, tool, jsonize(metadata), origin))
-
- origin_metadata_get_cols = [
- "origin_url",
- "discovery_date",
- "tool_id",
- "metadata",
- "provider_id",
- "provider_name",
- "provider_type",
- "provider_url",
- ]
-
- def origin_metadata_get_by(self, origin_url, provider_type=None, cur=None):
- """Retrieve all origin_metadata entries for one origin_url
+ authority_id, fetcher_id, format, metadata)
+ SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s"""
+ cur.execute(
+ insert,
+ (discovery_date, authority, fetcher, format, jsonize(metadata), origin),
+ )
- """
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: int,
+ after: Optional[datetime.datetime],
+ limit: Optional[int],
+ cur=None,
+ ):
cur = self._cursor(cur)
- if not provider_type:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_origin(
- %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ assert self.origin_metadata_get_cols[-1] == "metadata"
+ query_parts = [
+ f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, "
+ f" origin_metadata.metadata AS metadata "
+ f"FROM origin_metadata "
+ f"INNER JOIN metadata_authority "
+ f" ON (metadata_authority.id=authority_id) "
+ f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
+ f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
+ f"WHERE origin.url=%s AND authority_id=%s"
+ ]
+ args = [origin_url, authority]
- cur.execute(query, (origin_url,))
+ if after:
+ query_parts.append("AND discovery_date >= %s")
+ args.append(after)
- else:
- query = """SELECT %s
- FROM swh_origin_metadata_get_by_provider_type(
- %%s, %%s)""" % (
- ",".join(self.origin_metadata_get_cols)
- )
+ query_parts.append("ORDER BY discovery_date")
- cur.execute(query, (origin_url, provider_type))
+ if limit:
+ query_parts.append("LIMIT %s")
+ args.append(limit)
+ cur.execute(" ".join(query_parts), args)
yield from cur
- tool_cols = ["id", "name", "version", "configuration"]
-
- @stored_procedure("swh_mktemp_tool")
- def mktemp_tool(self, cur=None):
- pass
+ metadata_fetcher_cols = ["name", "version", "metadata"]
- def tool_add_from_temp(self, cur=None):
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any], cur=None
+ ) -> None:
cur = self._cursor(cur)
- cur.execute("SELECT %s from swh_tool_add()" % (",".join(self.tool_cols),))
- yield from cur
+ cur.execute(
+ "INSERT INTO metadata_fetcher (name, version, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (name, version, jsonize(metadata)),
+ )
- def tool_get(self, name, version, configuration, cur=None):
+ def metadata_fetcher_get(self, name: str, version: str, cur=None):
cur = self._cursor(cur)
cur.execute(
- """select %s
- from tool
- where name=%%s and
- version=%%s and
- configuration=%%s"""
- % (",".join(self.tool_cols)),
- (name, version, configuration),
+ f"SELECT {', '.join(self.metadata_fetcher_cols)} "
+ f"FROM metadata_fetcher "
+ f"WHERE name=%s AND version=%s",
+ (name, version),
)
-
return cur.fetchone()
- metadata_provider_cols = [
- "id",
- "provider_name",
- "provider_type",
- "provider_url",
- "metadata",
- ]
-
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, cur=None
- ):
- """Insert a new provider and return the new identifier."""
+ def metadata_fetcher_get_id(
+ self, name: str, version: str, cur=None
+ ) -> Optional[int]:
cur = self._cursor(cur)
- insert = """INSERT INTO metadata_provider (provider_name, provider_type,
- provider_url, metadata) values (%s, %s, %s, %s)
- RETURNING id"""
-
cur.execute(
- insert, (provider_name, provider_type, provider_url, jsonize(metadata))
+ "SELECT id FROM metadata_fetcher WHERE name=%s AND version=%s",
+ (name, version),
)
- return cur.fetchone()[0]
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
+
+ metadata_authority_cols = ["type", "url", "metadata"]
- def metadata_provider_get(self, provider_id, cur=None):
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any], cur=None
+ ) -> None:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where id=%%s """
- % (",".join(self.metadata_provider_cols)),
- (provider_id,),
+ "INSERT INTO metadata_authority (type, url, metadata) "
+ "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
+ (type, url, jsonize(metadata)),
)
+ def metadata_authority_get(self, type: str, url: str, cur=None):
+ cur = self._cursor(cur)
+ cur.execute(
+ f"SELECT {', '.join(self.metadata_authority_cols)} "
+ f"FROM metadata_authority "
+ f"WHERE type=%s AND url=%s",
+ (type, url),
+ )
return cur.fetchone()
- def metadata_provider_get_by(self, provider_name, provider_url, cur=None):
+ def metadata_authority_get_id(self, type: str, url: str, cur=None) -> Optional[int]:
cur = self._cursor(cur)
cur.execute(
- """select %s
- from metadata_provider
- where provider_name=%%s and
- provider_url=%%s"""
- % (",".join(self.metadata_provider_cols)),
- (provider_name, provider_url),
+ "SELECT id FROM metadata_authority WHERE type=%s AND url=%s", (type, url)
)
-
- return cur.fetchone()
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ else:
+ return None
def _get_random_row_from_table(self, table_name, cols, id_col, cur=None):
random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE))
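(Editorial illustration: with both `after` and `limit` supplied, the
`origin_metadata_get` method above assembles the following query, executed
with the parameters (origin_url, authority, after, limit).)

    assembled_query = (
        "SELECT origin.url, discovery_date, metadata_authority.type, "
        "metadata_authority.url, metadata_fetcher.name, "
        "metadata_fetcher.version, format, "
        "origin_metadata.metadata AS metadata "
        "FROM origin_metadata "
        "INNER JOIN metadata_authority ON (metadata_authority.id=authority_id) "
        "INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) "
        "INNER JOIN origin ON (origin.id=origin_metadata.origin_id) "
        "WHERE origin.url=%s AND authority_id=%s "
        "AND discovery_date >= %s "
        "ORDER BY discovery_date "
        "LIMIT %s"
    )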
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -14,7 +14,20 @@
from collections import defaultdict
from datetime import timedelta
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Generic,
+ Hashable,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ TypeVar,
+ Union,
+)
import attr
@@ -45,6 +58,50 @@
BULK_BLOCK_CONTENT_LEN_MAX = 10000
+SortedListItem = TypeVar("SortedListItem")
+SortedListKey = TypeVar("SortedListKey")
+
+
+class SortedList(collections.UserList, Generic[SortedListKey, SortedListItem]):
+ data: List[Tuple[Any, SortedListItem]]
+
+ # https://github.com/python/mypy/issues/708
+ # key: Callable[[SortedListItem], SortedListKey]
+
+ def __init__(
+ self,
+ data: Optional[List[SortedListItem]] = None,
+ key: Optional[Callable[[SortedListItem], SortedListKey]] = None,
+ ):
+ if key is None:
+
+ def key(item):
+ return item
+
+ assert key is not None # for mypy
+ super().__init__(sorted((key(x), x) for x in data or []))
+
+ self.key: Callable[[SortedListItem], SortedListKey] = key
+
+ def add(self, item: SortedListItem):
+ k = self.key(item)
+ bisect.insort(self.data, (k, item))
+
+ def __iter__(self) -> Iterator[SortedListItem]:
+ for (k, item) in self.data:
+ yield item
+
+ def iter_from(self, start_key: SortedListKey) -> Iterator[SortedListItem]:
+ """Returns an iterator over all the elements whose key is greater
+ or equal to `start_key`.
+ (This is an efficient equivalent to:
+ `(x for x in L if key(x) >= start_key)`)
+ """
+ from_index = bisect.bisect_left(self.data, (start_key,))
+ for (k, item) in itertools.islice(self.data, from_index, None):
+ yield item
+
+
class InMemoryStorage:
def __init__(self, journal_writer=None):
@@ -65,13 +122,18 @@
self._origins_by_sha1 = {}
self._origin_visits = {}
self._persons = []
- self._origin_metadata = defaultdict(list)
- self._tools = {}
- self._metadata_providers = {}
- self._objects = defaultdict(list)
- # ideally we would want a skip list for both fast inserts and searches
- self._sorted_sha1s = []
+ # {origin_url: {authority: [metadata]}}
+ self._origin_metadata: Dict[
+ str, Dict[Hashable, SortedList[datetime.datetime, Dict[str, Any]]]
+ ] = defaultdict(
+ lambda: defaultdict(lambda: SortedList(key=lambda x: x["discovery_date"]))
+ ) # noqa
+
+ self._metadata_fetchers: Dict[Hashable, Dict[str, Any]] = {}
+ self._metadata_authorities: Dict[Hashable, Dict[str, Any]] = {}
+ self._objects = defaultdict(list)
+ self._sorted_sha1s = SortedList[bytes, bytes]()
self.objstorage = ObjStorage({"cls": "memory", "args": {}})
@@ -82,7 +144,6 @@
self.journal_writer.content_add(contents)
content_add = 0
- content_add_bytes = 0
if with_data:
summary = self.objstorage.content_add(
c for c in contents if c.status != "absent"
@@ -111,7 +172,7 @@
self._content_indexes[algorithm][hash_].add(key)
self._objects[content.sha1_git].append(("content", content.sha1))
self._contents[key] = content
- bisect.insort(self._sorted_sha1s, content.sha1)
+ self._sorted_sha1s.add(content.sha1)
self._contents[key] = attr.evolve(self._contents[key], data=None)
content_add += 1
@@ -163,11 +224,9 @@
def content_get_range(self, start, end, limit=1000):
if limit is None:
raise StorageArgumentException("limit should not be None")
- from_index = bisect.bisect_left(self._sorted_sha1s, start)
- sha1s = itertools.islice(self._sorted_sha1s, from_index, None)
sha1s = (
(sha1, content_key)
- for sha1 in sha1s
+ for sha1 in self._sorted_sha1s.iter_from(start)
for content_key in self._content_indexes["sha1"][sha1]
)
matched = []
@@ -906,71 +965,104 @@
def refresh_stat_counters(self):
pass
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
if not isinstance(origin_url, str):
- raise TypeError("origin_id must be str, not %r" % (origin_url,))
-
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
+ raise StorageArgumentException(
+ "origin_id must be str, not %r" % (origin_url,)
+ )
+ authority_key = self._metadata_authority_key(authority)
+ if authority_key not in self._metadata_authorities:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ fetcher_key = self._metadata_fetcher_key(fetcher)
+ if fetcher_key not in self._metadata_fetchers:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
origin_metadata = {
"origin_url": origin_url,
- "discovery_date": ts,
- "tool_id": tool,
+ "discovery_date": discovery_date,
+ "authority": authority_key,
+ "fetcher": fetcher_key,
+ "format": format,
"metadata": metadata,
- "provider_id": provider,
}
- self._origin_metadata[origin_url].append(origin_metadata)
+ self._origin_metadata[origin_url][authority_key].add(origin_metadata)
return None
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
if not isinstance(origin_url, str):
raise TypeError("origin_url must be str, not %r" % (origin_url,))
- metadata = []
- for item in self._origin_metadata[origin_url]:
- item = copy.deepcopy(item)
- provider = self.metadata_provider_get(item["provider_id"])
- for attr_name in ("name", "type", "url"):
- item["provider_" + attr_name] = provider["provider_" + attr_name]
- metadata.append(item)
- return metadata
-
- def tool_add(self, tools):
- inserted = []
- for tool in tools:
- key = self._tool_key(tool)
- assert "id" not in tool
- record = copy.deepcopy(tool)
- record["id"] = key # TODO: remove this
- if key not in self._tools:
- self._tools[key] = record
- inserted.append(copy.deepcopy(self._tools[key]))
-
- return inserted
-
- def tool_get(self, tool):
- return self._tools.get(self._tool_key(tool))
-
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- provider = {
- "provider_name": provider_name,
- "provider_type": provider_type,
- "provider_url": provider_url,
+
+ authority_key = self._metadata_authority_key(authority)
+
+ if after is None:
+ entries = iter(self._origin_metadata[origin_url][authority_key])
+ else:
+ entries = self._origin_metadata[origin_url][authority_key].iter_from(after)
+ if limit:
+ entries = itertools.islice(entries, 0, limit)
+
+ results = []
+ for entry in entries:
+ authority = self._metadata_authorities[entry["authority"]]
+ fetcher = self._metadata_fetchers[entry["fetcher"]]
+ results.append(
+ {
+ **entry,
+ "authority": {"type": authority["type"], "url": authority["url"],},
+ "fetcher": {
+ "name": fetcher["name"],
+ "version": fetcher["version"],
+ },
+ }
+ )
+ return results
+
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ fetcher = {
+ "name": name,
+ "version": version,
"metadata": metadata,
}
- key = self._metadata_provider_key(provider)
- provider["id"] = key
- self._metadata_providers[key] = provider
- return key
+ key = self._metadata_fetcher_key(fetcher)
+ if key not in self._metadata_fetchers:
+ self._metadata_fetchers[key] = fetcher
- def metadata_provider_get(self, provider_id):
- return self._metadata_providers.get(provider_id)
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_fetchers.get(
+ self._metadata_fetcher_key({"name": name, "version": version})
+ )
- def metadata_provider_get_by(self, provider):
- key = self._metadata_provider_key(provider)
- return self._metadata_providers.get(key)
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ authority = {
+ "type": type,
+ "url": url,
+ "metadata": metadata,
+ }
+ key = self._metadata_authority_key(authority)
+ self._metadata_authorities[key] = authority
+
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ return self._metadata_authorities.get(
+ self._metadata_authority_key({"type": type, "url": url})
+ )
def _get_origin_url(self, origin):
if isinstance(origin, str):
@@ -997,16 +1089,12 @@
return tuple((key, content.get(key)) for key in sorted(DEFAULT_ALGORITHMS))
@staticmethod
- def _tool_key(tool):
- return "%r %r %r" % (
- tool["name"],
- tool["version"],
- tuple(sorted(tool["configuration"].items())),
- )
+ def _metadata_fetcher_key(fetcher: Dict) -> Hashable:
+ return (fetcher["name"], fetcher["version"])
@staticmethod
- def _metadata_provider_key(provider):
- return "%r %r" % (provider["provider_name"], provider["provider_url"])
+ def _metadata_authority_key(authority: Dict) -> Hashable:
+ return (authority["type"], authority["url"])
def diff_directories(self, from_dir, to_dir, track_renaming=False):
raise NotImplementedError("InMemoryStorage.diff_directories")
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -1132,120 +1132,118 @@
...
@remote_api_endpoint("origin/metadata/add")
- def origin_metadata_add(self, origin_url, ts, provider, tool, metadata):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
+ def origin_metadata_add(
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ ) -> None:
+ """Add an origin_metadata for the origin at discovery_date,
+ obtained using the `fetcher` from the `authority`.
+
+ The authority and fetcher must be known to the storage before
+ using this endpoint.
Args:
- origin_url (str): the origin url for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
+ discovery_date: when the metadata was fetched.
+ authority: a dict containing keys `type` and `url`.
+ fetcher: a dict containing keys `name` and `version`.
+ format: text field indicating the format of the content of the
+ metadata field.
+ metadata: blob of raw metadata
"""
...
@remote_api_endpoint("origin/metadata/get")
- def origin_metadata_get_by(self, origin_url, provider_type=None):
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ ) -> List[Dict[str, Any]]:
"""Retrieve list of all origin_metadata entries for the origin_id
Args:
- origin_url (str): the origin's URL
- provider_type (str): (optional) type of provider
+ origin_url: the origin's URL
+ authority: a dict containing keys `type` and `url`.
+ after: minimum discovery_date for a result to be returned
+ limit: maximum number of results to be returned
Returns:
- list of dicts: the origin_metadata dictionary with the keys:
+ list of dicts in the format:
+
+ .. code-block:: python
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
+ {
+ 'authority': {'type': ..., 'url': ...},
+ 'fetcher': {'name': ..., 'version': ...},
+ 'discovery_date': ...,
+ 'format': '...',
+ 'metadata': b'...'
+ }
"""
...
- @remote_api_endpoint("tool/add")
- def tool_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools (iterable of :class:`dict`): Tool information to add to
- storage. Each tool is a :class:`dict` with the following keys:
+ @remote_api_endpoint("fetcher/add")
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a new metadata fetcher to the storage.
- - name (:class:`str`): name of the tool
- - version (:class:`str`): version of the tool
- - configuration (:class:`dict`): configuration of the tool,
- must be json-encodable
+ `name` and `version` together are a unique identifier of this
+ fetcher; and `metadata` is an arbitrary dict of JSONable data
+ with information about this fetcher.
- Returns:
- :class:`dict`: All the tools inserted in storage
- (including the internal ``id``). The order of the list is not
- guaranteed to match the order of the initial list.
+ Args:
+ name: the name of the fetcher
+ version: version of the fetcher
+ metadata: extra information about this fetcher, as a JSON-encodable dict
"""
...
- @remote_api_endpoint("tool/data")
- def tool_get(self, tool):
- """Retrieve tool information.
+ @remote_api_endpoint("fetcher/get")
+ def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about a fetcher
Args:
- tool (dict): Tool information we want to retrieve from storage.
- The dicts have the same keys as those used in :func:`tool_add`.
+ name: the name of the fetcher
+ version: version of the fetcher
Returns:
- dict: The full tool information if it exists (``id`` included),
- None otherwise.
+ dictionary with keys `name`, `version`, and `metadata`; or None
+ if the fetcher is not known
"""
...
- @remote_api_endpoint("provider/add")
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata
- ):
- """Add a metadata provider.
+ @remote_api_endpoint("authority/add")
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ """Add a metadata authority
Args:
- provider_name (str): Its name
- provider_type (str): Its type (eg. `'deposit-client'`)
- provider_url (str): Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
metadata: JSON-encodable object
-
- Returns:
- int: an identifier of the provider
- """
- ...
-
- @remote_api_endpoint("provider/get")
- def metadata_provider_get(self, provider_id):
- """Get a metadata provider
-
- Args:
- provider_id: Its identifier, as given by `metadata_provider_add`.
-
- Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
"""
...
- @remote_api_endpoint("provider/getby")
- def metadata_provider_get_by(self, provider):
- """Get a metadata provider
+ @remote_api_endpoint("authority/get")
+ def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]:
+ """Retrieve information about an authority
Args:
- provider (dict): A dictionary with keys:
- * provider_name: Its name
- * provider_url: Its URL
+ type: one of "deposit", "forge", or "registry"
+ url: unique URI identifying the authority
Returns:
- dict: same as `metadata_provider_add`;
- or None if it does not exist.
+ dictionary with keys `type`, `url`, and `metadata`; or None
+ if the authority is not known
"""
...
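(Editorial sketch tying the new endpoints together, run against the in-memory
backend from this diff; URLs and values are illustrative.)

    import datetime

    from swh.storage.in_memory import InMemoryStorage

    storage = InMemoryStorage()

    # The fetcher and authority must be registered first; otherwise
    # origin_metadata_add raises StorageArgumentException.
    storage.metadata_fetcher_add(
        name="swh-deposit", version="0.0.1", metadata={"sword_version": "2"}
    )
    storage.metadata_authority_add(
        type="deposit", url="https://hal.example.org/", metadata={}
    )

    storage.origin_metadata_add(
        origin_url="https://example.org/repo.git",
        discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
        authority={"type": "deposit", "url": "https://hal.example.org/"},
        fetcher={"name": "swh-deposit", "version": "0.0.1"},
        format="sword-v2-atom",  # free-form format label
        metadata=b"<entry>...</entry>",
    )

    entries = storage.origin_metadata_get(
        "https://example.org/repo.git",
        authority={"type": "deposit", "url": "https://hal.example.org/"},
    )
    assert entries[0]["fetcher"] == {"name": "swh-deposit", "version": "0.0.1"}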
diff --git a/swh/storage/retry.py b/swh/storage/retry.py
--- a/swh/storage/retry.py
+++ b/swh/storage/retry.py
@@ -7,7 +7,7 @@
import traceback
from datetime import datetime
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, Optional, Union
from tenacity import (
retry,
@@ -127,29 +127,29 @@
)
@swh_retry
- def tool_add(self, tools: Iterable[Dict]) -> List[Dict]:
- tools = list(tools)
- return self.storage.tool_add(tools)
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_fetcher_add(name, version, metadata)
@swh_retry
- def metadata_provider_add(
- self, provider_name: str, provider_type: str, provider_url: str, metadata: Dict
- ) -> Union[str, int]:
- return self.storage.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata
- )
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any]
+ ) -> None:
+ return self.storage.metadata_authority_add(type, url, metadata)
@swh_retry
def origin_metadata_add(
self,
origin_url: str,
- ts: Union[str, datetime],
- provider_id: int,
- tool_id: int,
- metadata: Dict,
+ discovery_date: datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
) -> None:
return self.storage.origin_metadata_add(
- origin_url, ts, provider_id, tool_id, metadata
+ origin_url, discovery_date, authority, fetcher, format, metadata
)
@swh_retry
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(146, now(), 'Work In Progress');
+ values(148, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -377,35 +377,34 @@
comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp';
-- Tools
-create table tool
+create table metadata_fetcher
(
- id serial not null,
- name text not null,
- version text not null,
- configuration jsonb
+ id serial not null,
+ name text not null,
+ version text not null,
+ metadata jsonb not null
);
-comment on table tool is 'Tool information';
-comment on column tool.id is 'Tool identifier';
-comment on column tool.version is 'Tool name';
-comment on column tool.version is 'Tool version';
-comment on column tool.configuration is 'Tool configuration: command line, flags, etc...';
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
-create table metadata_provider
+create table metadata_authority
(
- id serial not null,
- provider_name text not null,
- provider_type text not null,
- provider_url text,
- metadata jsonb
+ id serial not null,
+ type text not null,
+ url text not null,
+ metadata jsonb not null
);
-comment on table metadata_provider is 'Metadata provider information';
-comment on column metadata_provider.id is 'Provider''s identifier';
-comment on column metadata_provider.provider_name is 'Provider''s name';
-comment on column metadata_provider.provider_url is 'Provider''s url';
-comment on column metadata_provider.metadata is 'Other metadata about provider';
+comment on table metadata_authority is 'Metadata authority information';
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s uri';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
-- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin
@@ -415,18 +414,20 @@
id bigserial not null, -- PK internal object identifier
origin_id bigint not null, -- references origin(id)
discovery_date timestamptz not null, -- when it was extracted
- provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github'
- tool_id bigint not null,
- metadata jsonb not null
+ authority_id bigint not null,
+ fetcher_id bigint not null,
+ format text not null,
+ metadata bytea not null
);
comment on table origin_metadata is 'keeps all metadata found concerning an origin';
comment on column origin_metadata.id is 'the origin_metadata object''s id';
comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found';
comment on column origin_metadata.discovery_date is 'the date of retrieval';
-comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.';
-comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.';
-comment on column origin_metadata.metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column origin_metadata.metadata is 'original metadata in opaque format';
-- Keep a cache of object counts
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -101,17 +101,6 @@
) on commit delete rows;
$$;
--- create a temporary table for the tools
-create or replace function swh_mktemp_tool()
- returns void
- language sql
-as $$
- create temporary table if not exists tmp_tool (
- like tool including defaults
- ) on commit delete rows;
- alter table tmp_tool drop column if exists id;
-$$;
-
-- a content signature is a set of cryptographic checksums that we use to
-- uniquely identify content, for the purpose of verifying if we already have
-- some content or not during content injection
@@ -920,76 +909,6 @@
$$;
--- end revision_metadata functions
--- origin_metadata functions
-create type origin_metadata_signature as (
- id bigint,
- origin_url text,
- discovery_date timestamptz,
- tool_id bigint,
- metadata jsonb,
- provider_id integer,
- provider_name text,
- provider_type text,
- provider_url text
-);
-create or replace function swh_origin_metadata_get_by_origin(
- origin text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin
- order by discovery_date desc;
-$$;
-
-create or replace function swh_origin_metadata_get_by_provider_type(
- origin_url text,
- provider_type text)
- returns setof origin_metadata_signature
- language sql
- stable
-as $$
- select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
- mp.id as provider_id, provider_name, provider_type, provider_url
- from origin_metadata as om
- inner join metadata_provider mp on om.provider_id = mp.id
- inner join origin o on om.origin_id = o.id
- where o.url = origin_url
- and mp.provider_type = provider_type
- order by discovery_date desc;
-$$;
--- end origin_metadata functions
-
--- add tmp_tool entries to tool,
--- skipping duplicates if any.
---
--- operates in bulk: 0. create temporary tmp_tool, 1. COPY to
--- it, 2. call this function to insert and filtering out duplicates
-create or replace function swh_tool_add()
- returns setof tool
- language plpgsql
-as $$
-begin
- insert into tool(name, version, configuration)
- select name, version, configuration from tmp_tool tmp
- on conflict(name, version, configuration) do nothing;
-
- return query
- select id, name, version, configuration
- from tmp_tool join tool
- using(name, version, configuration);
-
- return;
-end
-$$;
-
-
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql
--- a/swh/storage/sql/60-swh-indexes.sql
+++ b/swh/storage/sql/60-swh-indexes.sql
@@ -144,32 +144,32 @@
alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid;
alter table release validate constraint release_author_date_check;
--- tool
-create unique index tool_pkey on tool(id);
-alter table tool add primary key using index tool_pkey;
+-- metadata_fetcher
+create unique index metadata_fetcher_pkey on metadata_fetcher(id);
+alter table metadata_fetcher add primary key using index metadata_fetcher_pkey;
-create unique index on tool(name, version, configuration);
+create unique index metadata_fetcher_name_version on metadata_fetcher(name, version);
--- metadata_provider
-create unique index concurrently metadata_provider_pkey on metadata_provider(id);
-alter table metadata_provider add primary key using index metadata_provider_pkey;
+-- metadata_authority
+create unique index concurrently metadata_authority_pkey on metadata_authority(id);
+alter table metadata_authority add primary key using index metadata_authority_pkey;
-create index concurrently on metadata_provider(provider_name, provider_url);
+create unique index metadata_authority_type_url on metadata_authority(type, url);
-- origin_metadata
create unique index concurrently origin_metadata_pkey on origin_metadata(id);
alter table origin_metadata add primary key using index origin_metadata_pkey;
-create index concurrently on origin_metadata(origin_id, provider_id, tool_id);
+create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date);
alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid;
alter table origin_metadata validate constraint origin_metadata_origin_fkey;
-alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_provider_fkey;
+alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_authority_fkey;
-alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references tool(id) not valid;
-alter table origin_metadata validate constraint origin_metadata_tool_fkey;
+alter table origin_metadata add constraint origin_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+alter table origin_metadata validate constraint origin_metadata_fetcher_fkey;
-- object_counts
create unique index concurrently object_counts_pkey on object_counts(object_type);
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -6,7 +6,6 @@
import contextlib
import datetime
import itertools
-import json
from collections import defaultdict
from contextlib import contextmanager
@@ -1138,72 +1137,101 @@
@timed
@db_transaction()
def origin_metadata_add(
- self, origin_url, ts, provider, tool, metadata, db=None, cur=None
- ):
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
-
- db.origin_metadata_add(origin_url, ts, provider, tool, metadata, cur)
+ self,
+ origin_url: str,
+ discovery_date: datetime.datetime,
+ authority: Dict[str, Any],
+ fetcher: Dict[str, Any],
+ format: str,
+ metadata: bytes,
+ db=None,
+ cur=None,
+ ) -> None:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ fetcher_id = db.metadata_fetcher_get_id(
+ fetcher["name"], fetcher["version"], cur
+ )
+ if not authority_id:
+ raise StorageArgumentException(f"Unknown authority {authority}")
+ if not fetcher_id:
+ raise StorageArgumentException(f"Unknown fetcher {fetcher}")
+ db.origin_metadata_add(
+ origin_url, discovery_date, authority_id, fetcher_id, format, metadata, cur
+ )
send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add")
@timed
- @db_transaction_generator(statement_timeout=500)
- def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, cur=None):
- for line in db.origin_metadata_get_by(origin_url, provider_type, cur):
- yield dict(zip(db.origin_metadata_get_cols, line))
+ @db_transaction(statement_timeout=500)
+ def origin_metadata_get(
+ self,
+ origin_url: str,
+ authority: Dict[str, str],
+ after: Optional[datetime.datetime] = None,
+ limit: Optional[int] = None,
+ db=None,
+ cur=None,
+ ) -> List[Dict[str, Any]]:
+ authority_id = db.metadata_authority_get_id(
+ authority["type"], authority["url"], cur
+ )
+ if not authority_id:
+ return []
+ results = []
+ for line in db.origin_metadata_get(origin_url, authority_id, after, limit, cur):
+ row = dict(zip(db.origin_metadata_get_cols, line))
+ results.append(
+ {
+ "origin_url": row.pop("origin.url"),
+ "authority": {
+ "type": row.pop("metadata_authority.type"),
+ "url": row.pop("metadata_authority.url"),
+ },
+ "fetcher": {
+ "name": row.pop("metadata_fetcher.name"),
+ "version": row.pop("metadata_fetcher.version"),
+ },
+ **row,
+ }
+ )
+ return results
@timed
@db_transaction()
- def tool_add(self, tools, db=None, cur=None):
- db.mktemp_tool(cur)
- with convert_validation_exceptions():
- db.copy_to(tools, "tmp_tool", ["name", "version", "configuration"], cur)
- tools = db.tool_add_from_temp(cur)
-
- results = [dict(zip(db.tool_cols, line)) for line in tools]
- send_metric("tool:add", count=len(results), method_name="tool_add")
- return results
+ def metadata_fetcher_add(
+ self, name: str, version: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_fetcher_add(name, version, metadata, cur)
+ send_metric("metadata_fetcher:add", count=1, method_name="metadata_fetcher")
@timed
@db_transaction(statement_timeout=500)
- def tool_get(self, tool, db=None, cur=None):
- tool_conf = tool["configuration"]
- if isinstance(tool_conf, dict):
- tool_conf = json.dumps(tool_conf)
-
- idx = db.tool_get(tool["name"], tool["version"], tool_conf)
- if not idx:
+ def metadata_fetcher_get(
+ self, name: str, version: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_fetcher_get(name, version)
+ if not row:
return None
- return dict(zip(db.tool_cols, idx))
-
- @timed
- @db_transaction()
- def metadata_provider_add(
- self, provider_name, provider_type, provider_url, metadata, db=None, cur=None
- ):
- result = db.metadata_provider_add(
- provider_name, provider_type, provider_url, metadata, cur
- )
- send_metric("metadata_provider:add", count=1, method_name="metadata_provider")
- return result
+ return dict(zip(db.metadata_fetcher_cols, row))
@timed
@db_transaction()
- def metadata_provider_get(self, provider_id, db=None, cur=None):
- result = db.metadata_provider_get(provider_id)
- if not result:
- return None
- return dict(zip(db.metadata_provider_cols, result))
+ def metadata_authority_add(
+ self, type: str, url: str, metadata: Dict[str, Any], db=None, cur=None
+ ) -> None:
+ db.metadata_authority_add(type, url, metadata, cur)
+ send_metric("metadata_authority:add", count=1, method_name="metadata_authority")
@timed
@db_transaction()
- def metadata_provider_get_by(self, provider, db=None, cur=None):
- result = db.metadata_provider_get_by(
- provider["provider_name"], provider["provider_url"]
- )
- if not result:
+ def metadata_authority_get(
+ self, type: str, url: str, db=None, cur=None
+ ) -> Optional[Dict[str, Any]]:
+ row = db.metadata_authority_get(type, url)
+ if not row:
return None
- return dict(zip(db.metadata_provider_cols, result))
+ return dict(zip(db.metadata_authority_cols, row))
@timed
def diff_directories(self, from_dir, to_dir, track_renaming=False):
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -248,7 +248,7 @@
"release": [data.release, data.release2, data.release3],
"snapshot": [data.snapshot],
"origin": [data.origin, data.origin2],
- "tool": [data.metadata_tool],
- "provider": [data.provider],
+ "fetcher": [data.metadata_fetcher],
+ "authority": [data.metadata_authority],
"origin_metadata": [data.origin_metadata, data.origin_metadata2],
}
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -326,17 +326,16 @@
origins = (origin, origin2)
-provider = {
- "name": "hal",
+metadata_authority = {
"type": "deposit-client",
"url": "http:///hal/inria",
"metadata": {"location": "France"},
}
-metadata_tool = {
+metadata_fetcher = {
"name": "swh-deposit",
"version": "0.0.1",
- "configuration": {"sword_version": "2"},
+ "metadata": {"sword_version": "2"},
}
date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
@@ -456,22 +455,36 @@
}
origin_metadata = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "json",
+ "metadata": b'{"foo": "bar"}',
}
origin_metadata2 = {
- "origin": origin,
+ "origin_url": origin["url"],
"discovery_date": datetime.datetime(
2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc
),
- "provider": provider,
- "tool": "swh-deposit",
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
+ "authority": {
+ "type": metadata_authority["type"],
+ "url": metadata_authority["url"],
+ },
+ "fetcher": {
+ "name": metadata_fetcher["name"],
+ "version": metadata_fetcher["version"],
+ },
+ "format": "yaml",
+ "metadata": b"foo: bar",
}
person = {
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -335,30 +335,6 @@
def test_person_get(self):
pass
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_metadata_provider_get_by(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_add(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get(self):
- pass
-
- @pytest.mark.skip("Not yet implemented")
- def test_origin_metadata_get_by_provider_type(self):
- pass
-
@pytest.mark.skip("Not supported by Cassandra")
def test_origin_count(self):
pass
diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py
--- a/swh/storage/tests/test_in_memory.py
+++ b/swh/storage/tests/test_in_memory.py
@@ -5,6 +5,7 @@
import pytest
+from swh.storage.in_memory import SortedList
from swh.storage.tests.test_storage import TestStorage, TestStorageGeneratedData # noqa
@@ -19,3 +20,51 @@
"cls": "memory",
"journal_writer": {"cls": "memory",},
}
+
+
+parametrize = pytest.mark.parametrize(
+ "items",
+ [
+ [1, 2, 3, 4, 5, 6, 10, 100],
+ [10, 100, 6, 5, 4, 3, 2, 1],
+ [10, 4, 5, 6, 1, 2, 3, 100],
+ ],
+)
+
+
+@parametrize
+def test_sorted_list_iter(items):
+ list1 = SortedList()
+ for item in items:
+ list1.add(item)
+ assert list(list1) == sorted(items)
+
+ list2 = SortedList(items)
+ assert list(list2) == sorted(items)
+
+
+@parametrize
+def test_sorted_list_iter__key(items):
+ list1 = SortedList(key=lambda item: -item)
+ for item in items:
+ list1.add(item)
+ assert list(list1) == list(reversed(sorted(items)))
+
+ list2 = SortedList(items, key=lambda item: -item)
+ assert list(list2) == list(reversed(sorted(items)))
+
+
+@parametrize
+def test_sorted_list_iter_from(items):
+ list_ = SortedList(items)
+ for split in items:
+ expected = sorted(item for item in items if item >= split)
+ assert list(list_.iter_from(split)) == expected, split
+
+
+@parametrize
+def test_sorted_list_iter_from__key(items):
+ list_ = SortedList(items, key=lambda item: -item)
+ for split in items:
+ expected = reversed(sorted(item for item in items if item <= split))
+ assert list(list_.iter_from(-split)) == list(expected), split
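
Note: the tests above pin down the SortedList contract: add() keeps elements
ordered, iteration respects an optional key function, and iter_from(k) yields
the items whose key is >= k. A minimal sketch consistent with those tests
(the actual swh.storage.in_memory implementation may differ):

    import bisect
    from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple

    class SortedList:
        def __init__(self, data: Iterable = (), key: Optional[Callable] = None):
            self.key = key if key is not None else (lambda item: item)
            self.data: List[Tuple[Any, Any]] = []  # sorted (key(item), item) pairs
            for item in data:
                self.add(item)

        def add(self, item) -> None:
            # Insertion keeps self.data sorted on the computed key.
            bisect.insort(self.data, (self.key(item), item))

        def __iter__(self) -> Iterator:
            return (item for (_, item) in self.data)

        def iter_from(self, start_key) -> Iterator:
            # First index whose key is >= start_key; a 1-tuple sorts before
            # any 2-tuple that shares the same first element.
            idx = bisect.bisect_left(self.data, (start_key,))
            return (item for (_, item) in self.data[idx:])
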
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -3,7 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Dict
from unittest.mock import call
import psycopg2
@@ -497,94 +496,78 @@
assert mock_memory.call_count == 1
-def to_provider(provider: Dict) -> Dict:
- return {
- "provider_name": provider["name"],
- "provider_url": provider["url"],
- "provider_type": provider["type"],
- "metadata": provider["metadata"],
- }
-
-
-def test_retrying_proxy_storage_metadata_provider_add(swh_storage, sample_data):
- """Standard metadata_provider_add works as before
+def test_retrying_proxy_storage_metadata_authority_add(swh_storage, sample_data):
+ """Standard metadata_authority_add works as before
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id
+ swh_storage.metadata_authority_add(**authority)
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- assert actual_provider
- actual_provider_id = actual_provider.pop("id")
- assert actual_provider_id == provider_id
- assert actual_provider == provider_get
+ actual_authority = swh_storage.metadata_authority_get(
+ authority["type"], authority["url"]
+ )
+ assert actual_authority == authority
-def test_retrying_proxy_storage_metadata_provider_add_with_retry(
+def test_retrying_proxy_storage_metadata_authority_add_with_retry(
swh_storage, sample_data, mocker, fake_hash_collision
):
"""Multiple retries for hash collision and psycopg2 error but finally ok
"""
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
- "provider_id",
+ None,
]
+ from swh.storage.retry import RetryingProxyStorage # noqa
+
mock_sleep = mocker.patch(
- "swh.storage.retry.RetryingProxyStorage" ".metadata_provider_add.retry.sleep"
+ "swh.storage.retry.RetryingProxyStorage" ".metadata_authority_add.retry.sleep"
)
- provider = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
- assert provider_id == "provider_id"
+ swh_storage.metadata_authority_add(**authority)
- provider_arg_names = ("provider_name", "provider_type", "provider_url", "metadata")
- provider_args = [provider_get[key] for key in provider_arg_names]
+ authority_arg_names = ("type", "url", "metadata")
+ authority_args = [authority[key] for key in authority_arg_names]
mock_memory.assert_has_calls(
- [call(*provider_args), call(*provider_args), call(*provider_args),]
+ [call(*authority_args), call(*authority_args), call(*authority_args),]
)
assert mock_sleep.call_count == 2
-def test_retrying_proxy_swh_storage_metadata_provider_add_failure(
+def test_retrying_proxy_swh_storage_metadata_authority_add_failure(
swh_storage, sample_data, mocker
):
"""Unfiltered errors are raising without retry
"""
mock_memory = mocker.patch(
- "swh.storage.in_memory.InMemoryStorage.metadata_provider_add"
+ "swh.storage.in_memory.InMemoryStorage.metadata_authority_add"
)
mock_memory.side_effect = StorageArgumentException(
- "Refuse to add provider_id always!"
+ "Refuse to add authority_id always!"
)
- provider = sample_data["provider"][0]
- provider_get = to_provider(provider)
+ authority = sample_data["authority"][0]
- provider_id = swh_storage.metadata_provider_get_by(provider_get)
- assert not provider_id
+ assert not swh_storage.metadata_authority_get(authority["type"], authority["url"])
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.metadata_authority_add(**authority)
assert mock_memory.call_count == 1
@@ -594,23 +577,20 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert not origin_metadata
- swh_storage.origin_metadata_add(
- origin["url"],
- ori_meta["discovery_date"],
- provider_id,
- ori_meta["tool"],
- ori_meta["metadata"],
- )
+ swh_storage.origin_metadata_add(**ori_meta)
- origin_metadata = swh_storage.origin_metadata_get_by(origin["url"])
+ origin_metadata = swh_storage.origin_metadata_get(
+ ori_meta["origin_url"], ori_meta["authority"]
+ )
assert origin_metadata
@@ -621,10 +601,9 @@
"""
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
- provider_get = to_provider(ori_meta["provider"])
- provider_id = swh_storage.metadata_provider_add(**provider_get)
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
+ swh_storage.metadata_authority_add(**sample_data["authority"][0])
+ swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0])
mock_memory = mocker.patch(
"swh.storage.in_memory.InMemoryStorage.origin_metadata_add"
)
@@ -633,7 +612,7 @@
# first try goes ko
fake_hash_collision,
# second try goes ko
- psycopg2.IntegrityError("provider_id already inserted"),
+ psycopg2.IntegrityError("foo bar"),
# ok then!
None,
]
@@ -642,19 +621,35 @@
"swh.storage.retry.RetryingProxyStorage" ".origin_metadata_add.retry.sleep"
)
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
-
# No exception raised as insertion finally came through
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
mock_memory.assert_has_calls(
[ # 3 calls, as long as error raised
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
- call(url, ts, provider_id, tool_id, metadata),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
+ call(
+ ori_meta["origin_url"],
+ ori_meta["discovery_date"],
+ ori_meta["authority"],
+ ori_meta["fetcher"],
+ ori_meta["format"],
+ ori_meta["metadata"],
+ ),
]
)
assert mock_sleep.call_count == 2
@@ -672,17 +667,10 @@
mock_memory.side_effect = StorageArgumentException("Refuse to add always!")
ori_meta = sample_data["origin_metadata"][0]
- origin = ori_meta["origin"]
- swh_storage.origin_add_one(origin)
-
- url = origin["url"]
- ts = ori_meta["discovery_date"]
- provider_id = "provider_id"
- tool_id = ori_meta["tool"]
- metadata = ori_meta["metadata"]
+ swh_storage.origin_add_one({"url": ori_meta["origin_url"]})
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata)
+ swh_storage.origin_metadata_add(**ori_meta)
assert mock_memory.call_count == 1
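
Note: these tests establish the retry behavior: transient errors (hash
collisions, psycopg2.IntegrityError) are retried with a sleep in between,
while StorageArgumentException propagates immediately. A sketch of that
pattern using tenacity, which the ".retry.sleep" patch target suggests is in
play (the real swh.storage.retry predicates and wait policy may differ):

    import psycopg2
    from tenacity import retry, retry_if_exception, stop_after_attempt, wait_random_exponential

    def is_transient(error: Exception) -> bool:
        # Hedged predicate: the real proxy also retries hash collisions.
        return isinstance(error, psycopg2.IntegrityError)

    swh_retry = retry(
        retry=retry_if_exception(is_transient),
        wait=wait_random_exponential(multiplier=1, max=10),
        stop=stop_after_attempt(3),
    )

    class RetryingProxySketch:
        def __init__(self, storage):
            self.storage = storage

        @swh_retry
        def metadata_authority_add(self, type, url, metadata):
            # Delegate to the wrapped backend; tenacity re-invokes on
            # transient failures and exposes .retry.sleep for the tests.
            return self.storage.metadata_authority_add(type=type, url=url, metadata=metadata)
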
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3170,353 +3170,88 @@
assert expected == ret
- def test_tool_add(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
- assert actual_tool is None # does not exist
-
- # add it
- actual_tools = swh_storage.tool_add([tool])
-
- assert len(actual_tools) == 1
- actual_tool = actual_tools[0]
- assert actual_tool is not None # now it exists
- new_id = actual_tool.pop("id")
- assert actual_tool == tool
-
- actual_tools2 = swh_storage.tool_add([tool])
- actual_tool2 = actual_tools2[0]
- assert actual_tool2 is not None # now it exists
- new_id2 = actual_tool2.pop("id")
-
- assert new_id == new_id2
- assert actual_tool == actual_tool2
-
- def test_tool_add_multiple(self, swh_storage):
- tool = {
- "name": "some-unknown-tool",
- "version": "some-version",
- "configuration": {"debian-package": "some-package"},
- }
-
- actual_tools = list(swh_storage.tool_add([tool]))
- assert len(actual_tools) == 1
-
- new_tools = [
- tool,
- {"name": "yet-another-tool", "version": "version", "configuration": {},},
- ]
-
- actual_tools = swh_storage.tool_add(new_tools)
- assert len(actual_tools) == 2
-
- # order not guaranteed, so we iterate over results to check
- for tool in actual_tools:
- _id = tool.pop("id")
- assert _id is not None
- assert tool in new_tools
-
- def test_tool_get_missing(self, swh_storage):
- tool = {
- "name": "unknown-tool",
- "version": "3.1.0rc2-31-ga2cbb8c",
- "configuration": {"command_line": "nomossa <filepath>"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
-
- def test_tool_metadata_get_missing_context(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"context": "unknown-context"},
- }
-
- actual_tool = swh_storage.tool_get(tool)
-
- assert actual_tool is None
+ def test_metadata_fetcher_add_get(self, swh_storage):
+ actual_fetcher = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
+ assert actual_fetcher is None # does not exist
- def test_tool_metadata_get(self, swh_storage):
- tool = {
- "name": "swh-metadata-translator",
- "version": "0.0.1",
- "configuration": {"type": "local", "context": "npm"},
- }
- expected_tool = swh_storage.tool_add([tool])[0]
+ swh_storage.metadata_fetcher_add(**data.metadata_fetcher)
- # when
- actual_tool = swh_storage.tool_get(tool)
+ res = swh_storage.metadata_fetcher_get(
+ data.metadata_fetcher["name"], data.metadata_fetcher["version"]
+ )
- # then
- assert expected_tool == actual_tool
+ assert res is not data.metadata_fetcher
+ assert res == data.metadata_fetcher
- def test_metadata_provider_get(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get(6459456445615)
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ def test_metadata_authority_add_get(self, swh_storage):
+ actual_authority = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
+ assert actual_authority is None # does not exist
- actual_provider = swh_storage.metadata_provider_get(provider_id)
- expected_provider = {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- # then
- del actual_provider["id"]
- assert actual_provider, expected_provider
+ swh_storage.metadata_authority_add(**data.metadata_authority)
- def test_metadata_provider_get_by(self, swh_storage):
- # given
- no_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- assert no_provider is None
- # when
- provider_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
+ res = swh_storage.metadata_authority_get(
+ data.metadata_authority["type"], data.metadata_authority["url"]
)
- actual_provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- # then
- assert provider_id, actual_provider["id"]
+ assert res is not data.metadata_authority
+ assert res == data.metadata_authority
def test_origin_metadata_add(self, swh_storage):
- # given
origin = data.origin
+ fetcher = data.metadata_fetcher
+ authority = data.metadata_authority
swh_storage.origin_add([origin])[0]
- tools = swh_storage.tool_add([data.metadata_tool])
- tool = tools[0]
+ swh_storage.metadata_fetcher_add(**fetcher)
+ swh_storage.metadata_authority_add(**authority)
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
-
- # when adding for the same origin 2 metadatas
- n_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- swh_storage.origin_metadata_add(
- origin["url"],
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin["url"],
- "2015-01-01 23:00:00+00",
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- n_actual_om = len(list(swh_storage.origin_metadata_get_by(origin["url"])))
- # then
- assert n_actual_om == n_om + 2
+ swh_storage.origin_metadata_add(**data.origin_metadata)
+ swh_storage.origin_metadata_add(**data.origin_metadata2)
- def test_origin_metadata_get(self, swh_storage):
- # given
- origin_url = data.origin["url"]
- origin_url2 = data.origin2["url"]
- swh_storage.origin_add([data.origin])
- swh_storage.origin_add([data.origin2])
- swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- tool = swh_storage.tool_add([data.metadata_tool])[0]
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata2["discovery_date"],
- provider["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- all_metadatas = list(
+ assert [data.origin_metadata, data.origin_metadata2] == list(
sorted(
- swh_storage.origin_metadata_get_by(origin_url),
+ swh_storage.origin_metadata_get(origin["url"], authority),
key=lambda x: x["discovery_date"],
)
)
- metadatas_for_origin2 = list(swh_storage.origin_metadata_get_by(origin_url2))
- expected_results = [
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2015, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- {
- "origin_url": origin_url,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider["id"],
- "provider_name": "hal",
- "provider_type": "deposit-client",
- "provider_url": "http:///hal/inria",
- "tool_id": tool["id"],
- },
- ]
- # then
- assert len(all_metadatas) == 2
- assert len(metadatas_for_origin2) == 1
- assert all_metadatas == expected_results
-
- def test_metadata_provider_add(self, swh_storage):
- provider = {
- "provider_name": "swMATH",
- "provider_type": "registry",
- "provider_url": "http://www.swmath.org/",
- "metadata": {
- "email": "contact@swmath.org",
- "license": "All rights reserved",
- },
- }
- provider["id"] = provider_id = swh_storage.metadata_provider_add(**provider)
- assert provider == swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert provider == swh_storage.metadata_provider_get(provider_id)
-
- def test_origin_metadata_get_by_provider_type(self, swh_storage):
- # given
- origin_url = data.origin["url"]
+ def test_origin_metadata_get(self, swh_storage):
+ authority = data.metadata_authority
+ fetcher = data.metadata_fetcher
+ origin_url1 = data.origin["url"]
origin_url2 = data.origin2["url"]
swh_storage.origin_add([data.origin])
swh_storage.origin_add([data.origin2])
- provider1_id = swh_storage.metadata_provider_add(
- data.provider["name"],
- data.provider["type"],
- data.provider["url"],
- data.provider["metadata"],
- )
- provider1 = swh_storage.metadata_provider_get_by(
- {
- "provider_name": data.provider["name"],
- "provider_url": data.provider["url"],
- }
- )
- assert provider1 == swh_storage.metadata_provider_get(provider1_id)
- provider2_id = swh_storage.metadata_provider_add(
- "swMATH",
- "registry",
- "http://www.swmath.org/",
- {"email": "contact@swmath.org", "license": "All rights reserved"},
- )
- provider2 = swh_storage.metadata_provider_get_by(
- {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"}
- )
- assert provider2 == swh_storage.metadata_provider_get(provider2_id)
+ origin1_metadata1 = data.origin_metadata
+ origin1_metadata2 = data.origin_metadata2
+ origin2_metadata = {**data.origin_metadata2, "origin_url": origin_url2}
- # using the only tool now inserted in the data.sql, but for this
- # provider should be a crawler tool (not yet implemented)
- tool = swh_storage.tool_add([data.metadata_tool])[0]
+ swh_storage.metadata_authority_add(**authority)
+ swh_storage.metadata_fetcher_add(**fetcher)
- # when adding for the same origin 2 metadatas
- swh_storage.origin_metadata_add(
- origin_url,
- data.origin_metadata["discovery_date"],
- provider1["id"],
- tool["id"],
- data.origin_metadata["metadata"],
- )
- swh_storage.origin_metadata_add(
- origin_url2,
- data.origin_metadata2["discovery_date"],
- provider2["id"],
- tool["id"],
- data.origin_metadata2["metadata"],
- )
- provider_type = "registry"
- m_by_provider = list(
- swh_storage.origin_metadata_get_by(origin_url2, provider_type)
- )
- for item in m_by_provider:
- if "id" in item:
- del item["id"]
- expected_results = [
- {
- "origin_url": origin_url2,
- "discovery_date": datetime.datetime(
- 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc
- ),
- "metadata": {"name": "test_origin_metadata", "version": "0.0.1"},
- "provider_id": provider2["id"],
- "provider_name": "swMATH",
- "provider_type": provider_type,
- "provider_url": "http://www.swmath.org/",
- "tool_id": tool["id"],
- }
- ]
- # then
+ swh_storage.origin_metadata_add(**origin1_metadata1)
+ swh_storage.origin_metadata_add(**origin1_metadata2)
+ swh_storage.origin_metadata_add(**origin2_metadata)
- assert len(m_by_provider) == 1
- assert m_by_provider == expected_results
+ assert [origin1_metadata1, origin1_metadata2] == list(
+ sorted(
+ swh_storage.origin_metadata_get(origin_url1, authority),
+ key=lambda x: x["discovery_date"],
+ )
+ )
+
+ assert [origin2_metadata] == list(
+ swh_storage.origin_metadata_get(origin_url2, authority)
+ )
class TestStorageGeneratedData:
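
Note: taken together, the rewritten tests define the new endpoint surface:
metadata_fetcher_add/metadata_fetcher_get keyed on (name, version),
metadata_authority_add/metadata_authority_get keyed on (type, url), and
origin_metadata_add/origin_metadata_get keyed on (origin_url, authority). An
end-to-end sketch against the in-memory backend (the get_storage
configuration is an assumption, mirrored from conftest.py):

    import datetime
    from swh.storage import get_storage

    storage = get_storage(cls="memory", journal_writer={"cls": "memory"})

    storage.origin_add([{"url": "https://example.org/project"}])
    storage.metadata_authority_add(
        type="deposit-client", url="http:///hal/inria", metadata={"location": "France"}
    )
    storage.metadata_fetcher_add(
        name="swh-deposit", version="0.0.1", metadata={"sword_version": "2"}
    )
    storage.origin_metadata_add(
        origin_url="https://example.org/project",
        discovery_date=datetime.datetime(2015, 1, 1, tzinfo=datetime.timezone.utc),
        authority={"type": "deposit-client", "url": "http:///hal/inria"},
        fetcher={"name": "swh-deposit", "version": "0.0.1"},
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    assert list(
        storage.origin_metadata_get(
            "https://example.org/project",
            {"type": "deposit-client", "url": "http:///hal/inria"},
        )
    )
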
