diff --git a/docs/extrinsic-metadata-specification.rst b/docs/extrinsic-metadata-specification.rst --- a/docs/extrinsic-metadata-specification.rst +++ b/docs/extrinsic-metadata-specification.rst @@ -32,11 +32,11 @@ An authority is uniquely defined by these properties: * its type, representing the kind of authority, which is one of these values: - * `deposit`, for metadata pushed to Software Heritage at the same time - as a software artifact - * `forge`, for metadata pulled from the same source as the one hosting - the software artifacts (which includes package managers) - * `registry`, for metadata pulled from a third-party + * `deposit`, for metadata pushed to Software Heritage at the same time + as a software artifact + * `forge`, for metadata pulled from the same source as the one hosting + the software artifacts (which includes package managers) + * `registry`, for metadata pulled from a third-party * its URL, which unambiguously identifies an instance of the authority type. Examples: @@ -145,6 +145,7 @@ added from this origin, in the format:: { + 'origin_url': ..., 'authority': {'type': ..., 'url': ...}, 'fetcher': {'name': ..., 'version': ...}, 'discovery_date': ..., diff --git a/sql/upgrades/148.sql b/sql/upgrades/148.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/148.sql @@ -0,0 +1,93 @@ +-- SWH DB schema upgrade +-- from_version: 147 +-- to_version: 148 +-- description: Implement extrinsic origin-metadata specification + +-- latest schema version +insert into dbversion(version, release, description) + values(148, now(), 'Work In Progress'); + +-- metadata_fetcher + +alter table tool rename to metadata_fetcher; +comment on table metadata_fetcher is 'Tools used to retrieve metadata'; + +alter table metadata_fetcher + rename column configuration to metadata; + +comment on column metadata_fetcher.id is 'Internal identifier of the fetcher'; +comment on column metadata_fetcher.name is 'Fetcher name'; +comment on column metadata_fetcher.version is 
'Fetcher version'; +comment on column metadata_fetcher.metadata is 'Extra information about the fetcher'; + +alter index tool_pkey + rename to metadata_fetcher_pkey; +create unique index metadata_fetcher_name_version + on metadata_fetcher(name, version); + +drop index tool_tool_name_tool_version_tool_configuration_idx; + -- was an index on (name, version, configuration) + +-- metadata_authority + +alter table metadata_provider rename to metadata_authority; +comment on table metadata_authority is 'Metadata authority information'; + +alter table metadata_authority + drop column provider_name; +alter table metadata_authority + rename column provider_type to type; +alter table metadata_authority + rename column provider_url to url; + +comment on column metadata_authority.id is 'Internal identifier of the authority'; +comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)'; +comment on column metadata_authority.url is 'Authority''s uri'; +comment on column metadata_authority.metadata is 'Other metadata about authority'; + +alter index metadata_provider_pkey + rename to metadata_authority_pkey; +create unique index metadata_authority_type_url + on metadata_authority(type, url); + +drop index if exists metadata_provider_provider_name_provider_url_idx; + -- was an index on (provider_name, provider_url) + +-- origin_metadata + +alter table origin_metadata + rename column provider_id to authority_id; +alter table origin_metadata + rename column tool_id to fetcher_id; +alter table origin_metadata + add column format text default 'sword-v2-atom-codemeta-v2-in-json'; +alter table origin_metadata + rename column metadata to metadata_jsonb; +alter table origin_metadata + add column metadata bytea; + +create index concurrently origin_metadata_origin_authority_date + on origin_metadata(origin_id, authority_id, discovery_date); + +drop index origin_metadata_origin_id_provider_id_tool_id_idx; + -- was an index on (origin_id, provider_id, tool_id) + +-- migrate 
metadata_jsonb (a jsonb) to metadata (a bytea) with an external process + +alter table origin_metadata + drop column metadata_jsonb; + +comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.'; +comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.'; +comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.'; +comment on column origin_metadata.metadata is 'original metadata in opaque format'; + + +-- cleanup unused functions + +drop function swh_mktemp_tool; +drop function swh_tool_add; + +drop function swh_origin_metadata_get_by_origin; +drop function swh_origin_metadata_get_by_provider_type; +drop type origin_metadata_signature; diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -785,34 +785,100 @@ yield from self._origin_visit_iter_to(start_token) ########################## - # 'tool' table + # 'metadata_authority' table ########################## - _tool_keys = ["id", "name", "version", "configuration"] + _metadata_authority_keys = ["url", "type", "metadata"] - @_prepared_insert_statement("tool_by_uuid", _tool_keys) - def tool_by_uuid_add_one(self, tool: Dict[str, Any], *, statement) -> None: - self._execute_with_retries(statement, [tool[key] for key in self._tool_keys]) + @_prepared_insert_statement("metadata_authority", _metadata_authority_keys) + def metadata_authority_add(self, url, type, metadata, *, statement): + return self._execute_with_retries(statement, [url, type, metadata]) - @_prepared_insert_statement("tool", _tool_keys) - def tool_add_one(self, tool: Dict[str, Any], *, statement) -> None: - self._execute_with_retries(statement, [tool[key] for key in self._tool_keys]) - self._increment_counter("tool", 1) + @_prepared_statement("SELECT * from metadata_authority WHERE type = ? 
AND url = ?") + def metadata_authority_get(self, type, url, *, statement) -> Optional[Row]: + return next(iter(self._execute_with_retries(statement, [type, url])), None) + + ########################## + # 'metadata_fetcher' table + ########################## + + _metadata_fetcher_keys = ["name", "version", "metadata"] + + @_prepared_insert_statement("metadata_fetcher", _metadata_fetcher_keys) + def metadata_fetcher_add(self, name, version, metadata, *, statement): + return self._execute_with_retries(statement, [name, version, metadata]) @_prepared_statement( - "SELECT id FROM tool " "WHERE name = ? AND version = ? " "AND configuration = ?" + "SELECT * from metadata_fetcher WHERE name = ? AND version = ?" ) - def tool_get_one_uuid( - self, name: str, version: str, configuration: Dict[str, Any], *, statement - ) -> Optional[str]: - rows = list( - self._execute_with_retries(statement, [name, version, configuration]) + def metadata_fetcher_get(self, name, version, *, statement) -> Optional[Row]: + return next(iter(self._execute_with_retries(statement, [name, version])), None) + + ########################## + # 'origin_metadata' table + ########################## + + _origin_metadata_keys = [ + "origin", + "authority_type", + "authority_url", + "discovery_date", + "fetcher_name", + "fetcher_version", + "format", + "metadata", + ] + + @_prepared_insert_statement("origin_metadata", _origin_metadata_keys) + def origin_metadata_add( + self, + origin, + authority_type, + authority_url, + discovery_date, + fetcher_name, + fetcher_version, + format, + metadata, + *, + statement, + ): + return self._execute_with_retries( + statement, + [ + origin, + authority_type, + authority_url, + discovery_date, + fetcher_name, + fetcher_version, + format, + metadata, + ], + ) + + @_prepared_statement( + "SELECT * from origin_metadata " + "WHERE origin=? AND authority_url=? AND discovery_date>=? " + "AND authority_type=?" 
+ ) + def origin_metadata_get_after( + self, origin, authority_type, authority_url, after, *, statement + ): + return self._execute_with_retries( + statement, [origin, authority_url, after, authority_type] + ) + + @_prepared_statement( + "SELECT * from origin_metadata " + "WHERE origin=? AND authority_url=? AND authority_type=?" + ) + def origin_metadata_get( + self, origin, authority_type, authority_url, *, statement + ) -> Iterable[Row]: + return self._execute_with_retries( + statement, [origin, authority_url, authority_type] ) - if rows: - assert len(rows) == 1 - return rows[0].id - else: - return None ########################## # Miscellaneous diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -166,21 +166,37 @@ ); -CREATE TABLE IF NOT EXISTS tool_by_uuid ( - id timeuuid PRIMARY KEY, - name ascii, - version ascii, - configuration blob, +CREATE TABLE IF NOT EXISTS metadata_authority ( + url text, + type ascii, + metadata text, + PRIMARY KEY ((url), type) ); -CREATE TABLE IF NOT EXISTS tool ( - id timeuuid, +CREATE TABLE IF NOT EXISTS metadata_fetcher ( name ascii, version ascii, - configuration blob, - PRIMARY KEY ((name, version, configuration)) -) + metadata text, + PRIMARY KEY ((name), version) +); + + +CREATE TABLE IF NOT EXISTS origin_metadata ( + origin text, + authority_type text, + authority_url text, + discovery_date timestamp, + fetcher_name ascii, + fetcher_version ascii, + format ascii, + metadata blob, + PRIMARY KEY ((origin), authority_type, authority_url, discovery_date, + fetcher_name, fetcher_version), + -- for now, authority_url could be in the partition key; but leaving it + -- in the clustering key allows listing authorities with metadata on an + -- origin if we ever need to do it. 
+); CREATE TABLE IF NOT EXISTS object_count ( @@ -211,7 +227,8 @@ TABLES = ( "skipped_content content revision revision_parent release " "directory directory_entry snapshot snapshot_branch " - "origin_visit origin tool_by_uuid tool object_count" + "origin_visit origin metadata_authority metadata_fetcher " + "origin_metadata object_count" ).split() HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"] diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -4,11 +4,11 @@ # See top-level LICENSE file for more information import datetime +import itertools import json import random import re from typing import Any, Dict, List, Iterable, Optional, Union -import uuid import attr import dateutil @@ -931,37 +931,6 @@ else: return None - def tool_add(self, tools): - inserted = [] - for tool in tools: - tool = tool.copy() - tool_json = tool.copy() - tool_json["configuration"] = json.dumps( - tool["configuration"], sort_keys=True - ).encode() - id_ = self._cql_runner.tool_get_one_uuid(**tool_json) - if not id_: - id_ = uuid.uuid1() - tool_json["id"] = id_ - self._cql_runner.tool_by_uuid_add_one(tool_json) - self._cql_runner.tool_add_one(tool_json) - tool["id"] = id_ - inserted.append(tool) - return inserted - - def tool_get(self, tool): - id_ = self._cql_runner.tool_get_one_uuid( - tool["name"], - tool["version"], - json.dumps(tool["configuration"], sort_keys=True).encode(), - ) - if id_: - tool = tool.copy() - tool["id"] = id_ - return tool - else: - return None - def stat_counters(self): rows = self._cql_runner.stat_counters() keys = ( @@ -981,27 +950,109 @@ def refresh_stat_counters(self): pass - def origin_metadata_add(self, origin_url, ts, provider, tool, metadata): - # TODO - raise NotImplementedError("not yet supported for Cassandra") + def origin_metadata_add( + self, + origin_url: str, + discovery_date: datetime.datetime, + authority: Dict[str, Any], + fetcher: 
Dict[str, Any], + format: str, + metadata: bytes, + ) -> None: + if not isinstance(origin_url, str): + raise StorageArgumentException( + "origin_id must be str, not %r" % (origin_url,) + ) + if not self._cql_runner.metadata_authority_get(**authority): + raise StorageArgumentException(f"Unknown authority {authority}") + if not self._cql_runner.metadata_fetcher_get(**fetcher): + raise StorageArgumentException(f"Unknown fetcher {fetcher}") + + self._cql_runner.origin_metadata_add( + origin_url, + authority["type"], + authority["url"], + discovery_date, + fetcher["name"], + fetcher["version"], + format, + metadata, + ) - def origin_metadata_get_by(self, origin_url, provider_type=None): - # TODO - raise NotImplementedError("not yet supported for Cassandra") + def origin_metadata_get( + self, + origin_url: str, + authority: Dict[str, str], + after: Optional[datetime.datetime] = None, + limit: Optional[int] = None, + ) -> List[Dict[str, Any]]: + if not isinstance(origin_url, str): + raise TypeError("origin_url must be str, not %r" % (origin_url,)) + + if after is None: + entries = self._cql_runner.origin_metadata_get( + origin_url, authority["type"], authority["url"] + ) + else: + entries = self._cql_runner.origin_metadata_get_after( + origin_url, authority["type"], authority["url"], after + ) - def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata - ): - # TODO - raise NotImplementedError("not yet supported for Cassandra") + if limit: + entries = itertools.islice(entries, 0, limit) - def metadata_provider_get(self, provider_id): - # TODO - raise NotImplementedError("not yet supported for Cassandra") + results = [] + for entry in entries: + discovery_date = entry.discovery_date.replace(tzinfo=datetime.timezone.utc) + results.append( + { + "origin_url": entry.origin, + "authority": { + "type": entry.authority_type, + "url": entry.authority_url, + }, + "fetcher": { + "name": entry.fetcher_name, + "version": entry.fetcher_version, + }, + 
"discovery_date": discovery_date, + "format": entry.format, + "metadata": entry.metadata, + } + ) + return results + + def metadata_fetcher_add( + self, name: str, version: str, metadata: Dict[str, Any] + ) -> None: + self._cql_runner.metadata_fetcher_add(name, version, json.dumps(metadata)) + + def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]: + fetcher = self._cql_runner.metadata_fetcher_get(name, version) + if fetcher: + return { + "name": fetcher.name, + "version": fetcher.version, + "metadata": json.loads(fetcher.metadata), + } + else: + return None - def metadata_provider_get_by(self, provider): - # TODO - raise NotImplementedError("not yet supported for Cassandra") + def metadata_authority_add( + self, type: str, url: str, metadata: Dict[str, Any] + ) -> None: + self._cql_runner.metadata_authority_add(url, type, json.dumps(metadata)) + + def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]: + authority = self._cql_runner.metadata_authority_get(type, url) + if authority: + return { + "type": authority.type, + "url": authority.url, + "metadata": json.loads(authority.metadata), + } + else: + return None def clear_buffers(self, object_types: Optional[Iterable[str]] = None) -> None: """Do nothing diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -3,8 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime import random import select +from typing import Optional from swh.core.db import BaseDb from swh.core.db.db_utils import stored_procedure, jsonize @@ -997,134 +999,150 @@ def release_get_random(self, cur=None): return self._get_random_row_from_table("release", ["id"], "id", cur) - def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None): + origin_metadata_get_cols = [ + "origin.url", + "discovery_date", + "metadata_authority.type", + 
"metadata_authority.url", + "metadata_fetcher.name", + "metadata_fetcher.version", + "format", + "metadata", + ] + + def origin_metadata_add( + self, + origin: str, + discovery_date: datetime.datetime, + authority: int, + fetcher: int, + format: str, + metadata: bytes, + cur=None, + ) -> None: """ Add an origin_metadata for the origin at ts with provider, tool and metadata. Args: - origin (int): the origin's id for which the metadata is added - ts (datetime): time when the metadata was found - provider (int): the metadata provider identifier - tool (int): the tool's identifier used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location - - Returns: - id (int): the origin_metadata unique id - + origin: the origin's id for which the metadata is added + discovery_date: time when the metadata was found + authority: the metadata provider identifier + fetcher: the tool's identifier used to extract metadata + format: the format of the metadata + metadata: the metadata retrieved at the time and location """ cur = self._cursor(cur) insert = """INSERT INTO origin_metadata (origin_id, discovery_date, - provider_id, tool_id, metadata) - SELECT id, %s, %s, %s, %s FROM origin WHERE url = %s""" - cur.execute(insert, (ts, provider, tool, jsonize(metadata), origin)) - - origin_metadata_get_cols = [ - "origin_url", - "discovery_date", - "tool_id", - "metadata", - "provider_id", - "provider_name", - "provider_type", - "provider_url", - ] - - def origin_metadata_get_by(self, origin_url, provider_type=None, cur=None): - """Retrieve all origin_metadata entries for one origin_url + authority_id, fetcher_id, format, metadata) + SELECT id, %s, %s, %s, %s, %s FROM origin WHERE url = %s""" + cur.execute( + insert, + (discovery_date, authority, fetcher, format, jsonize(metadata), origin), + ) - """ + def origin_metadata_get( + self, + origin_url: str, + authority: int, + after: Optional[datetime.datetime], + limit: Optional[int], + cur=None, + ): cur = 
self._cursor(cur) - if not provider_type: - query = """SELECT %s - FROM swh_origin_metadata_get_by_origin( - %%s)""" % ( - ",".join(self.origin_metadata_get_cols) - ) + assert self.origin_metadata_get_cols[-1] == "metadata" + query_parts = [ + f"SELECT {', '.join(self.origin_metadata_get_cols[0:-1])}, " + f" origin_metadata.metadata AS metadata " + f"FROM origin_metadata " + f"INNER JOIN metadata_authority " + f" ON (metadata_authority.id=authority_id) " + f"INNER JOIN metadata_fetcher ON (metadata_fetcher.id=fetcher_id) " + f"INNER JOIN origin ON (origin.id=origin_metadata.origin_id) " + f"WHERE origin.url=%s AND authority_id=%s" + ] + args = [origin_url, authority] - cur.execute(query, (origin_url,)) + if after: + query_parts.append("AND discovery_date >= %s") + args.append(after) - else: - query = """SELECT %s - FROM swh_origin_metadata_get_by_provider_type( - %%s, %%s)""" % ( - ",".join(self.origin_metadata_get_cols) - ) + query_parts.append("ORDER BY discovery_date") - cur.execute(query, (origin_url, provider_type)) + if limit: + query_parts.append("LIMIT %s") + args.append(limit) + cur.execute(" ".join(query_parts), args) yield from cur - tool_cols = ["id", "name", "version", "configuration"] - - @stored_procedure("swh_mktemp_tool") - def mktemp_tool(self, cur=None): - pass + metadata_fetcher_cols = ["name", "version", "metadata"] - def tool_add_from_temp(self, cur=None): + def metadata_fetcher_add( + self, name: str, version: str, metadata: bytes, cur=None + ) -> None: cur = self._cursor(cur) - cur.execute("SELECT %s from swh_tool_add()" % (",".join(self.tool_cols),)) - yield from cur + cur.execute( + "INSERT INTO metadata_fetcher (name, version, metadata) " + "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", + (name, version, jsonize(metadata)), + ) - def tool_get(self, name, version, configuration, cur=None): + def metadata_fetcher_get(self, name: str, version: str, cur=None): cur = self._cursor(cur) cur.execute( - """select %s - from tool - where name=%%s and 
- version=%%s and - configuration=%%s""" - % (",".join(self.tool_cols)), - (name, version, configuration), + f"SELECT {', '.join(self.metadata_fetcher_cols)} " + f"FROM metadata_fetcher " + f"WHERE name=%s AND version=%s", + (name, version), ) - return cur.fetchone() - metadata_provider_cols = [ - "id", - "provider_name", - "provider_type", - "provider_url", - "metadata", - ] - - def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata, cur=None - ): - """Insert a new provider and return the new identifier.""" + def metadata_fetcher_get_id( + self, name: str, version: str, cur=None + ) -> Optional[int]: cur = self._cursor(cur) - insert = """INSERT INTO metadata_provider (provider_name, provider_type, - provider_url, metadata) values (%s, %s, %s, %s) - RETURNING id""" - cur.execute( - insert, (provider_name, provider_type, provider_url, jsonize(metadata)) + "SELECT id FROM metadata_fetcher WHERE name=%s AND version=%s", + (name, version), ) - return cur.fetchone()[0] + row = cur.fetchone() + if row: + return row[0] + else: + return None + + metadata_authority_cols = ["type", "url", "metadata"] - def metadata_provider_get(self, provider_id, cur=None): + def metadata_authority_add( + self, type: str, url: str, metadata: bytes, cur=None + ) -> None: cur = self._cursor(cur) cur.execute( - """select %s - from metadata_provider - where id=%%s """ - % (",".join(self.metadata_provider_cols)), - (provider_id,), + "INSERT INTO metadata_authority (type, url, metadata) " + "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", + (type, url, jsonize(metadata)), ) + def metadata_authority_get(self, type: str, url: str, cur=None): + cur = self._cursor(cur) + cur.execute( + f"SELECT {', '.join(self.metadata_authority_cols)} " + f"FROM metadata_authority " + f"WHERE type=%s AND url=%s", + (type, url), + ) return cur.fetchone() - def metadata_provider_get_by(self, provider_name, provider_url, cur=None): + def metadata_authority_get_id(self, type: str, url: str, 
cur=None) -> Optional[int]: cur = self._cursor(cur) cur.execute( - """select %s - from metadata_provider - where provider_name=%%s and - provider_url=%%s""" - % (",".join(self.metadata_provider_cols)), - (provider_name, provider_url), + "SELECT id FROM metadata_authority WHERE type=%s AND url=%s", (type, url) ) - - return cur.fetchone() + row = cur.fetchone() + if row: + return row[0] + else: + return None def _get_random_row_from_table(self, table_name, cols, id_col, cur=None): random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE)) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -14,7 +14,20 @@ from collections import defaultdict from datetime import timedelta -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import ( + Any, + Callable, + Dict, + Generic, + Hashable, + Iterable, + Iterator, + List, + Optional, + Tuple, + TypeVar, + Union, +) import attr @@ -45,6 +58,50 @@ BULK_BLOCK_CONTENT_LEN_MAX = 10000 +SortedListItem = TypeVar("SortedListItem") +SortedListKey = TypeVar("SortedListKey") + + +class SortedList(collections.UserList, Generic[SortedListKey, SortedListItem]): + data: List[Tuple[Any, SortedListItem]] + + # https://github.com/python/mypy/issues/708 + # key: Callable[[SortedListItem], SortedListKey] + + def __init__( + self, + data: List[SortedListItem] = None, + key: Optional[Callable[[SortedListItem], SortedListKey]] = None, + ): + if key is None: + + def key(item): + return item + + assert key is not None # for mypy + super().__init__(sorted((key(x), x) for x in data or [])) + + self.key: Callable[[SortedListItem], SortedListKey] = key + + def add(self, item: SortedListItem): + k = self.key(item) + bisect.insort(self.data, (k, item)) + + def __iter__(self) -> Iterator[SortedListItem]: + for (k, item) in self.data: + yield item + + def iter_from(self, start_key: SortedListKey) -> Iterator[SortedListItem]: + """Returns an iterator 
over all the elements whose key is greater + or equal to `start_key`. + (This is an efficient equivalent to: + `(x for x in L if key(x) >= start_key)`) + """ + from_index = bisect.bisect_left(self.data, (start_key,)) + for (k, item) in itertools.islice(self.data, from_index, None): + yield item + + class InMemoryStorage: def __init__(self, journal_writer=None): @@ -65,13 +122,18 @@ self._origins_by_sha1 = {} self._origin_visits = {} self._persons = [] - self._origin_metadata = defaultdict(list) - self._tools = {} - self._metadata_providers = {} - self._objects = defaultdict(list) - # ideally we would want a skip list for both fast inserts and searches - self._sorted_sha1s = [] + # {origin_url: {authority: [metadata]}} + self._origin_metadata: Dict[ + str, Dict[Hashable, SortedList[datetime.datetime, Dict[str, Any]]] + ] = defaultdict( + lambda: defaultdict(lambda: SortedList(key=lambda x: x["discovery_date"])) + ) # noqa + + self._metadata_fetchers: Dict[Hashable, Dict[str, Any]] = {} + self._metadata_authorities: Dict[Hashable, Dict[str, Any]] = {} + self._objects = defaultdict(list) + self._sorted_sha1s = SortedList[bytes, bytes]() self.objstorage = ObjStorage({"cls": "memory", "args": {}}) @@ -82,7 +144,6 @@ self.journal_writer.content_add(contents) content_add = 0 - content_add_bytes = 0 if with_data: summary = self.objstorage.content_add( c for c in contents if c.status != "absent" @@ -111,7 +172,7 @@ self._content_indexes[algorithm][hash_].add(key) self._objects[content.sha1_git].append(("content", content.sha1)) self._contents[key] = content - bisect.insort(self._sorted_sha1s, content.sha1) + self._sorted_sha1s.add(content.sha1) self._contents[key] = attr.evolve(self._contents[key], data=None) content_add += 1 @@ -163,11 +224,9 @@ def content_get_range(self, start, end, limit=1000): if limit is None: raise StorageArgumentException("limit should not be None") - from_index = bisect.bisect_left(self._sorted_sha1s, start) - sha1s = 
itertools.islice(self._sorted_sha1s, from_index, None) sha1s = ( (sha1, content_key) - for sha1 in sha1s + for sha1 in self._sorted_sha1s.iter_from(start) for content_key in self._content_indexes["sha1"][sha1] ) matched = [] @@ -906,71 +965,104 @@ def refresh_stat_counters(self): pass - def origin_metadata_add(self, origin_url, ts, provider, tool, metadata): + def origin_metadata_add( + self, + origin_url: str, + discovery_date: datetime.datetime, + authority: Dict[str, Any], + fetcher: Dict[str, Any], + format: str, + metadata: bytes, + ) -> None: if not isinstance(origin_url, str): - raise TypeError("origin_id must be str, not %r" % (origin_url,)) - - if isinstance(ts, str): - ts = dateutil.parser.parse(ts) + raise StorageArgumentException( + "origin_id must be str, not %r" % (origin_url,) + ) + authority_key = self._metadata_authority_key(authority) + if authority_key not in self._metadata_authorities: + raise StorageArgumentException(f"Unknown authority {authority}") + fetcher_key = self._metadata_fetcher_key(fetcher) + if fetcher_key not in self._metadata_fetchers: + raise StorageArgumentException(f"Unknown fetcher {fetcher}") origin_metadata = { "origin_url": origin_url, - "discovery_date": ts, - "tool_id": tool, + "discovery_date": discovery_date, + "authority": authority_key, + "fetcher": fetcher_key, + "format": format, "metadata": metadata, - "provider_id": provider, } - self._origin_metadata[origin_url].append(origin_metadata) + self._origin_metadata[origin_url][authority_key].add(origin_metadata) return None - def origin_metadata_get_by(self, origin_url, provider_type=None): + def origin_metadata_get( + self, + origin_url: str, + authority: Dict[str, str], + after: Optional[datetime.datetime] = None, + limit: Optional[int] = None, + ) -> List[Dict[str, Any]]: if not isinstance(origin_url, str): raise TypeError("origin_url must be str, not %r" % (origin_url,)) - metadata = [] - for item in self._origin_metadata[origin_url]: - item = copy.deepcopy(item) - 
provider = self.metadata_provider_get(item["provider_id"]) - for attr_name in ("name", "type", "url"): - item["provider_" + attr_name] = provider["provider_" + attr_name] - metadata.append(item) - return metadata - - def tool_add(self, tools): - inserted = [] - for tool in tools: - key = self._tool_key(tool) - assert "id" not in tool - record = copy.deepcopy(tool) - record["id"] = key # TODO: remove this - if key not in self._tools: - self._tools[key] = record - inserted.append(copy.deepcopy(self._tools[key])) - - return inserted - - def tool_get(self, tool): - return self._tools.get(self._tool_key(tool)) - - def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata - ): - provider = { - "provider_name": provider_name, - "provider_type": provider_type, - "provider_url": provider_url, + + authority_key = self._metadata_authority_key(authority) + + if after is None: + entries = iter(self._origin_metadata[origin_url][authority_key]) + else: + entries = self._origin_metadata[origin_url][authority_key].iter_from(after) + if limit: + entries = itertools.islice(entries, 0, limit) + + results = [] + for entry in entries: + authority = self._metadata_authorities[entry["authority"]] + fetcher = self._metadata_fetchers[entry["fetcher"]] + results.append( + { + **entry, + "authority": {"type": authority["type"], "url": authority["url"],}, + "fetcher": { + "name": fetcher["name"], + "version": fetcher["version"], + }, + } + ) + return results + + def metadata_fetcher_add( + self, name: str, version: str, metadata: Dict[str, Any] + ) -> None: + fetcher = { + "name": name, + "version": version, "metadata": metadata, } - key = self._metadata_provider_key(provider) - provider["id"] = key - self._metadata_providers[key] = provider - return key + key = self._metadata_fetcher_key(fetcher) + if key not in self._metadata_fetchers: + self._metadata_fetchers[key] = fetcher - def metadata_provider_get(self, provider_id): - return 
self._metadata_providers.get(provider_id) + def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]: + return self._metadata_fetchers.get( + self._metadata_fetcher_key({"name": name, "version": version}) + ) - def metadata_provider_get_by(self, provider): - key = self._metadata_provider_key(provider) - return self._metadata_providers.get(key) + def metadata_authority_add( + self, type: str, url: str, metadata: Dict[str, Any] + ) -> None: + authority = { + "type": type, + "url": url, + "metadata": metadata, + } + key = self._metadata_authority_key(authority) + self._metadata_authorities[key] = authority + + def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]: + return self._metadata_authorities.get( + self._metadata_authority_key({"type": type, "url": url}) + ) def _get_origin_url(self, origin): if isinstance(origin, str): @@ -997,16 +1089,12 @@ return tuple((key, content.get(key)) for key in sorted(DEFAULT_ALGORITHMS)) @staticmethod - def _tool_key(tool): - return "%r %r %r" % ( - tool["name"], - tool["version"], - tuple(sorted(tool["configuration"].items())), - ) + def _metadata_fetcher_key(fetcher: Dict) -> Hashable: + return (fetcher["name"], fetcher["version"]) @staticmethod - def _metadata_provider_key(provider): - return "%r %r" % (provider["provider_name"], provider["provider_url"]) + def _metadata_authority_key(authority: Dict) -> Hashable: + return (authority["type"], authority["url"]) def diff_directories(self, from_dir, to_dir, track_renaming=False): raise NotImplementedError("InMemoryStorage.diff_directories") diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -1132,120 +1132,118 @@ ... @remote_api_endpoint("origin/metadata/add") - def origin_metadata_add(self, origin_url, ts, provider, tool, metadata): - """ Add an origin_metadata for the origin at ts with provenance and - metadata. 
+ def origin_metadata_add( + self, + origin_url: str, + discovery_date: datetime.datetime, + authority: Dict[str, Any], + fetcher: Dict[str, Any], + format: str, + metadata: bytes, + ) -> None: + """Add an origin_metadata for the origin at discovery_date, + obtained using the `fetcher` from the `authority`. + + The authority and fetcher must be known to the storage before + using this endpoint. Args: - origin_url (str): the origin url for which the metadata is added - ts (datetime): timestamp of the found metadata - provider (int): the provider of metadata (ex:'hal') - tool (int): tool used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location + discovery_date: when the metadata was fetched. + authority: a dict containing keys `type` and `url`. + fetcher: a dict containing keys `name` and `version`. + format: text field indicating the format of the content of the + metadata: blob of raw metadata """ ... @remote_api_endpoint("origin/metadata/get") - def origin_metadata_get_by(self, origin_url, provider_type=None): + def origin_metadata_get( + self, + origin_url: str, + authority: Dict[str, str], + after: Optional[datetime.datetime] = None, + limit: Optional[int] = None, + ) -> List[Dict[str, Any]]: """Retrieve list of all origin_metadata entries for the origin_id Args: - origin_url (str): the origin's URL - provider_type (str): (optional) type of provider + origin_url: the origin's URL + authority: a dict containing keys `type` and `url`. + after: minimum discovery_date for a result to be returned + limit: maximum number of results to be returned Returns: - list of dicts: the origin_metadata dictionary with the keys: + list of dicts in the format: + + .. 
code-block:: python - - origin_id (int): origin's id - - discovery_date (datetime): timestamp of discovery - - tool_id (int): metadata's extracting tool - - metadata (jsonb) - - provider_id (int): metadata's provider - - provider_name (str) - - provider_type (str) - - provider_url (str) + { + 'authority': {'type': ..., 'url': ...}, + 'fetcher': {'name': ..., 'version': ...}, + 'discovery_date': ..., + 'format': '...', + 'metadata': b'...' + } """ ... - @remote_api_endpoint("tool/add") - def tool_add(self, tools): - """Add new tools to the storage. - - Args: - tools (iterable of :class:`dict`): Tool information to add to - storage. Each tool is a :class:`dict` with the following keys: + @remote_api_endpoint("fetcher/add") + def metadata_fetcher_add( + self, name: str, version: str, metadata: Dict[str, Any] + ) -> None: + """Add a new metadata fetcher to the storage. - - name (:class:`str`): name of the tool - - version (:class:`str`): version of the tool - - configuration (:class:`dict`): configuration of the tool, - must be json-encodable + `name` and `version` together are a unique identifier of this + fetcher; and `metadata` is an arbitrary dict of JSONable data + with information about this fetcher. - Returns: - :class:`dict`: All the tools inserted in storage - (including the internal ``id``). The order of the list is not - guaranteed to match the order of the initial list. + Args: + name: the name of the fetcher + version: version of the fetcher """ ... - @remote_api_endpoint("tool/data") - def tool_get(self, tool): - """Retrieve tool information. + @remote_api_endpoint("fetcher/get") + def metadata_fetcher_get(self, name: str, version: str) -> Optional[Dict[str, Any]]: + """Retrieve information about a fetcher Args: - tool (dict): Tool information we want to retrieve from storage. - The dicts have the same keys as those used in :func:`tool_add`. 
+ name: the name of the fetcher + version: version of the fetcher Returns: - dict: The full tool information if it exists (``id`` included), - None otherwise. + dictionary with keys `name`, `version`, and `metadata`; or None + if the fetcher is not known """ ... - @remote_api_endpoint("provider/add") - def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata - ): - """Add a metadata provider. + @remote_api_endpoint("authority/add") + def metadata_authority_add( + self, type: str, url: str, metadata: Dict[str, Any] + ) -> None: + """Add a metadata authority Args: - provider_name (str): Its name - provider_type (str): Its type (eg. `'deposit-client'`) - provider_url (str): Its URL + type: one of "deposit", "forge", or "registry" + url: unique URI identifying the authority metadata: JSON-encodable object - - Returns: - int: an identifier of the provider - """ - ... - - @remote_api_endpoint("provider/get") - def metadata_provider_get(self, provider_id): - """Get a metadata provider - - Args: - provider_id: Its identifier, as given by `metadata_provider_add`. - - Returns: - dict: same as `metadata_provider_add`; - or None if it does not exist. """ ... - @remote_api_endpoint("provider/getby") - def metadata_provider_get_by(self, provider): - """Get a metadata provider + @remote_api_endpoint("authority/get") + def metadata_authority_get(self, type: str, url: str) -> Optional[Dict[str, Any]]: + """Retrieve information about an authority Args: - provider (dict): A dictionary with keys: - * provider_name: Its name - * provider_url: Its URL + type: one of "deposit", "forge", or "registry" + url: unique URI identifying the authority Returns: - dict: same as `metadata_provider_add`; - or None if it does not exist. + dictionary with keys `type`, `url`, and `metadata`; or None + if the authority is not known """ ... 
diff --git a/swh/storage/retry.py b/swh/storage/retry.py --- a/swh/storage/retry.py +++ b/swh/storage/retry.py @@ -7,7 +7,7 @@ import traceback from datetime import datetime -from typing import Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, Optional, Union from tenacity import ( retry, @@ -127,29 +127,29 @@ ) @swh_retry - def tool_add(self, tools: Iterable[Dict]) -> List[Dict]: - tools = list(tools) - return self.storage.tool_add(tools) + def metadata_fetcher_add( + self, name: str, version: str, metadata: Dict[str, Any] + ) -> None: + return self.storage.metadata_fetcher_add(name, version, metadata) @swh_retry - def metadata_provider_add( - self, provider_name: str, provider_type: str, provider_url: str, metadata: Dict - ) -> Union[str, int]: - return self.storage.metadata_provider_add( - provider_name, provider_type, provider_url, metadata - ) + def metadata_authority_add( + self, type: str, url: str, metadata: Dict[str, Any] + ) -> None: + return self.storage.metadata_authority_add(type, url, metadata) @swh_retry def origin_metadata_add( self, origin_url: str, - ts: Union[str, datetime], - provider_id: int, - tool_id: int, - metadata: Dict, + discovery_date: datetime, + authority: Dict[str, Any], + fetcher: Dict[str, Any], + format: str, + metadata: bytes, ) -> None: return self.storage.origin_metadata_add( - origin_url, ts, provider_id, tool_id, metadata + origin_url, discovery_date, authority, fetcher, format, metadata ) @swh_retry diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(146, now(), 'Work In Progress'); + values(148, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -377,35 +377,34 @@ comment on column release.date_neg_utc_offset is 'True indicates 
-0 UTC offset for release timestamp'; -- Tools -create table tool +create table metadata_fetcher ( - id serial not null, - name text not null, - version text not null, - configuration jsonb + id serial not null, + name text not null, + version text not null, + metadata jsonb not null ); -comment on table tool is 'Tool information'; -comment on column tool.id is 'Tool identifier'; -comment on column tool.version is 'Tool name'; -comment on column tool.version is 'Tool version'; -comment on column tool.configuration is 'Tool configuration: command line, flags, etc...'; +comment on table metadata_fetcher is 'Tools used to retrieve metadata'; +comment on column metadata_fetcher.id is 'Internal identifier of the fetcher'; +comment on column metadata_fetcher.name is 'Fetcher name'; +comment on column metadata_fetcher.version is 'Fetcher version'; +comment on column metadata_fetcher.metadata is 'Extra information about the fetcher'; -create table metadata_provider +create table metadata_authority ( - id serial not null, - provider_name text not null, - provider_type text not null, - provider_url text, - metadata jsonb + id serial not null, + type text not null, + url text not null, + metadata jsonb not null ); -comment on table metadata_provider is 'Metadata provider information'; -comment on column metadata_provider.id is 'Provider''s identifier'; -comment on column metadata_provider.provider_name is 'Provider''s name'; -comment on column metadata_provider.provider_url is 'Provider''s url'; -comment on column metadata_provider.metadata is 'Other metadata about provider'; +comment on table metadata_authority is 'Metadata authority information'; +comment on column metadata_authority.id is 'Internal identifier of the authority'; +comment on column metadata_authority.type is 'Type of authority (deposit/forge/registry)'; +comment on column metadata_authority.url is 'Authority''s uri'; +comment on column metadata_authority.metadata is 'Other metadata about authority'; -- 
Discovery of metadata during a listing, loading, deposit or external_catalog of an origin @@ -415,18 +414,20 @@ id bigserial not null, -- PK internal object identifier origin_id bigint not null, -- references origin(id) discovery_date timestamptz not null, -- when it was extracted - provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github' - tool_id bigint not null, - metadata jsonb not null + authority_id bigint not null, + fetcher_id bigint not null, + format text not null, + metadata bytea not null ); comment on table origin_metadata is 'keeps all metadata found concerning an origin'; comment on column origin_metadata.id is 'the origin_metadata object''s id'; comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found'; comment on column origin_metadata.discovery_date is 'the date of retrieval'; -comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.'; -comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.'; -comment on column origin_metadata.metadata is 'metadata in json format but with original terms'; +comment on column origin_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.'; +comment on column origin_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.'; +comment on column origin_metadata.format is 'name of the format of metadata, used by readers to interpret it.'; +comment on column origin_metadata.metadata is 'original metadata in opaque format'; -- Keep a cache of object counts diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -101,17 +101,6 @@ ) on commit delete rows; $$; --- create a temporary table for the tools -create or replace function swh_mktemp_tool() - returns void - language sql -as $$ - create temporary table if not exists 
tmp_tool ( - like tool including defaults - ) on commit delete rows; - alter table tmp_tool drop column if exists id; -$$; - -- a content signature is a set of cryptographic checksums that we use to -- uniquely identify content, for the purpose of verifying if we already have -- some content or not during content injection @@ -920,76 +909,6 @@ $$; --- end revision_metadata functions --- origin_metadata functions -create type origin_metadata_signature as ( - id bigint, - origin_url text, - discovery_date timestamptz, - tool_id bigint, - metadata jsonb, - provider_id integer, - provider_name text, - provider_type text, - provider_url text -); -create or replace function swh_origin_metadata_get_by_origin( - origin text) - returns setof origin_metadata_signature - language sql - stable -as $$ - select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata, - mp.id as provider_id, provider_name, provider_type, provider_url - from origin_metadata as om - inner join metadata_provider mp on om.provider_id = mp.id - inner join origin o on om.origin_id = o.id - where o.url = origin - order by discovery_date desc; -$$; - -create or replace function swh_origin_metadata_get_by_provider_type( - origin_url text, - provider_type text) - returns setof origin_metadata_signature - language sql - stable -as $$ - select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata, - mp.id as provider_id, provider_name, provider_type, provider_url - from origin_metadata as om - inner join metadata_provider mp on om.provider_id = mp.id - inner join origin o on om.origin_id = o.id - where o.url = origin_url - and mp.provider_type = provider_type - order by discovery_date desc; -$$; --- end origin_metadata functions - --- add tmp_tool entries to tool, --- skipping duplicates if any. --- --- operates in bulk: 0. create temporary tmp_tool, 1. COPY to --- it, 2. 
call this function to insert and filtering out duplicates -create or replace function swh_tool_add() - returns setof tool - language plpgsql -as $$ -begin - insert into tool(name, version, configuration) - select name, version, configuration from tmp_tool tmp - on conflict(name, version, configuration) do nothing; - - return query - select id, name, version, configuration - from tmp_tool join tool - using(name, version, configuration); - - return; -end -$$; - - -- simple counter mapping a textual label to an integer value create type counter as ( label text, diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -144,32 +144,32 @@ alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid; alter table release validate constraint release_author_date_check; --- tool -create unique index tool_pkey on tool(id); -alter table tool add primary key using index tool_pkey; +-- metadata_fetcher +create unique index metadata_fetcher_pkey on metadata_fetcher(id); +alter table metadata_fetcher add primary key using index metadata_fetcher_pkey; -create unique index on tool(name, version, configuration); +create unique index metadata_fetcher_name_version on metadata_fetcher(name, version); --- metadata_provider -create unique index concurrently metadata_provider_pkey on metadata_provider(id); -alter table metadata_provider add primary key using index metadata_provider_pkey; +-- metadata_authority +create unique index concurrently metadata_authority_pkey on metadata_authority(id); +alter table metadata_authority add primary key using index metadata_authority_pkey; -create index concurrently on metadata_provider(provider_name, provider_url); +create unique index metadata_authority_type_url on metadata_authority(type, url); -- origin_metadata create unique index concurrently origin_metadata_pkey on 
origin_metadata(id); alter table origin_metadata add primary key using index origin_metadata_pkey; -create index concurrently on origin_metadata(origin_id, provider_id, tool_id); +create index concurrently origin_metadata_origin_authority_date on origin_metadata(origin_id, authority_id, discovery_date); alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid; alter table origin_metadata validate constraint origin_metadata_origin_fkey; -alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid; -alter table origin_metadata validate constraint origin_metadata_provider_fkey; +alter table origin_metadata add constraint origin_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid; +alter table origin_metadata validate constraint origin_metadata_authority_fkey; -alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references tool(id) not valid; -alter table origin_metadata validate constraint origin_metadata_tool_fkey; +alter table origin_metadata add constraint origin_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid; +alter table origin_metadata validate constraint origin_metadata_fetcher_fkey; -- object_counts create unique index concurrently object_counts_pkey on object_counts(object_type); diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -6,7 +6,6 @@ import contextlib import datetime import itertools -import json from collections import defaultdict from contextlib import contextmanager @@ -1138,72 +1137,101 @@ @timed @db_transaction() def origin_metadata_add( - self, origin_url, ts, provider, tool, metadata, db=None, cur=None - ): - if isinstance(ts, str): - ts = dateutil.parser.parse(ts) - - db.origin_metadata_add(origin_url, 
ts, provider, tool, metadata, cur) + self, + origin_url: str, + discovery_date: datetime.datetime, + authority: Dict[str, Any], + fetcher: Dict[str, Any], + format: str, + metadata: bytes, + db=None, + cur=None, + ) -> None: + authority_id = db.metadata_authority_get_id( + authority["type"], authority["url"], cur + ) + fetcher_id = db.metadata_fetcher_get_id( + fetcher["name"], fetcher["version"], cur + ) + if not authority_id: + raise StorageArgumentException(f"Unknown authority {authority}") + if not fetcher_id: + raise StorageArgumentException(f"Unknown fetcher {fetcher}") + db.origin_metadata_add( + origin_url, discovery_date, authority_id, fetcher_id, format, metadata, cur + ) send_metric("origin_metadata:add", count=1, method_name="origin_metadata_add") @timed - @db_transaction_generator(statement_timeout=500) - def origin_metadata_get_by(self, origin_url, provider_type=None, db=None, cur=None): - for line in db.origin_metadata_get_by(origin_url, provider_type, cur): - yield dict(zip(db.origin_metadata_get_cols, line)) + @db_transaction(statement_timeout=500) + def origin_metadata_get( + self, + origin_url: str, + authority: Dict[str, str], + after: Optional[datetime.datetime] = None, + limit: Optional[int] = None, + db=None, + cur=None, + ) -> List[Dict[str, Any]]: + authority_id = db.metadata_authority_get_id( + authority["type"], authority["url"], cur + ) + if not authority_id: + return [] + results = [] + for line in db.origin_metadata_get(origin_url, authority_id, after, limit, cur): + row = dict(zip(db.origin_metadata_get_cols, line)) + results.append( + { + "origin_url": row.pop("origin.url"), + "authority": { + "type": row.pop("metadata_authority.type"), + "url": row.pop("metadata_authority.url"), + }, + "fetcher": { + "name": row.pop("metadata_fetcher.name"), + "version": row.pop("metadata_fetcher.version"), + }, + **row, + } + ) + return results @timed @db_transaction() - def tool_add(self, tools, db=None, cur=None): - db.mktemp_tool(cur) - with 
convert_validation_exceptions(): - db.copy_to(tools, "tmp_tool", ["name", "version", "configuration"], cur) - tools = db.tool_add_from_temp(cur) - - results = [dict(zip(db.tool_cols, line)) for line in tools] - send_metric("tool:add", count=len(results), method_name="tool_add") - return results + def metadata_fetcher_add( + self, name: str, version: str, metadata: Dict[str, Any], db=None, cur=None + ) -> None: + db.metadata_fetcher_add(name, version, metadata) + send_metric("metadata_fetcher:add", count=1, method_name="metadata_fetcher") @timed @db_transaction(statement_timeout=500) - def tool_get(self, tool, db=None, cur=None): - tool_conf = tool["configuration"] - if isinstance(tool_conf, dict): - tool_conf = json.dumps(tool_conf) - - idx = db.tool_get(tool["name"], tool["version"], tool_conf) - if not idx: + def metadata_fetcher_get( + self, name: str, version: str, db=None, cur=None + ) -> Optional[Dict[str, Any]]: + row = db.metadata_fetcher_get(name, version) + if not row: return None - return dict(zip(db.tool_cols, idx)) - - @timed - @db_transaction() - def metadata_provider_add( - self, provider_name, provider_type, provider_url, metadata, db=None, cur=None - ): - result = db.metadata_provider_add( - provider_name, provider_type, provider_url, metadata, cur - ) - send_metric("metadata_provider:add", count=1, method_name="metadata_provider") - return result + return dict(zip(db.metadata_fetcher_cols, row)) @timed @db_transaction() - def metadata_provider_get(self, provider_id, db=None, cur=None): - result = db.metadata_provider_get(provider_id) - if not result: - return None - return dict(zip(db.metadata_provider_cols, result)) + def metadata_authority_add( + self, type: str, url: str, metadata: Dict[str, Any], db=None, cur=None + ) -> None: + db.metadata_authority_add(type, url, metadata, cur) + send_metric("metadata_authority:add", count=1, method_name="metadata_authority") @timed @db_transaction() - def metadata_provider_get_by(self, provider, db=None, 
cur=None): - result = db.metadata_provider_get_by( - provider["provider_name"], provider["provider_url"] - ) - if not result: + def metadata_authority_get( + self, type: str, url: str, db=None, cur=None + ) -> Optional[Dict[str, Any]]: + row = db.metadata_authority_get(type, url) + if not row: return None - return dict(zip(db.metadata_provider_cols, result)) + return dict(zip(db.metadata_authority_cols, row)) @timed def diff_directories(self, from_dir, to_dir, track_renaming=False): diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py --- a/swh/storage/tests/conftest.py +++ b/swh/storage/tests/conftest.py @@ -248,7 +248,7 @@ "release": [data.release, data.release2, data.release3], "snapshot": [data.snapshot], "origin": [data.origin, data.origin2], - "tool": [data.metadata_tool], - "provider": [data.provider], + "fetcher": [data.metadata_fetcher], + "authority": [data.metadata_authority], "origin_metadata": [data.origin_metadata, data.origin_metadata2], } diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -326,17 +326,16 @@ origins = (origin, origin2) -provider = { - "name": "hal", - "type": "deposit-client", - "url": "http:///hal/inria", +metadata_authority = { + "type": "deposit", + "url": "http:///hal.inria.example.com/", "metadata": {"location": "France"}, } -metadata_tool = { +metadata_fetcher = { "name": "swh-deposit", "version": "0.0.1", - "configuration": {"sword_version": "2"}, + "metadata": {"sword_version": "2"}, } date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) @@ -456,22 +455,36 @@ } origin_metadata = { - "origin": origin, + "origin_url": origin["url"], "discovery_date": datetime.datetime( 2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc ), - "provider": provider, - "tool": "swh-deposit", - "metadata": {"name": "test_origin_metadata", "version": "0.0.1"}, + "authority": { + "type": 
metadata_authority["type"], + "url": metadata_authority["url"], + }, + "fetcher": { + "name": metadata_fetcher["name"], + "version": metadata_fetcher["version"], + }, + "format": "json", + "metadata": b'{"foo": "bar"}', } origin_metadata2 = { - "origin": origin, + "origin_url": origin["url"], "discovery_date": datetime.datetime( 2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc ), - "provider": provider, - "tool": "swh-deposit", - "metadata": {"name": "test_origin_metadata", "version": "0.0.1"}, + "authority": { + "type": metadata_authority["type"], + "url": metadata_authority["url"], + }, + "fetcher": { + "name": metadata_fetcher["name"], + "version": metadata_fetcher["version"], + }, + "format": "yaml", + "metadata": b"foo: bar", } person = { diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py --- a/swh/storage/tests/test_cassandra.py +++ b/swh/storage/tests/test_cassandra.py @@ -335,30 +335,6 @@ def test_person_get(self): pass - @pytest.mark.skip("Not yet implemented") - def test_metadata_provider_add(self): - pass - - @pytest.mark.skip("Not yet implemented") - def test_metadata_provider_get(self): - pass - - @pytest.mark.skip("Not yet implemented") - def test_metadata_provider_get_by(self): - pass - - @pytest.mark.skip("Not yet implemented") - def test_origin_metadata_add(self): - pass - - @pytest.mark.skip("Not yet implemented") - def test_origin_metadata_get(self): - pass - - @pytest.mark.skip("Not yet implemented") - def test_origin_metadata_get_by_provider_type(self): - pass - @pytest.mark.skip("Not supported by Cassandra") def test_origin_count(self): pass diff --git a/swh/storage/tests/test_in_memory.py b/swh/storage/tests/test_in_memory.py --- a/swh/storage/tests/test_in_memory.py +++ b/swh/storage/tests/test_in_memory.py @@ -5,6 +5,7 @@ import pytest +from swh.storage.in_memory import SortedList from swh.storage.tests.test_storage import TestStorage, TestStorageGeneratedData # noqa @@ -19,3 +20,51 @@ "cls": "memory", 
"journal_writer": {"cls": "memory",}, } + + +parametrize = pytest.mark.parametrize( + "items", + [ + [1, 2, 3, 4, 5, 6, 10, 100], + [10, 100, 6, 5, 4, 3, 2, 1], + [10, 4, 5, 6, 1, 2, 3, 100], + ], +) + + +@parametrize +def test_sorted_list_iter(items): + list1 = SortedList() + for item in items: + list1.add(item) + assert list(list1) == sorted(items) + + list2 = SortedList(items) + assert list(list2) == sorted(items) + + +@parametrize +def test_sorted_list_iter__key(items): + list1 = SortedList(key=lambda item: -item) + for item in items: + list1.add(item) + assert list(list1) == list(reversed(sorted(items))) + + list2 = SortedList(items, key=lambda item: -item) + assert list(list2) == list(reversed(sorted(items))) + + +@parametrize +def test_sorted_list_iter_from(items): + list_ = SortedList(items) + for split in items: + expected = sorted(item for item in items if item >= split) + assert list(list_.iter_from(split)) == expected, split + + +@parametrize +def test_sorted_list_iter_from__key(items): + list_ = SortedList(items, key=lambda item: -item) + for split in items: + expected = reversed(sorted(item for item in items if item <= split)) + assert list(list_.iter_from(-split)) == list(expected), split diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Dict from unittest.mock import call import psycopg2 @@ -427,164 +426,162 @@ ) -def test_retrying_proxy_storage_tool_add(swh_storage, sample_data): - """Standard tool_add works as before +def test_retrying_proxy_storage_metadata_fetcher_add(swh_storage, sample_data): + """Standard metadata_fetcher_add works as before """ - sample_tool = sample_data["tool"][0] + fetcher = sample_data["fetcher"][0] - tool = swh_storage.tool_get(sample_tool) - assert not tool + 
metadata_fetcher = swh_storage.metadata_fetcher_get( + fetcher["name"], fetcher["version"] + ) + assert not metadata_fetcher - tools = swh_storage.tool_add([sample_tool]) - assert tools - tool = tools[0] - tool.pop("id") - assert tool == sample_tool + swh_storage.metadata_fetcher_add(**fetcher) - tool = swh_storage.tool_get(sample_tool) - tool.pop("id") - assert tool == sample_tool + actual_fetcher = swh_storage.metadata_fetcher_get( + fetcher["name"], fetcher["version"] + ) + assert actual_fetcher == fetcher -def test_retrying_proxy_storage_tool_add_with_retry( +def test_retrying_proxy_storage_metadata_fetcher_add_with_retry( swh_storage, sample_data, mocker, fake_hash_collision ): """Multiple retries for hash collision and psycopg2 error but finally ok """ - sample_tool = sample_data["tool"][0] - mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add") + fetcher = sample_data["fetcher"][0] + mock_memory = mocker.patch( + "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add" + ) mock_memory.side_effect = [ # first try goes ko fake_hash_collision, # second try goes ko - psycopg2.IntegrityError("tool already inserted"), + psycopg2.IntegrityError("metadata_fetcher already inserted"), # ok then! 
- [sample_tool], + [fetcher], ] mock_sleep = mocker.patch( - "swh.storage.retry.RetryingProxyStorage" ".tool_add.retry.sleep" + "swh.storage.retry.RetryingProxyStorage" ".metadata_fetcher_add.retry.sleep" ) - tool = swh_storage.tool_get(sample_tool) - assert not tool + actual_fetcher = swh_storage.metadata_fetcher_get( + fetcher["name"], fetcher["version"] + ) + assert not actual_fetcher - tools = swh_storage.tool_add([sample_tool]) - assert tools == [sample_tool] + swh_storage.metadata_fetcher_add(**fetcher) mock_memory.assert_has_calls( - [call([sample_tool]), call([sample_tool]), call([sample_tool]),] + [ + call(fetcher["name"], fetcher["version"], fetcher["metadata"]), + call(fetcher["name"], fetcher["version"], fetcher["metadata"]), + call(fetcher["name"], fetcher["version"], fetcher["metadata"]), + ] ) assert mock_sleep.call_count == 2 -def test_retrying_proxy_swh_storage_tool_add_failure(swh_storage, sample_data, mocker): +def test_retrying_proxy_swh_storage_metadata_fetcher_add_failure( + swh_storage, sample_data, mocker +): """Unfiltered errors are raising without retry """ - mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.tool_add") - mock_memory.side_effect = StorageArgumentException("Refuse to add tool always!") + mock_memory = mocker.patch( + "swh.storage.in_memory.InMemoryStorage.metadata_fetcher_add" + ) + mock_memory.side_effect = StorageArgumentException( + "Refuse to add metadata_fetcher always!" 
+ ) - sample_tool = sample_data["tool"][0] + fetcher = sample_data["fetcher"][0] - tool = swh_storage.tool_get(sample_tool) - assert not tool + actual_fetcher = swh_storage.metadata_fetcher_get( + fetcher["name"], fetcher["version"] + ) + assert not actual_fetcher with pytest.raises(StorageArgumentException, match="Refuse to add"): - swh_storage.tool_add([sample_tool]) + swh_storage.metadata_fetcher_add(**fetcher) assert mock_memory.call_count == 1 -def to_provider(provider: Dict) -> Dict: - return { - "provider_name": provider["name"], - "provider_url": provider["url"], - "provider_type": provider["type"], - "metadata": provider["metadata"], - } - - -def test_retrying_proxy_storage_metadata_provider_add(swh_storage, sample_data): - """Standard metadata_provider_add works as before +def test_retrying_proxy_storage_metadata_authority_add(swh_storage, sample_data): + """Standard metadata_authority_add works as before """ - provider = sample_data["provider"][0] - provider_get = to_provider(provider) + authority = sample_data["authority"][0] - provider = swh_storage.metadata_provider_get_by(provider_get) - assert not provider + assert not swh_storage.metadata_authority_get(authority["type"], authority["url"]) - provider_id = swh_storage.metadata_provider_add(**provider_get) - assert provider_id + swh_storage.metadata_authority_add(**authority) - actual_provider = swh_storage.metadata_provider_get(provider_id) - assert actual_provider - actual_provider_id = actual_provider.pop("id") - assert actual_provider_id == provider_id - assert actual_provider == provider_get + actual_authority = swh_storage.metadata_authority_get( + authority["type"], authority["url"] + ) + assert actual_authority == authority -def test_retrying_proxy_storage_metadata_provider_add_with_retry( +def test_retrying_proxy_storage_metadata_authority_add_with_retry( swh_storage, sample_data, mocker, fake_hash_collision ): """Multiple retries for hash collision and psycopg2 error but finally ok """ - 
provider = sample_data["provider"][0] - provider_get = to_provider(provider) + authority = sample_data["authority"][0] mock_memory = mocker.patch( - "swh.storage.in_memory.InMemoryStorage.metadata_provider_add" + "swh.storage.in_memory.InMemoryStorage.metadata_authority_add" ) mock_memory.side_effect = [ # first try goes ko fake_hash_collision, # second try goes ko - psycopg2.IntegrityError("provider_id already inserted"), + psycopg2.IntegrityError("foo bar"), # ok then! - "provider_id", + None, ] + from swh.storage.retry import RetryingProxyStorage # noqa + mock_sleep = mocker.patch( - "swh.storage.retry.RetryingProxyStorage" ".metadata_provider_add.retry.sleep" + "swh.storage.retry.RetryingProxyStorage" ".metadata_authority_add.retry.sleep" ) - provider = swh_storage.metadata_provider_get_by(provider_get) - assert not provider + assert not swh_storage.metadata_authority_get(authority["type"], authority["url"]) - provider_id = swh_storage.metadata_provider_add(**provider_get) - assert provider_id == "provider_id" + swh_storage.metadata_authority_add(**authority) - provider_arg_names = ("provider_name", "provider_type", "provider_url", "metadata") - provider_args = [provider_get[key] for key in provider_arg_names] + authority_arg_names = ("type", "url", "metadata") + authority_args = [authority[key] for key in authority_arg_names] mock_memory.assert_has_calls( - [call(*provider_args), call(*provider_args), call(*provider_args),] + [call(*authority_args), call(*authority_args), call(*authority_args),] ) assert mock_sleep.call_count == 2 -def test_retrying_proxy_swh_storage_metadata_provider_add_failure( +def test_retrying_proxy_swh_storage_metadata_authority_add_failure( swh_storage, sample_data, mocker ): """Unfiltered errors are raising without retry """ mock_memory = mocker.patch( - "swh.storage.in_memory.InMemoryStorage.metadata_provider_add" + "swh.storage.in_memory.InMemoryStorage.metadata_authority_add" ) mock_memory.side_effect = StorageArgumentException( - 
"Refuse to add provider_id always!" + "Refuse to add authority_id always!" ) - provider = sample_data["provider"][0] - provider_get = to_provider(provider) + authority = sample_data["authority"][0] - provider_id = swh_storage.metadata_provider_get_by(provider_get) - assert not provider_id + swh_storage.metadata_authority_get(authority["type"], authority["url"]) with pytest.raises(StorageArgumentException, match="Refuse to add"): - swh_storage.metadata_provider_add(**provider_get) + swh_storage.metadata_authority_add(**authority) assert mock_memory.call_count == 1 @@ -594,23 +591,20 @@ """ ori_meta = sample_data["origin_metadata"][0] - origin = ori_meta["origin"] - swh_storage.origin_add_one(origin) - provider_get = to_provider(ori_meta["provider"]) - provider_id = swh_storage.metadata_provider_add(**provider_get) + swh_storage.origin_add_one({"url": ori_meta["origin_url"]}) + swh_storage.metadata_authority_add(**sample_data["authority"][0]) + swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0]) - origin_metadata = swh_storage.origin_metadata_get_by(origin["url"]) + origin_metadata = swh_storage.origin_metadata_get( + ori_meta["origin_url"], ori_meta["authority"] + ) assert not origin_metadata - swh_storage.origin_metadata_add( - origin["url"], - ori_meta["discovery_date"], - provider_id, - ori_meta["tool"], - ori_meta["metadata"], - ) + swh_storage.origin_metadata_add(**ori_meta) - origin_metadata = swh_storage.origin_metadata_get_by(origin["url"]) + origin_metadata = swh_storage.origin_metadata_get( + ori_meta["origin_url"], ori_meta["authority"] + ) assert origin_metadata @@ -621,10 +615,9 @@ """ ori_meta = sample_data["origin_metadata"][0] - origin = ori_meta["origin"] - swh_storage.origin_add_one(origin) - provider_get = to_provider(ori_meta["provider"]) - provider_id = swh_storage.metadata_provider_add(**provider_get) + swh_storage.origin_add_one({"url": ori_meta["origin_url"]}) + swh_storage.metadata_authority_add(**sample_data["authority"][0]) + 
swh_storage.metadata_fetcher_add(**sample_data["fetcher"][0]) mock_memory = mocker.patch( "swh.storage.in_memory.InMemoryStorage.origin_metadata_add" ) @@ -633,7 +626,7 @@ # first try goes ko fake_hash_collision, # second try goes ko - psycopg2.IntegrityError("provider_id already inserted"), + psycopg2.IntegrityError("foo bar"), # ok then! None, ] @@ -642,19 +635,35 @@ "swh.storage.retry.RetryingProxyStorage" ".origin_metadata_add.retry.sleep" ) - url = origin["url"] - ts = ori_meta["discovery_date"] - tool_id = ori_meta["tool"] - metadata = ori_meta["metadata"] - # No exception raised as insertion finally came through - swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata) + swh_storage.origin_metadata_add(**ori_meta) mock_memory.assert_has_calls( [ # 3 calls, as long as error raised - call(url, ts, provider_id, tool_id, metadata), - call(url, ts, provider_id, tool_id, metadata), - call(url, ts, provider_id, tool_id, metadata), + call( + ori_meta["origin_url"], + ori_meta["discovery_date"], + ori_meta["authority"], + ori_meta["fetcher"], + ori_meta["format"], + ori_meta["metadata"], + ), + call( + ori_meta["origin_url"], + ori_meta["discovery_date"], + ori_meta["authority"], + ori_meta["fetcher"], + ori_meta["format"], + ori_meta["metadata"], + ), + call( + ori_meta["origin_url"], + ori_meta["discovery_date"], + ori_meta["authority"], + ori_meta["fetcher"], + ori_meta["format"], + ori_meta["metadata"], + ), ] ) assert mock_sleep.call_count == 2 @@ -672,17 +681,10 @@ mock_memory.side_effect = StorageArgumentException("Refuse to add always!") ori_meta = sample_data["origin_metadata"][0] - origin = ori_meta["origin"] - swh_storage.origin_add_one(origin) - - url = origin["url"] - ts = ori_meta["discovery_date"] - provider_id = "provider_id" - tool_id = ori_meta["tool"] - metadata = ori_meta["metadata"] + swh_storage.origin_add_one({"url": ori_meta["origin_url"]}) with pytest.raises(StorageArgumentException, match="Refuse to add"): - 
swh_storage.origin_metadata_add(url, ts, provider_id, tool_id, metadata) + swh_storage.origin_metadata_add(**ori_meta) assert mock_memory.call_count == 1 diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -3170,353 +3170,88 @@ assert expected == ret - def test_tool_add(self, swh_storage): - tool = { - "name": "some-unknown-tool", - "version": "some-version", - "configuration": {"debian-package": "some-package"}, - } - - actual_tool = swh_storage.tool_get(tool) - assert actual_tool is None # does not exist - - # add it - actual_tools = swh_storage.tool_add([tool]) - - assert len(actual_tools) == 1 - actual_tool = actual_tools[0] - assert actual_tool is not None # now it exists - new_id = actual_tool.pop("id") - assert actual_tool == tool - - actual_tools2 = swh_storage.tool_add([tool]) - actual_tool2 = actual_tools2[0] - assert actual_tool2 is not None # now it exists - new_id2 = actual_tool2.pop("id") - - assert new_id == new_id2 - assert actual_tool == actual_tool2 - - def test_tool_add_multiple(self, swh_storage): - tool = { - "name": "some-unknown-tool", - "version": "some-version", - "configuration": {"debian-package": "some-package"}, - } - - actual_tools = list(swh_storage.tool_add([tool])) - assert len(actual_tools) == 1 - - new_tools = [ - tool, - {"name": "yet-another-tool", "version": "version", "configuration": {},}, - ] - - actual_tools = swh_storage.tool_add(new_tools) - assert len(actual_tools) == 2 - - # order not guaranteed, so we iterate over results to check - for tool in actual_tools: - _id = tool.pop("id") - assert _id is not None - assert tool in new_tools - - def test_tool_get_missing(self, swh_storage): - tool = { - "name": "unknown-tool", - "version": "3.1.0rc2-31-ga2cbb8c", - "configuration": {"command_line": "nomossa "}, - } - - actual_tool = swh_storage.tool_get(tool) - - assert actual_tool is None - - def 
test_tool_metadata_get_missing_context(self, swh_storage): - tool = { - "name": "swh-metadata-translator", - "version": "0.0.1", - "configuration": {"context": "unknown-context"}, - } - - actual_tool = swh_storage.tool_get(tool) - - assert actual_tool is None + def test_metadata_fetcher_add_get(self, swh_storage): + actual_fetcher = swh_storage.metadata_fetcher_get( + data.metadata_fetcher["name"], data.metadata_fetcher["version"] + ) + assert actual_fetcher is None # does not exist - def test_tool_metadata_get(self, swh_storage): - tool = { - "name": "swh-metadata-translator", - "version": "0.0.1", - "configuration": {"type": "local", "context": "npm"}, - } - expected_tool = swh_storage.tool_add([tool])[0] + swh_storage.metadata_fetcher_add(**data.metadata_fetcher) - # when - actual_tool = swh_storage.tool_get(tool) + res = swh_storage.metadata_fetcher_get( + data.metadata_fetcher["name"], data.metadata_fetcher["version"] + ) - # then - assert expected_tool == actual_tool + assert res is not data.metadata_fetcher + assert res == data.metadata_fetcher - def test_metadata_provider_get(self, swh_storage): - # given - no_provider = swh_storage.metadata_provider_get(6459456445615) - assert no_provider is None - # when - provider_id = swh_storage.metadata_provider_add( - data.provider["name"], - data.provider["type"], - data.provider["url"], - data.provider["metadata"], + def test_metadata_authority_add_get(self, swh_storage): + actual_authority = swh_storage.metadata_authority_get( + data.metadata_authority["type"], data.metadata_authority["url"] ) + assert actual_authority is None # does not exist - actual_provider = swh_storage.metadata_provider_get(provider_id) - expected_provider = { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - # then - del actual_provider["id"] - assert actual_provider, expected_provider + swh_storage.metadata_authority_add(**data.metadata_authority) - def test_metadata_provider_get_by(self, swh_storage): 
- # given - no_provider = swh_storage.metadata_provider_get_by( - { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - ) - assert no_provider is None - # when - provider_id = swh_storage.metadata_provider_add( - data.provider["name"], - data.provider["type"], - data.provider["url"], - data.provider["metadata"], + res = swh_storage.metadata_authority_get( + data.metadata_authority["type"], data.metadata_authority["url"] ) - actual_provider = swh_storage.metadata_provider_get_by( - { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - ) - # then - assert provider_id, actual_provider["id"] + assert res is not data.metadata_authority + assert res == data.metadata_authority def test_origin_metadata_add(self, swh_storage): - # given origin = data.origin + fetcher = data.metadata_fetcher + authority = data.metadata_authority swh_storage.origin_add([origin])[0] - tools = swh_storage.tool_add([data.metadata_tool]) - tool = tools[0] + swh_storage.metadata_fetcher_add(**fetcher) + swh_storage.metadata_authority_add(**authority) - swh_storage.metadata_provider_add( - data.provider["name"], - data.provider["type"], - data.provider["url"], - data.provider["metadata"], - ) - provider = swh_storage.metadata_provider_get_by( - { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - ) - - # when adding for the same origin 2 metadatas - n_om = len(list(swh_storage.origin_metadata_get_by(origin["url"]))) - swh_storage.origin_metadata_add( - origin["url"], - data.origin_metadata["discovery_date"], - provider["id"], - tool["id"], - data.origin_metadata["metadata"], - ) - swh_storage.origin_metadata_add( - origin["url"], - "2015-01-01 23:00:00+00", - provider["id"], - tool["id"], - data.origin_metadata2["metadata"], - ) - n_actual_om = len(list(swh_storage.origin_metadata_get_by(origin["url"]))) - # then - assert n_actual_om == n_om + 2 + print(origin) + print(data.origin_metadata) 
+ swh_storage.origin_metadata_add(**data.origin_metadata) + swh_storage.origin_metadata_add(**data.origin_metadata2) - def test_origin_metadata_get(self, swh_storage): - # given - origin_url = data.origin["url"] - origin_url2 = data.origin2["url"] - swh_storage.origin_add([data.origin]) - swh_storage.origin_add([data.origin2]) + swh_storage.origin_metadata_get(origin["url"], authority) - swh_storage.metadata_provider_add( - data.provider["name"], - data.provider["type"], - data.provider["url"], - data.provider["metadata"], - ) - provider = swh_storage.metadata_provider_get_by( - { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - ) - tool = swh_storage.tool_add([data.metadata_tool])[0] - # when adding for the same origin 2 metadatas - swh_storage.origin_metadata_add( - origin_url, - data.origin_metadata["discovery_date"], - provider["id"], - tool["id"], - data.origin_metadata["metadata"], - ) - swh_storage.origin_metadata_add( - origin_url2, - data.origin_metadata2["discovery_date"], - provider["id"], - tool["id"], - data.origin_metadata2["metadata"], - ) - swh_storage.origin_metadata_add( - origin_url, - data.origin_metadata2["discovery_date"], - provider["id"], - tool["id"], - data.origin_metadata2["metadata"], - ) - all_metadatas = list( + assert [data.origin_metadata, data.origin_metadata2] == list( sorted( - swh_storage.origin_metadata_get_by(origin_url), + swh_storage.origin_metadata_get(origin["url"], authority), key=lambda x: x["discovery_date"], ) ) - metadatas_for_origin2 = list(swh_storage.origin_metadata_get_by(origin_url2)) - expected_results = [ - { - "origin_url": origin_url, - "discovery_date": datetime.datetime( - 2015, 1, 1, 23, 0, tzinfo=datetime.timezone.utc - ), - "metadata": {"name": "test_origin_metadata", "version": "0.0.1"}, - "provider_id": provider["id"], - "provider_name": "hal", - "provider_type": "deposit-client", - "provider_url": "http:///hal/inria", - "tool_id": tool["id"], - }, - { - 
"origin_url": origin_url, - "discovery_date": datetime.datetime( - 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc - ), - "metadata": {"name": "test_origin_metadata", "version": "0.0.1"}, - "provider_id": provider["id"], - "provider_name": "hal", - "provider_type": "deposit-client", - "provider_url": "http:///hal/inria", - "tool_id": tool["id"], - }, - ] - # then - assert len(all_metadatas) == 2 - assert len(metadatas_for_origin2) == 1 - assert all_metadatas == expected_results - - def test_metadata_provider_add(self, swh_storage): - provider = { - "provider_name": "swMATH", - "provider_type": "registry", - "provider_url": "http://www.swmath.org/", - "metadata": { - "email": "contact@swmath.org", - "license": "All rights reserved", - }, - } - provider["id"] = provider_id = swh_storage.metadata_provider_add(**provider) - assert provider == swh_storage.metadata_provider_get_by( - {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"} - ) - assert provider == swh_storage.metadata_provider_get(provider_id) - - def test_origin_metadata_get_by_provider_type(self, swh_storage): - # given - origin_url = data.origin["url"] + def test_origin_metadata_get(self, swh_storage): + authority = data.metadata_authority + fetcher = data.metadata_fetcher + origin_url1 = data.origin["url"] origin_url2 = data.origin2["url"] swh_storage.origin_add([data.origin]) swh_storage.origin_add([data.origin2]) - provider1_id = swh_storage.metadata_provider_add( - data.provider["name"], - data.provider["type"], - data.provider["url"], - data.provider["metadata"], - ) - provider1 = swh_storage.metadata_provider_get_by( - { - "provider_name": data.provider["name"], - "provider_url": data.provider["url"], - } - ) - assert provider1 == swh_storage.metadata_provider_get(provider1_id) - provider2_id = swh_storage.metadata_provider_add( - "swMATH", - "registry", - "http://www.swmath.org/", - {"email": "contact@swmath.org", "license": "All rights reserved"}, - ) - provider2 = 
swh_storage.metadata_provider_get_by( - {"provider_name": "swMATH", "provider_url": "http://www.swmath.org/"} - ) - assert provider2 == swh_storage.metadata_provider_get(provider2_id) + origin1_metadata1 = data.origin_metadata + origin1_metadata2 = data.origin_metadata2 + origin2_metadata = {**data.origin_metadata2, "origin_url": origin_url2} - # using the only tool now inserted in the data.sql, but for this - # provider should be a crawler tool (not yet implemented) - tool = swh_storage.tool_add([data.metadata_tool])[0] + swh_storage.metadata_authority_add(**authority) + swh_storage.metadata_fetcher_add(**fetcher) - # when adding for the same origin 2 metadatas - swh_storage.origin_metadata_add( - origin_url, - data.origin_metadata["discovery_date"], - provider1["id"], - tool["id"], - data.origin_metadata["metadata"], - ) - swh_storage.origin_metadata_add( - origin_url2, - data.origin_metadata2["discovery_date"], - provider2["id"], - tool["id"], - data.origin_metadata2["metadata"], - ) - provider_type = "registry" - m_by_provider = list( - swh_storage.origin_metadata_get_by(origin_url2, provider_type) - ) - for item in m_by_provider: - if "id" in item: - del item["id"] - expected_results = [ - { - "origin_url": origin_url2, - "discovery_date": datetime.datetime( - 2017, 1, 1, 23, 0, tzinfo=datetime.timezone.utc - ), - "metadata": {"name": "test_origin_metadata", "version": "0.0.1"}, - "provider_id": provider2["id"], - "provider_name": "swMATH", - "provider_type": provider_type, - "provider_url": "http://www.swmath.org/", - "tool_id": tool["id"], - } - ] - # then + swh_storage.origin_metadata_add(**origin1_metadata1) + swh_storage.origin_metadata_add(**origin1_metadata2) + swh_storage.origin_metadata_add(**origin2_metadata) - assert len(m_by_provider) == 1 - assert m_by_provider == expected_results + assert [origin1_metadata1, origin1_metadata2] == list( + sorted( + swh_storage.origin_metadata_get(origin_url1, authority), + key=lambda x: x["discovery_date"], + ) + 
) + + assert [origin2_metadata] == list( + swh_storage.origin_metadata_get(origin_url2, authority) + ) class TestStorageGeneratedData: