D2609.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
-swh.core[db,http] >= 0.0.65
+swh.core[db,http] >= 0.0.87
swh.model >= 0.0.15
swh.objstorage >= 0.0.28
swh.scheduler >= 0.0.47
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -9,7 +9,6 @@
from collections import defaultdict
-from swh.core.api import remote_api_endpoint
from swh.storage.common import db_transaction_generator, db_transaction
from swh.storage.exc import StorageDBError
from .db import Db
@@ -108,10 +107,8 @@
if db is not self._db:
db.put_conn()
- @remote_api_endpoint('check_config')
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
- """Check that the storage is configured and ready to go."""
# Check permissions on one of the tables
if check_write:
check = 'INSERT'
@@ -124,22 +121,8 @@
)
return cur.fetchone()[0]
- @remote_api_endpoint('content_mimetype/missing')
@db_transaction_generator()
def content_mimetype_missing(self, mimetypes, db=None, cur=None):
- """Generate mimetypes missing from storage.
-
- Args:
- mimetypes (iterable): iterable of dict with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute the
- results
-
- Yields:
- tuple (id, indexer_configuration_id): missing id
-
- """
for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
@@ -147,34 +130,6 @@
indexer_configuration_id, limit=1000,
with_textual_data=False,
db=None, cur=None):
- """Retrieve ids of type content_type within range [start, end] bound
- by limit.
-
- Args:
- **content_type** (str): content's type (mimetype, language, etc...)
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
- **with_textual_data** (bool): Deal with only textual
- content (True) or all
- content (all contents by
- defaults, False)
-
- Raises:
- ValueError for;
- - limit to None
- - wrong content_type provided
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
if content_type not in db.content_indexer_names:
@@ -199,53 +154,16 @@
'next': next_id
}
- @remote_api_endpoint('content_mimetype/range')
@db_transaction()
def content_mimetype_get_range(self, start, end, indexer_configuration_id,
limit=1000, db=None, cur=None):
- """Retrieve mimetypes within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._content_get_range('mimetype', start, end,
indexer_configuration_id, limit=limit,
db=db, cur=cur)
- @remote_api_endpoint('content_mimetype/add')
@db_transaction()
def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
cur=None):
- """Add mimetypes not present in storage.
-
- Args:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **indexer_configuration_id** (int): tool's id used to
- compute the results
- - **conflict_update** (bool): Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
-
- """
_check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m['id'])
db.mktemp_content_mimetype(cur)
@@ -254,84 +172,26 @@
cur)
db.content_mimetype_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content_mimetype')
@db_transaction_generator()
def content_mimetype_get(self, ids, db=None, cur=None):
- """Retrieve full content mimetype per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **tool** (dict): Tool used to compute the language
-
- """
for c in db.content_mimetype_get_from_list(ids, cur):
yield converters.db_to_mimetype(
dict(zip(db.content_mimetype_cols, c)))
- @remote_api_endpoint('content_language/missing')
@db_transaction_generator()
def content_language_missing(self, languages, db=None, cur=None):
- """List languages missing from storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
for obj in db.content_language_missing_from_list(languages, cur):
yield obj[0]
- @remote_api_endpoint('content_language')
@db_transaction_generator()
def content_language_get(self, ids, db=None, cur=None):
- """Retrieve full content language per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **lang** (bytes): raw content's language
- - **tool** (dict): Tool used to compute the language
-
- """
for c in db.content_language_get_from_list(ids, cur):
yield converters.db_to_language(
dict(zip(db.content_language_cols, c)))
- @remote_api_endpoint('content_language/add')
@db_transaction()
def content_language_add(self, languages, conflict_update=False, db=None,
cur=None):
- """Add languages not present in storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **lang** (bytes): language detected
-
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
- """
_check_id_duplicates(languages)
languages.sort(key=lambda m: m['id'])
db.mktemp_content_language(cur)
@@ -347,62 +207,19 @@
db.content_language_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/ctags/missing')
@db_transaction_generator()
def content_ctags_missing(self, ctags, db=None, cur=None):
- """List ctags missing from storage.
-
- Args:
- ctags (iterable): dicts with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
for obj in db.content_ctags_missing_from_list(ctags, cur):
yield obj[0]
- @remote_api_endpoint('content/ctags')
@db_transaction_generator()
def content_ctags_get(self, ids, db=None, cur=None):
- """Retrieve ctags per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- Dictionaries with keys:
-
- - **id** (bytes): content's identifier
- - **name** (str): symbol's name
- - **kind** (str): symbol's kind
- - **lang** (str): language for that content
- - **tool** (dict): tool used to compute the ctags' info
-
-
- """
for c in db.content_ctags_get_from_list(ids, cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
- @remote_api_endpoint('content/ctags/add')
@db_transaction()
def content_ctags_add(self, ctags, conflict_update=False, db=None,
cur=None):
- """Add ctags not present in storage
-
- Args:
- ctags (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **ctags** ([list): List of dictionary with keys: name, kind,
- line, lang
-
- """
_check_id_duplicates(ctags)
ctags.sort(key=lambda m: m['id'])
@@ -422,41 +239,15 @@
db.content_ctags_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/ctags/search')
@db_transaction_generator()
def content_ctags_search(self, expression,
limit=10, last_sha1=None, db=None, cur=None):
- """Search through content's raw ctags symbols.
-
- Args:
- expression (str): Expression to search for
- limit (int): Number of rows to return (default to 10).
- last_sha1 (str): Offset from which retrieving data (default to '').
-
- Yields:
- rows of ctags including id, name, lang, kind, line, etc...
-
- """
for obj in db.content_ctags_search(expression, last_sha1, limit,
cur=cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
- @remote_api_endpoint('content/fossology_license')
@db_transaction_generator()
def content_fossology_license_get(self, ids, db=None, cur=None):
- """Retrieve licenses per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dict: ``{id: facts}`` where ``facts`` is a dict with the
- following keys:
-
- - **licenses** ([str]): associated licenses for that content
- - **tool** (dict): Tool used to compute the license
-
- """
d = defaultdict(list)
for c in db.content_fossology_license_get_from_list(ids, cur):
license = dict(zip(db.content_fossology_license_cols, c))
@@ -467,26 +258,9 @@
for id_, facts in d.items():
yield {id_: facts}
- @remote_api_endpoint('content/fossology_license/add')
@db_transaction()
def content_fossology_license_add(self, licenses, conflict_update=False,
db=None, cur=None):
- """Add licenses not present in storage.
-
- Args:
- licenses (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **licenses** ([bytes]): List of licenses associated to sha1
- - **tool** (str): nomossa
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- Returns:
- list: content_license entries which failed due to unknown licenses
-
- """
_check_id_duplicates(licenses)
licenses.sort(key=lambda m: m['id'])
db.mktemp_content_fossology_license(cur)
@@ -502,90 +276,28 @@
cur=cur)
db.content_fossology_license_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/fossology_license/range')
@db_transaction()
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id,
limit=1000, db=None, cur=None):
- """Retrieve licenses within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._content_get_range('fossology_license', start, end,
indexer_configuration_id, limit=limit,
with_textual_data=True, db=db, cur=cur)
- @remote_api_endpoint('content_metadata/missing')
@db_transaction_generator()
def content_metadata_missing(self, metadata, db=None, cur=None):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing sha1s
-
- """
for obj in db.content_metadata_missing_from_list(metadata, cur):
yield obj[0]
- @remote_api_endpoint('content_metadata')
@db_transaction_generator()
def content_metadata_get(self, ids, db=None, cur=None):
- """Retrieve metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- id (bytes)
- metadata (str): associated metadata
- tool (dict): tool used to compute metadata
-
- """
for c in db.content_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.content_metadata_cols, c)))
- @remote_api_endpoint('content_metadata/add')
@db_transaction()
def content_metadata_add(self, metadata, conflict_update=False, db=None,
cur=None):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **metadata**: arbitrary dict
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -596,67 +308,21 @@
cur)
db.content_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('revision_intrinsic_metadata/missing')
@db_transaction_generator()
def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1_git revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing ids
-
- """
for obj in db.revision_intrinsic_metadata_missing_from_list(
metadata, cur):
yield obj[0]
- @remote_api_endpoint('revision_intrinsic_metadata')
@db_transaction_generator()
def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
- """Retrieve revision metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- : dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.revision_intrinsic_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.revision_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('revision_intrinsic_metadata/add')
@db_transaction()
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False,
db=None, cur=None):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1_git of revision
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -668,66 +334,20 @@
cur)
db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('revision_intrinsic_metadata/delete')
@db_transaction()
def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None):
- """Remove revision metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (bytes): revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
db.revision_intrinsic_metadata_delete(entries, cur)
- @remote_api_endpoint('origin_intrinsic_metadata')
@db_transaction_generator()
def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
- """Retrieve origin metadata per id.
-
- Args:
- ids (iterable): origin identifiers
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('origin_intrinsic_metadata/add')
@db_transaction()
def origin_intrinsic_metadata_add(self, metadata,
conflict_update=False, db=None,
cur=None):
- """Add origin metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -740,81 +360,24 @@
cur)
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('origin_intrinsic_metadata/delete')
@db_transaction()
def origin_intrinsic_metadata_delete(
self, entries, db=None, cur=None):
- """Remove origin metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
db.origin_intrinsic_metadata_delete(entries, cur)
- @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext')
@db_transaction_generator()
def origin_intrinsic_metadata_search_fulltext(
self, conjunction, limit=100, db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- conjunction (List[str]): List of terms to be searched for.
- limit (int): The maximum number of results to return
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.origin_intrinsic_metadata_search_fulltext(
conjunction, limit=limit, cur=cur):
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
@db_transaction()
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- page_token (str): Opaque token used for pagination.
- limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin urls are
- returned or the content as well
- mappings (List[str]): Returns origins whose intrinsic metadata
- were generated using at least one of these mappings.
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieving the next page. If absent, there is
- no more pages to gather.
- - **origins** (list): list of origin url (str) if `ids_only=True`
- else dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
assert isinstance(page_token, str)
# we go to limit+1 to check whether we should add next_page_token in
# the response
@@ -834,25 +397,9 @@
result['next_page_token'] = result['origins'][-1]['id']
return result
- @remote_api_endpoint('origin_intrinsic_metadata/stats')
@db_transaction()
def origin_intrinsic_metadata_stats(
self, db=None, cur=None):
- """Returns counts of indexed metadata per origins, broken down
- into metadata types.
-
- Returns:
- dict: dictionary with keys:
-
- - total (int): total number of origins that were indexed
- (possibly yielding an empty metadata dictionary)
- - non_empty (int): total number of origins that we extracted
- a non-empty metadata dictionary from
- - per_mapping (dict): a dictionary with mapping names as
- keys and number of origins whose indexing used this
- mapping. Note that indexing a given origin may use
- 0, 1, or many mappings.
- """
mapping_names = [m for m in MAPPING_NAMES]
select_parts = []
@@ -880,26 +427,8 @@
'per_mapping': results,
}
- @remote_api_endpoint('indexer_configuration/add')
@db_transaction_generator()
def indexer_configuration_add(self, tools, db=None, cur=None):
- """Add new tools to the storage.
-
- Args:
- tools ([dict]): List of dictionary representing tool to
- insert in the db. Dictionary with the following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- List of dict inserted in the db (holding the id key as
- well). The order of the list is not guaranteed to match
- the order of the initial list.
-
- """
db.mktemp_indexer_configuration(cur)
db.copy_to(tools, 'tmp_indexer_configuration',
['tool_name', 'tool_version', 'tool_configuration'],
@@ -909,24 +438,8 @@
for line in tools:
yield dict(zip(db.indexer_configuration_cols, line))
- @remote_api_endpoint('indexer_configuration/data')
@db_transaction()
def indexer_configuration_get(self, tool, db=None, cur=None):
- """Retrieve tool information.
-
- Args:
- tool (dict): Dictionary representing a tool with the
- following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- The same dictionary with an `id` key, None otherwise.
-
- """
tool_conf = tool['tool_configuration']
if isinstance(tool_conf, dict):
tool_conf = json.dumps(tool_conf)
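
These hunks strip the `remote_api_endpoint` decorators and docstrings from the concrete `IndexerStorage`; both now live on the new `IndexerStorageInterface` (see the interface.py diff further down). A minimal sketch of the resulting split, using hypothetical names (`ExampleStorageInterface` and `ExampleStorage` are made up; `remote_api_endpoint` is the real swh.core decorator used throughout this diff):

    from swh.core.api import remote_api_endpoint

    class ExampleStorageInterface:
        @remote_api_endpoint('example/get')
        def example_get(self, ids):
            """Docstrings and endpoint names live on the interface."""
            ...

    class ExampleStorage:
        # Backends do not have to inherit from the interface; conformance
        # is checked by comparing method signatures (see test_types below).
        def example_get(self, ids):
            return [{'id': id_} for id_ in ids]
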
diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py
--- a/swh/indexer/storage/api/client.py
+++ b/swh/indexer/storage/api/client.py
@@ -7,11 +7,11 @@
from swh.storage.exc import StorageAPIError
-from .. import IndexerStorage
+from ..interface import IndexerStorageInterface
class RemoteStorage(RPCClient):
"""Proxy to a remote storage API"""
- backend_class = IndexerStorage
+ backend_class = IndexerStorageInterface
api_exception = StorageAPIError
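
With `backend_class` now pointing at the interface rather than the concrete storage, `RPCClient` generates one proxy method per `@remote_api_endpoint` declaration it finds there. A hedged usage sketch; the URL and the example content id are illustrative:

    from swh.indexer.storage.api.client import RemoteStorage

    storage = RemoteStorage(url='http://localhost:5007/')
    missing = list(storage.content_mimetype_missing([
        {'id': bytes.fromhex('34' * 20), 'indexer_configuration_id': 1},
    ]))
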
diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py
--- a/swh/indexer/storage/api/server.py
+++ b/swh/indexer/storage/api/server.py
@@ -10,8 +10,9 @@
from swh.core.api import (RPCServerApp, error_handler,
encode_data_server as encode_data)
from swh.indexer.storage import (
- get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage
+ get_indexer_storage, INDEXER_CFG_KEY
)
+from swh.indexer.storage.interface import IndexerStorageInterface
def get_storage():
@@ -23,7 +24,7 @@
app = RPCServerApp(__name__,
- backend_class=IndexerStorage,
+ backend_class=IndexerStorageInterface,
backend_factory=get_storage)
storage = None
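
The server app is likewise built from the interface, so its HTTP routes are derived from the `@remote_api_endpoint` declarations instead of from the concrete backend. Since `RPCServerApp` subclasses Flask, a development instance can in principle be launched directly, assuming `get_storage` can load a configuration first:

    from swh.indexer.storage.api.server import app

    if __name__ == '__main__':
        app.run(host='127.0.0.1', port=5007, debug=True)
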
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -203,168 +203,36 @@
return True
def content_mimetype_missing(self, mimetypes):
- """Generate mimetypes missing from storage.
-
- Args:
- mimetypes (iterable): iterable of dict with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute the
- results
-
- Yields:
- tuple (id, indexer_configuration_id): missing id
-
- """
yield from self._mimetypes.missing(mimetypes)
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
- """Retrieve mimetypes within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._mimetypes.get_range(
start, end, indexer_configuration_id, limit)
def content_mimetype_add(self, mimetypes, conflict_update=False):
- """Add mimetypes not present in storage.
-
- Args:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **indexer_configuration_id** (int): tool's id used to
- compute the results
- - **conflict_update** (bool): Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
-
- """
if not all(isinstance(x['id'], bytes) for x in mimetypes):
raise TypeError('identifiers must be bytes.')
self._mimetypes.add(mimetypes, conflict_update)
- def content_mimetype_get(self, ids, db=None, cur=None):
- """Retrieve full content mimetype per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **tool** (dict): Tool used to compute the language
-
- """
+ def content_mimetype_get(self, ids):
yield from self._mimetypes.get(ids)
def content_language_missing(self, languages):
- """List languages missing from storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
yield from self._languages.missing(languages)
def content_language_get(self, ids):
- """Retrieve full content language per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **lang** (bytes): raw content's language
- - **tool** (dict): Tool used to compute the language
-
- """
yield from self._languages.get(ids)
def content_language_add(self, languages, conflict_update=False):
- """Add languages not present in storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **lang** (bytes): language detected
-
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
- """
if not all(isinstance(x['id'], bytes) for x in languages):
raise TypeError('identifiers must be bytes.')
self._languages.add(languages, conflict_update)
def content_ctags_missing(self, ctags):
- """List ctags missing from storage.
-
- Args:
- ctags (iterable): dicts with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
yield from self._content_ctags.missing(ctags)
def content_ctags_get(self, ids):
- """Retrieve ctags per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- Dictionaries with keys:
-
- - **id** (bytes): content's identifier
- - **name** (str): symbol's name
- - **kind** (str): symbol's kind
- - **lang** (str): language for that content
- - **tool** (dict): tool used to compute the ctags' info
-
-
- """
for item in self._content_ctags.get(ids):
for item_ctags_item in item['ctags']:
yield {
@@ -374,35 +242,12 @@
}
def content_ctags_add(self, ctags, conflict_update=False):
- """Add ctags not present in storage
-
- Args:
- ctags (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **ctags** ([list): List of dictionary with keys: name, kind,
- line, lang
- - **indexer_configuration_id**: tool used to compute the
- results
-
- """
if not all(isinstance(x['id'], bytes) for x in ctags):
raise TypeError('identifiers must be bytes.')
self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
- limit=10, last_sha1=None, db=None, cur=None):
- """Search through content's raw ctags symbols.
-
- Args:
- expression (str): Expression to search for
- limit (int): Number of rows to return (default to 10).
- last_sha1 (str): Offset from which retrieving data (default to '').
-
- Yields:
- rows of ctags including id, name, lang, kind, line, etc...
-
- """
+ limit=10, last_sha1=None):
nb_matches = 0
for ((id_, tool_id), item) in \
sorted(self._content_ctags._data.items()):
@@ -421,19 +266,6 @@
return
def content_fossology_license_get(self, ids):
- """Retrieve licenses per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dict: ``{id: facts}`` where ``facts`` is a dict with the
- following keys:
-
- - **licenses** ([str]): associated licenses for that content
- - **tool** (dict): Tool used to compute the license
-
- """
# Rewrites the output of SubStorage.get from the old format to
# the new one. SubStorage.get should be updated once all other
# *_get methods use the new format.
@@ -445,239 +277,52 @@
yield {id_: facts}
def content_fossology_license_add(self, licenses, conflict_update=False):
- """Add licenses not present in storage.
-
- Args:
- licenses (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **licenses** ([bytes]): List of licenses associated to sha1
- - **tool** (str): nomossa
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- Returns:
- list: content_license entries which failed due to unknown licenses
-
- """
if not all(isinstance(x['id'], bytes) for x in licenses):
raise TypeError('identifiers must be bytes.')
self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
- """Retrieve licenses within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._licenses.get_range(
start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing sha1s
-
- """
yield from self._content_metadata.missing(metadata)
def content_metadata_get(self, ids):
- """Retrieve metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
-
- """
yield from self._content_metadata.get(ids)
def content_metadata_add(self, metadata, conflict_update=False):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute the
- results
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
if not all(isinstance(x['id'], bytes) for x in metadata):
raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_missing(self, metadata):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1_git revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing ids
-
- """
yield from self._revision_intrinsic_metadata.missing(metadata)
def revision_intrinsic_metadata_get(self, ids):
- """Retrieve revision metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
yield from self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1_git of revision
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
if not all(isinstance(x['id'], bytes) for x in metadata):
raise TypeError('identifiers must be bytes.')
self._revision_intrinsic_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_delete(self, entries):
- """Remove revision metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
- - **revision** (int): origin identifier
- - **id** (int): tool used to compute metadata
- """
self._revision_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_get(self, ids):
- """Retrieve origin metadata per id.
-
- Args:
- ids (iterable): origin identifiers
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
yield from self._origin_intrinsic_metadata.get(ids)
def origin_intrinsic_metadata_add(self, metadata,
conflict_update=False):
- """Add origin metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: origin url
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
self._origin_intrinsic_metadata.add(metadata, conflict_update)
def origin_intrinsic_metadata_delete(self, entries):
- """Remove origin metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (str): origin url
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
self._origin_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_search_fulltext(
self, conjunction, limit=100):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- conjunction (List[str]): List of terms to be searched for.
- limit (int): The maximum number of results to return
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
# A very crude fulltext search implementation, but that's enough
# to work on English metadata
tokens_re = re.compile('[a-zA-Z0-9]+')
@@ -711,34 +356,7 @@
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
- mappings=None, tool_ids=None,
- db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- page_token (str): Opaque token used for pagination.
- limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin ids are returned
- or the content as well
- mappings (List[str]): Returns origins whose intrinsic metadata
- were generated using at least one of these mappings.
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieveing the next page.
- - **origins** (list): list of origin url (str) if `ids_only=True`
- else dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
+ mappings=None, tool_ids=None):
assert isinstance(page_token, str)
nb_results = 0
if mappings is not None:
@@ -771,20 +389,6 @@
return result
def origin_intrinsic_metadata_stats(self):
- """Returns statistics on stored intrinsic metadata.
-
- Returns:
- dict: dictionary with keys:
-
- - total (int): total number of origins that were indexed
- (possibly yielding an empty metadata dictionary)
- - non_empty (int): total number of origins that we extracted
- a non-empty metadata dictionary from
- - per_mapping (dict): a dictionary with mapping names as
- keys and number of origins whose indexing used this
- mapping. Note that indexing a given origin may use
- 0, 1, or many mappings.
- """
mapping_count = {m: 0 for m in MAPPING_NAMES}
total = non_empty = 0
for data in self._origin_intrinsic_metadata.get_all():
@@ -800,23 +404,6 @@
}
def indexer_configuration_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools ([dict]): List of dictionary representing tool to
- insert in the db. Dictionary with the following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- list: List of dict inserted in the db (holding the id key as
- well). The order of the list is not guaranteed to match
- the order of the initial list.
-
- """
inserted = []
for tool in tools:
tool = tool.copy()
@@ -827,21 +414,6 @@
return inserted
def indexer_configuration_get(self, tool):
- """Retrieve tool information.
-
- Args:
- tool (dict): Dictionary representing a tool with the
- following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- The same dictionary with an `id` key, None otherwise.
-
- """
return self._tools.get(self._tool_key(tool))
def _tool_key(self, tool):
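
The in-memory backend sheds the same docstrings and keeps a single implementation per method, which makes it the easiest backend to exercise the API against. An illustrative round-trip through the factory, assuming the in-memory constructor needs no arguments beyond those shown:

    from swh.indexer.storage import get_indexer_storage

    storage = get_indexer_storage('memory', {})
    tools = list(storage.indexer_configuration_add([{
        'tool_name': 'nomos',
        'tool_version': '3.1.0',
        'tool_configuration': {},
    }]))
    assert all('id' in tool for tool in tools)
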
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/interface.py
copy from swh/indexer/storage/__init__.py
copy to swh/indexer/storage/interface.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/interface.py
@@ -1,132 +1,20 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-import psycopg2
-
-from collections import defaultdict
-
from swh.core.api import remote_api_endpoint
-from swh.storage.common import db_transaction_generator, db_transaction
-from swh.storage.exc import StorageDBError
-from .db import Db
-
-from . import converters
-
-
-INDEXER_CFG_KEY = 'indexer_storage'
-
-
-MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']
-
-
-def get_indexer_storage(cls, args):
- """Get an indexer storage object of class `storage_class` with
- arguments `storage_args`.
-
- Args:
- cls (str): storage's class, either 'local' or 'remote'
- args (dict): dictionary of arguments passed to the
- storage class constructor
-
- Returns:
- an instance of swh.indexer's storage (either local or remote)
-
- Raises:
- ValueError if passed an unknown storage class.
-
- """
- if cls == 'remote':
- from .api.client import RemoteStorage as IndexerStorage
- elif cls == 'local':
- from . import IndexerStorage
- elif cls == 'memory':
- from .in_memory import IndexerStorage
- else:
- raise ValueError('Unknown indexer storage class `%s`' % cls)
-
- return IndexerStorage(**args)
-
-
-def _check_id_duplicates(data):
- """
- If any two dictionaries in `data` have the same id, raises
- a `ValueError`.
-
- Values associated to the key must be hashable.
-
- Args:
- data (List[dict]): List of dictionaries to be inserted
-
- >>> _check_id_duplicates([
- ... {'id': 'foo', 'data': 'spam'},
- ... {'id': 'bar', 'data': 'egg'},
- ... ])
- >>> _check_id_duplicates([
- ... {'id': 'foo', 'data': 'spam'},
- ... {'id': 'foo', 'data': 'egg'},
- ... ])
- Traceback (most recent call last):
- ...
- ValueError: The same id is present more than once.
- """
- if len({item['id'] for item in data}) < len(data):
- raise ValueError('The same id is present more than once.')
-class IndexerStorage:
- """SWH Indexer Storage
-
- """
- def __init__(self, db, min_pool_conns=1, max_pool_conns=10):
- """
- Args:
- db_conn: either a libpq connection string, or a psycopg2 connection
-
- """
- try:
- if isinstance(db, psycopg2.extensions.connection):
- self._pool = None
- self._db = Db(db)
- else:
- self._pool = psycopg2.pool.ThreadedConnectionPool(
- min_pool_conns, max_pool_conns, db
- )
- self._db = None
- except psycopg2.OperationalError as e:
- raise StorageDBError(e)
-
- def get_db(self):
- if self._db:
- return self._db
- return Db.from_pool(self._pool)
-
- def put_db(self, db):
- if db is not self._db:
- db.put_conn()
-
+class IndexerStorageInterface:
@remote_api_endpoint('check_config')
- @db_transaction()
- def check_config(self, *, check_write, db=None, cur=None):
+ def check_config(self, *, check_write):
"""Check that the storage is configured and ready to go."""
- # Check permissions on one of the tables
- if check_write:
- check = 'INSERT'
- else:
- check = 'SELECT'
-
- cur.execute(
- "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa
- (check,)
- )
- return cur.fetchone()[0]
+ ...
@remote_api_endpoint('content_mimetype/missing')
- @db_transaction_generator()
- def content_mimetype_missing(self, mimetypes, db=None, cur=None):
+ def content_mimetype_missing(self, mimetypes):
"""Generate mimetypes missing from storage.
Args:
@@ -140,13 +28,11 @@
tuple (id, indexer_configuration_id): missing id
"""
- for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
- yield obj[0]
+ ...
def _content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
- with_textual_data=False,
- db=None, cur=None):
+ with_textual_data=False):
"""Retrieve ids of type content_type within range [start, end] bound
by limit.
@@ -175,34 +61,11 @@
this sha1 if any
"""
- if limit is None:
- raise ValueError('Development error: limit should not be None')
- if content_type not in db.content_indexer_names:
- err = 'Development error: Wrong type. Should be one of [%s]' % (
- ','.join(db.content_indexer_names))
- raise ValueError(err)
-
- ids = []
- next_id = None
- for counter, obj in enumerate(db.content_get_range(
- content_type, start, end, indexer_configuration_id,
- limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
- _id = obj[0]
- if counter >= limit:
- next_id = _id
- break
-
- ids.append(_id)
-
- return {
- 'ids': ids,
- 'next': next_id
- }
+ ...
@remote_api_endpoint('content_mimetype/range')
- @db_transaction()
def content_mimetype_get_range(self, start, end, indexer_configuration_id,
- limit=1000, db=None, cur=None):
+ limit=1000):
"""Retrieve mimetypes within range [start, end] bound by limit.
Args:
@@ -223,14 +86,10 @@
this sha1 if any
"""
- return self._content_get_range('mimetype', start, end,
- indexer_configuration_id, limit=limit,
- db=db, cur=cur)
+ ...
@remote_api_endpoint('content_mimetype/add')
- @db_transaction()
- def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
- cur=None):
+ def content_mimetype_add(self, mimetypes, conflict_update=False):
"""Add mimetypes not present in storage.
Args:
@@ -246,17 +105,10 @@
default)
"""
- _check_id_duplicates(mimetypes)
- mimetypes.sort(key=lambda m: m['id'])
- db.mktemp_content_mimetype(cur)
- db.copy_to(mimetypes, 'tmp_content_mimetype',
- ['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
- cur)
- db.content_mimetype_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content_mimetype')
- @db_transaction_generator()
- def content_mimetype_get(self, ids, db=None, cur=None):
+ def content_mimetype_get(self, ids):
"""Retrieve full content mimetype per ids.
Args:
@@ -271,13 +123,10 @@
- **tool** (dict): Tool used to compute the language
"""
- for c in db.content_mimetype_get_from_list(ids, cur):
- yield converters.db_to_mimetype(
- dict(zip(db.content_mimetype_cols, c)))
+ ...
@remote_api_endpoint('content_language/missing')
- @db_transaction_generator()
- def content_language_missing(self, languages, db=None, cur=None):
+ def content_language_missing(self, languages):
"""List languages missing from storage.
Args:
@@ -292,12 +141,10 @@
indexer_configuration_id)
"""
- for obj in db.content_language_missing_from_list(languages, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content_language')
- @db_transaction_generator()
- def content_language_get(self, ids, db=None, cur=None):
+ def content_language_get(self, ids):
"""Retrieve full content language per ids.
Args:
@@ -311,14 +158,10 @@
- **tool** (dict): Tool used to compute the language
"""
- for c in db.content_language_get_from_list(ids, cur):
- yield converters.db_to_language(
- dict(zip(db.content_language_cols, c)))
+ ...
@remote_api_endpoint('content_language/add')
- @db_transaction()
- def content_language_add(self, languages, conflict_update=False, db=None,
- cur=None):
+ def content_language_add(self, languages, conflict_update=False):
"""Add languages not present in storage.
Args:
@@ -332,24 +175,10 @@
default)
"""
- _check_id_duplicates(languages)
- languages.sort(key=lambda m: m['id'])
- db.mktemp_content_language(cur)
- # empty language is mapped to 'unknown'
- db.copy_to(
- ({
- 'id': l['id'],
- 'lang': 'unknown' if not l['lang'] else l['lang'],
- 'indexer_configuration_id': l['indexer_configuration_id'],
- } for l in languages),
- 'tmp_content_language',
- ['id', 'lang', 'indexer_configuration_id'], cur)
-
- db.content_language_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/ctags/missing')
- @db_transaction_generator()
- def content_ctags_missing(self, ctags, db=None, cur=None):
+ def content_ctags_missing(self, ctags):
"""List ctags missing from storage.
Args:
@@ -364,12 +193,10 @@
indexer_configuration_id)
"""
- for obj in db.content_ctags_missing_from_list(ctags, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content/ctags')
- @db_transaction_generator()
- def content_ctags_get(self, ids, db=None, cur=None):
+ def content_ctags_get(self, ids):
"""Retrieve ctags per id.
Args:
@@ -386,13 +213,10 @@
"""
- for c in db.content_ctags_get_from_list(ids, cur):
- yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
+ ...
@remote_api_endpoint('content/ctags/add')
- @db_transaction()
- def content_ctags_add(self, ctags, conflict_update=False, db=None,
- cur=None):
+ def content_ctags_add(self, ctags, conflict_update=False):
"""Add ctags not present in storage
Args:
@@ -403,29 +227,11 @@
line, lang
"""
- _check_id_duplicates(ctags)
- ctags.sort(key=lambda m: m['id'])
-
- def _convert_ctags(__ctags):
- """Convert ctags dict to list of ctags.
-
- """
- for ctags in __ctags:
- yield from converters.ctags_to_db(ctags)
-
- db.mktemp_content_ctags(cur)
- db.copy_to(list(_convert_ctags(ctags)),
- tblname='tmp_content_ctags',
- columns=['id', 'name', 'kind', 'line',
- 'lang', 'indexer_configuration_id'],
- cur=cur)
-
- db.content_ctags_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/ctags/search')
- @db_transaction_generator()
def content_ctags_search(self, expression,
- limit=10, last_sha1=None, db=None, cur=None):
+ limit=10, last_sha1=None):
"""Search through content's raw ctags symbols.
Args:
@@ -437,13 +243,10 @@
rows of ctags including id, name, lang, kind, line, etc...
"""
- for obj in db.content_ctags_search(expression, last_sha1, limit,
- cur=cur):
- yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
+ ...
@remote_api_endpoint('content/fossology_license')
- @db_transaction_generator()
- def content_fossology_license_get(self, ids, db=None, cur=None):
+ def content_fossology_license_get(self, ids):
"""Retrieve licenses per id.
Args:
@@ -457,20 +260,10 @@
- **tool** (dict): Tool used to compute the license
"""
- d = defaultdict(list)
- for c in db.content_fossology_license_get_from_list(ids, cur):
- license = dict(zip(db.content_fossology_license_cols, c))
-
- id_ = license['id']
- d[id_].append(converters.db_to_fossology_license(license))
-
- for id_, facts in d.items():
- yield {id_: facts}
+ ...
@remote_api_endpoint('content/fossology_license/add')
- @db_transaction()
- def content_fossology_license_add(self, licenses, conflict_update=False,
- db=None, cur=None):
+ def content_fossology_license_add(self, licenses, conflict_update=False):
"""Add licenses not present in storage.
Args:
@@ -487,26 +280,12 @@
list: content_license entries which failed due to unknown licenses
"""
- _check_id_duplicates(licenses)
- licenses.sort(key=lambda m: m['id'])
- db.mktemp_content_fossology_license(cur)
- db.copy_to(
- ({
- 'id': sha1['id'],
- 'indexer_configuration_id': sha1['indexer_configuration_id'],
- 'license': license,
- } for sha1 in licenses
- for license in sha1['licenses']),
- tblname='tmp_content_fossology_license',
- columns=['id', 'license', 'indexer_configuration_id'],
- cur=cur)
- db.content_fossology_license_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/fossology_license/range')
- @db_transaction()
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id,
- limit=1000, db=None, cur=None):
+ limit=1000):
"""Retrieve licenses within range [start, end] bound by limit.
Args:
@@ -527,13 +306,10 @@
this sha1 if any
"""
- return self._content_get_range('fossology_license', start, end,
- indexer_configuration_id, limit=limit,
- with_textual_data=True, db=db, cur=cur)
+ ...
@remote_api_endpoint('content_metadata/missing')
- @db_transaction_generator()
- def content_metadata_missing(self, metadata, db=None, cur=None):
+ def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
@@ -547,12 +323,10 @@
missing sha1s
"""
- for obj in db.content_metadata_missing_from_list(metadata, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content_metadata')
- @db_transaction_generator()
- def content_metadata_get(self, ids, db=None, cur=None):
+ def content_metadata_get(self, ids):
"""Retrieve metadata per id.
Args:
@@ -566,14 +340,10 @@
tool (dict): tool used to compute metadata
"""
- for c in db.content_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.content_metadata_cols, c)))
+ ...
@remote_api_endpoint('content_metadata/add')
- @db_transaction()
- def content_metadata_add(self, metadata, conflict_update=False, db=None,
- cur=None):
+ def content_metadata_add(self, metadata, conflict_update=False):
"""Add metadata not present in storage.
Args:
@@ -586,19 +356,10 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_content_metadata(cur)
-
- db.copy_to(metadata, 'tmp_content_metadata',
- ['id', 'metadata', 'indexer_configuration_id'],
- cur)
- db.content_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/missing')
- @db_transaction_generator()
- def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
+ def revision_intrinsic_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
@@ -612,13 +373,10 @@
missing ids
"""
- for obj in db.revision_intrinsic_metadata_missing_from_list(
- metadata, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('revision_intrinsic_metadata')
- @db_transaction_generator()
- def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
+ def revision_intrinsic_metadata_get(self, ids):
"""Retrieve revision metadata per id.
Args:
@@ -634,14 +392,10 @@
these metadata
"""
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/add')
- @db_transaction()
- def revision_intrinsic_metadata_add(self, metadata, conflict_update=False,
- db=None, cur=None):
+ def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
"""Add metadata not present in storage.
Args:
@@ -657,20 +411,10 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_revision_intrinsic_metadata(cur)
-
- db.copy_to(metadata, 'tmp_revision_intrinsic_metadata',
- ['id', 'metadata', 'mappings',
- 'indexer_configuration_id'],
- cur)
- db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/delete')
- @db_transaction()
- def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None):
+ def revision_intrinsic_metadata_delete(self, entries):
"""Remove revision metadata from the storage.
Args:
@@ -680,11 +424,10 @@
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
- db.revision_intrinsic_metadata_delete(entries, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata')
- @db_transaction_generator()
- def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
+ def origin_intrinsic_metadata_get(self, ids):
"""Retrieve origin metadata per id.
Args:
@@ -702,15 +445,11 @@
these metadata
"""
- for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/add')
- @db_transaction()
def origin_intrinsic_metadata_add(self, metadata,
- conflict_update=False, db=None,
- cur=None):
+ conflict_update=False):
"""Add origin metadata not present in storage.
Args:
@@ -728,22 +467,11 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_origin_intrinsic_metadata(cur)
-
- db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
- ['id', 'metadata',
- 'indexer_configuration_id',
- 'from_revision', 'mappings'],
- cur)
- db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/delete')
- @db_transaction()
def origin_intrinsic_metadata_delete(
- self, entries, db=None, cur=None):
+ self, entries):
"""Remove origin metadata from the storage.
Args:
@@ -753,12 +481,11 @@
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
- db.origin_intrinsic_metadata_delete(entries, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/search/fulltext')
- @db_transaction_generator()
def origin_intrinsic_metadata_search_fulltext(
- self, conjunction, limit=100, db=None, cur=None):
+ self, conjunction, limit=100):
"""Returns the list of origins whose metadata contain all the terms.
Args:
@@ -777,17 +504,12 @@
these metadata
"""
- for c in db.origin_intrinsic_metadata_search_fulltext(
- conjunction, limit=limit, cur=cur):
- yield converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
- @db_transaction()
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
- mappings=None, tool_ids=None,
- db=None, cur=None):
+ mappings=None, tool_ids=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
@@ -815,29 +537,11 @@
these metadata
"""
- assert isinstance(page_token, str)
- # we go to limit+1 to check whether we should add next_page_token in
- # the response
- res = db.origin_intrinsic_metadata_search_by_producer(
- page_token, limit + 1, ids_only, mappings, tool_ids, cur)
- result = {}
- if ids_only:
- result['origins'] = [origin for (origin,) in res]
- if len(result['origins']) > limit:
- result['origins'][limit:] = []
- result['next_page_token'] = result['origins'][-1]
- else:
- result['origins'] = [converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res]
- if len(result['origins']) > limit:
- result['origins'][limit:] = []
- result['next_page_token'] = result['origins'][-1]['id']
- return result
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/stats')
- @db_transaction()
def origin_intrinsic_metadata_stats(
- self, db=None, cur=None):
+ self):
"""Returns counts of indexed metadata per origins, broken down
into metadata types.
@@ -853,36 +557,10 @@
mapping. Note that indexing a given origin may use
0, 1, or many mappings.
"""
- mapping_names = [m for m in MAPPING_NAMES]
- select_parts = []
-
- # Count rows for each mapping
- for mapping_name in mapping_names:
- select_parts.append((
- "sum(case when (mappings @> ARRAY['%s']) "
- " then 1 else 0 end)"
- ) % mapping_name)
-
- # Total
- select_parts.append("sum(1)")
-
- # Rows whose metadata has at least one key that is not '@context'
- select_parts.append(
- "sum(case when ('{}'::jsonb @> (metadata - '@context')) "
- " then 0 else 1 end)")
- cur.execute('select ' + ', '.join(select_parts)
- + ' from origin_intrinsic_metadata')
- results = dict(zip(mapping_names + ['total', 'non_empty'],
- cur.fetchone()))
- return {
- 'total': results.pop('total'),
- 'non_empty': results.pop('non_empty'),
- 'per_mapping': results,
- }
+ ...
@remote_api_endpoint('indexer_configuration/add')
- @db_transaction_generator()
- def indexer_configuration_add(self, tools, db=None, cur=None):
+ def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
Args:
@@ -900,18 +578,10 @@
the order of the initial list.
"""
- db.mktemp_indexer_configuration(cur)
- db.copy_to(tools, 'tmp_indexer_configuration',
- ['tool_name', 'tool_version', 'tool_configuration'],
- cur)
-
- tools = db.indexer_configuration_add_from_temp(cur)
- for line in tools:
- yield dict(zip(db.indexer_configuration_cols, line))
+ ...
@remote_api_endpoint('indexer_configuration/data')
- @db_transaction()
- def indexer_configuration_get(self, tool, db=None, cur=None):
+ def indexer_configuration_get(self, tool):
"""Retrieve tool information.
Args:
@@ -927,12 +597,4 @@
The same dictionary with an `id` key, None otherwise.
"""
- tool_conf = tool['tool_configuration']
- if isinstance(tool_conf, dict):
- tool_conf = json.dumps(tool_conf)
- idx = db.indexer_configuration_get(tool['tool_name'],
- tool['tool_version'],
- tool_conf)
- if not idx:
- return None
- return dict(zip(db.indexer_configuration_cols, idx))
+ ...
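
After this change the interface module carries only names, signatures, docstrings and endpoint declarations; every body is the `...` stub. That makes it directly usable for introspection, for example:

    import inspect
    from swh.indexer.storage.interface import IndexerStorageInterface

    # Expected to print "(self, ids)" given the signature in this diff.
    print(inspect.signature(IndexerStorageInterface.content_mimetype_get))
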
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -3,10 +3,15 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import inspect
import threading
+
import pytest
+
from swh.model.hashutil import hash_to_bytes
+from swh.indexer.storage.interface import IndexerStorageInterface
+
def prepare_mimetypes_from(fossology_licenses):
"""Fossology license needs some consistent data in db to run.
@@ -1853,3 +1858,31 @@
storage = swh_indexer_storage
assert storage.check_config(check_write=True)
assert storage.check_config(check_write=False)
+
+ def test_types(self, swh_indexer_storage):
+ """Checks all methods of StorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type('_', (IndexerStorageInterface,), {})()
+
+ assert 'content_mimetype_add' in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith('_'):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(swh_indexer_storage, meth_name)
+ except AttributeError:
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
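
The same conformance check can be run standalone against any backend; comparing the unbound functions on the two classes avoids instantiating anything (the in-memory class below is just one candidate):

    import inspect

    from swh.indexer.storage.in_memory import IndexerStorage
    from swh.indexer.storage.interface import IndexerStorageInterface

    for name in dir(IndexerStorageInterface):
        if name.startswith('_'):
            continue
        expected = inspect.signature(getattr(IndexerStorageInterface, name))
        actual = inspect.signature(getattr(IndexerStorage, name))
        assert expected == actual, name
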
