Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/in_memory.py
Show First 20 Lines • Show All 197 Lines • ▼ Show 20 Lines | def __init__(self): | ||||
self._content_metadata = SubStorage(self._tools) | self._content_metadata = SubStorage(self._tools) | ||||
self._revision_intrinsic_metadata = SubStorage(self._tools) | self._revision_intrinsic_metadata = SubStorage(self._tools) | ||||
self._origin_intrinsic_metadata = SubStorage(self._tools) | self._origin_intrinsic_metadata = SubStorage(self._tools) | ||||
def check_config(self, *, check_write): | def check_config(self, *, check_write): | ||||
return True | return True | ||||
def content_mimetype_missing(self, mimetypes): | def content_mimetype_missing(self, mimetypes): | ||||
"""Generate mimetypes missing from storage. | |||||
Args: | |||||
mimetypes (iterable): iterable of dict with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute the | |||||
results | |||||
Yields: | |||||
tuple (id, indexer_configuration_id): missing id | |||||
""" | |||||
yield from self._mimetypes.missing(mimetypes) | yield from self._mimetypes.missing(mimetypes) | ||||
def content_mimetype_get_range( | def content_mimetype_get_range( | ||||
self, start, end, indexer_configuration_id, limit=1000): | self, start, end, indexer_configuration_id, limit=1000): | ||||
"""Retrieve mimetypes within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result (default to 1000) | |||||
Raises: | |||||
ValueError: if limit is None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
return self._mimetypes.get_range( | return self._mimetypes.get_range( | ||||
start, end, indexer_configuration_id, limit) | start, end, indexer_configuration_id, limit) | ||||
def content_mimetype_add(self, mimetypes, conflict_update=False): | def content_mimetype_add(self, mimetypes, conflict_update=False): | ||||
"""Add mimetypes not present in storage. | |||||
Args: | |||||
mimetypes (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **mimetype** (bytes): raw content's mimetype | |||||
- **encoding** (bytes): raw content's encoding | |||||
- **indexer_configuration_id** (int): tool's id used to | |||||
compute the results | |||||
- **conflict_update** (bool): Flag to determine if we want to | |||||
overwrite (``True``) or skip duplicates (``False``, the | |||||
default) | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in mimetypes): | if not all(isinstance(x['id'], bytes) for x in mimetypes): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._mimetypes.add(mimetypes, conflict_update) | self._mimetypes.add(mimetypes, conflict_update) | ||||
def content_mimetype_get(self, ids, db=None, cur=None): | def content_mimetype_get(self, ids): | ||||
"""Retrieve full content mimetype per ids. | |||||
Args: | |||||
ids (iterable): sha1 identifier | |||||
Yields: | |||||
mimetypes (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **mimetype** (bytes): raw content's mimetype | |||||
- **encoding** (bytes): raw content's encoding | |||||
- **tool** (dict): Tool used to compute the mimetype | |||||
""" | |||||
yield from self._mimetypes.get(ids) | yield from self._mimetypes.get(ids) | ||||
def content_language_missing(self, languages): | def content_language_missing(self, languages): | ||||
"""List languages missing from storage. | |||||
Args: | |||||
languages (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
the results | |||||
Yields: | |||||
an iterable of missing id for the tuple (id, | |||||
indexer_configuration_id) | |||||
""" | |||||
yield from self._languages.missing(languages) | yield from self._languages.missing(languages) | ||||
def content_language_get(self, ids): | def content_language_get(self, ids): | ||||
"""Retrieve full content language per ids. | |||||
Args: | |||||
ids (iterable): sha1 identifier | |||||
Yields: | |||||
languages (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **lang** (bytes): raw content's language | |||||
- **tool** (dict): Tool used to compute the language | |||||
""" | |||||
yield from self._languages.get(ids) | yield from self._languages.get(ids) | ||||
def content_language_add(self, languages, conflict_update=False): | def content_language_add(self, languages, conflict_update=False): | ||||
"""Add languages not present in storage. | |||||
Args: | |||||
languages (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 | |||||
- **lang** (bytes): language detected | |||||
conflict_update (bool): Flag to determine if we want to | |||||
overwrite (true) or skip duplicates (false, the | |||||
default) | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in languages): | if not all(isinstance(x['id'], bytes) for x in languages): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._languages.add(languages, conflict_update) | self._languages.add(languages, conflict_update) | ||||
def content_ctags_missing(self, ctags): | def content_ctags_missing(self, ctags): | ||||
"""List ctags missing from storage. | |||||
Args: | |||||
ctags (iterable): dicts with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
the results | |||||
Yields: | |||||
an iterable of missing id for the tuple (id, | |||||
indexer_configuration_id) | |||||
""" | |||||
yield from self._content_ctags.missing(ctags) | yield from self._content_ctags.missing(ctags) | ||||
def content_ctags_get(self, ids): | def content_ctags_get(self, ids): | ||||
"""Retrieve ctags per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
Dictionaries with keys: | |||||
- **id** (bytes): content's identifier | |||||
- **name** (str): symbol's name | |||||
- **kind** (str): symbol's kind | |||||
- **lang** (str): language for that content | |||||
- **tool** (dict): tool used to compute the ctags' info | |||||
""" | |||||
for item in self._content_ctags.get(ids): | for item in self._content_ctags.get(ids): | ||||
for item_ctags_item in item['ctags']: | for item_ctags_item in item['ctags']: | ||||
yield { | yield { | ||||
'id': item['id'], | 'id': item['id'], | ||||
'tool': item['tool'], | 'tool': item['tool'], | ||||
**item_ctags_item | **item_ctags_item | ||||
} | } | ||||
def content_ctags_add(self, ctags, conflict_update=False): | def content_ctags_add(self, ctags, conflict_update=False): | ||||
"""Add ctags not present in storage | |||||
Args: | |||||
ctags (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 | |||||
- **ctags** (list): List of dictionaries with keys: name, kind, | |||||
line, lang | |||||
- **indexer_configuration_id**: tool used to compute the | |||||
results | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in ctags): | if not all(isinstance(x['id'], bytes) for x in ctags): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._content_ctags.add_merge(ctags, conflict_update, 'ctags') | self._content_ctags.add_merge(ctags, conflict_update, 'ctags') | ||||
def content_ctags_search(self, expression, | def content_ctags_search(self, expression, | ||||
limit=10, last_sha1=None, db=None, cur=None): | limit=10, last_sha1=None): | ||||
"""Search through content's raw ctags symbols. | |||||
Args: | |||||
expression (str): Expression to search for | |||||
limit (int): Number of rows to return (default to 10). | |||||
last_sha1 (bytes): Offset from which to retrieve data (default to None). | |||||
Yields: | |||||
rows of ctags including id, name, lang, kind, line, etc... | |||||
""" | |||||
nb_matches = 0 | nb_matches = 0 | ||||
for ((id_, tool_id), item) in \ | for ((id_, tool_id), item) in \ | ||||
sorted(self._content_ctags._data.items()): | sorted(self._content_ctags._data.items()): | ||||
if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): | if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): | ||||
continue | continue | ||||
for ctags_item in item['ctags']: | for ctags_item in item['ctags']: | ||||
if ctags_item['name'] != expression: | if ctags_item['name'] != expression: | ||||
continue | continue | ||||
nb_matches += 1 | nb_matches += 1 | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': _transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
**ctags_item | **ctags_item | ||||
} | } | ||||
if nb_matches >= limit: | if nb_matches >= limit: | ||||
return | return | ||||
def content_fossology_license_get(self, ids): | def content_fossology_license_get(self, ids): | ||||
"""Retrieve licenses per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
dict: ``{id: facts}`` where ``facts`` is a list of dicts, each with the | |||||
following keys: | |||||
- **licenses** ([str]): associated licenses for that content | |||||
- **tool** (dict): Tool used to compute the license | |||||
""" | |||||
# Rewrites the output of SubStorage.get from the old format to | # Rewrites the output of SubStorage.get from the old format to | ||||
# the new one. SubStorage.get should be updated once all other | # the new one. SubStorage.get should be updated once all other | ||||
# *_get methods use the new format. | # *_get methods use the new format. | ||||
# See: https://forge.softwareheritage.org/T1433 | # See: https://forge.softwareheritage.org/T1433 | ||||
res = {} | res = {} | ||||
for d in self._licenses.get(ids): | for d in self._licenses.get(ids): | ||||
res.setdefault(d.pop('id'), []).append(d) | res.setdefault(d.pop('id'), []).append(d) | ||||
for (id_, facts) in res.items(): | for (id_, facts) in res.items(): | ||||
yield {id_: facts} | yield {id_: facts} | ||||
def content_fossology_license_add(self, licenses, conflict_update=False): | def content_fossology_license_add(self, licenses, conflict_update=False): | ||||
"""Add licenses not present in storage. | |||||
Args: | |||||
licenses (iterable): dictionaries with keys: | |||||
- **id**: sha1 | |||||
- **licenses** ([bytes]): List of licenses associated to sha1 | |||||
- **tool** (str): nomossa | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
Returns: | |||||
list: content_license entries which failed due to unknown licenses | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in licenses): | if not all(isinstance(x['id'], bytes) for x in licenses): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._licenses.add_merge(licenses, conflict_update, 'licenses') | self._licenses.add_merge(licenses, conflict_update, 'licenses') | ||||
def content_fossology_license_get_range( | def content_fossology_license_get_range( | ||||
self, start, end, indexer_configuration_id, limit=1000): | self, start, end, indexer_configuration_id, limit=1000): | ||||
"""Retrieve licenses within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result (default to 1000) | |||||
Raises: | |||||
ValueError: if limit is None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
return self._licenses.get_range( | return self._licenses.get_range( | ||||
start, end, indexer_configuration_id, limit) | start, end, indexer_configuration_id, limit) | ||||
def content_metadata_missing(self, metadata): | def content_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | |||||
Args: | |||||
metadata (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1 identifier | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
the results | |||||
Yields: | |||||
missing sha1s | |||||
""" | |||||
yield from self._content_metadata.missing(metadata) | yield from self._content_metadata.missing(metadata) | ||||
def content_metadata_get(self, ids): | def content_metadata_get(self, ids): | ||||
"""Retrieve metadata per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
dictionaries with the following keys: | |||||
- **id** (bytes) | |||||
- **metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | |||||
""" | |||||
yield from self._content_metadata.get(ids) | yield from self._content_metadata.get(ids) | ||||
def content_metadata_add(self, metadata, conflict_update=False): | def content_metadata_add(self, metadata, conflict_update=False): | ||||
"""Add metadata not present in storage. | |||||
Args: | |||||
metadata (iterable): dictionaries with keys: | |||||
- **id**: sha1 | |||||
- **metadata**: arbitrary dict | |||||
- **indexer_configuration_id**: tool used to compute the | |||||
results | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in metadata): | if not all(isinstance(x['id'], bytes) for x in metadata): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._content_metadata.add(metadata, conflict_update) | self._content_metadata.add(metadata, conflict_update) | ||||
def revision_intrinsic_metadata_missing(self, metadata): | def revision_intrinsic_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | |||||
Args: | |||||
metadata (iterable): dictionaries with keys: | |||||
- **id** (bytes): sha1_git revision identifier | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
the results | |||||
Yields: | |||||
missing ids | |||||
""" | |||||
yield from self._revision_intrinsic_metadata.missing(metadata) | yield from self._revision_intrinsic_metadata.missing(metadata) | ||||
def revision_intrinsic_metadata_get(self, ids): | def revision_intrinsic_metadata_get(self, ids): | ||||
"""Retrieve revision metadata per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
dictionaries with the following keys: | |||||
- **id** (bytes) | |||||
- **metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
""" | |||||
yield from self._revision_intrinsic_metadata.get(ids) | yield from self._revision_intrinsic_metadata.get(ids) | ||||
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): | def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): | ||||
"""Add metadata not present in storage. | |||||
Args: | |||||
metadata (iterable): dictionaries with keys: | |||||
- **id**: sha1_git of revision | |||||
- **metadata**: arbitrary dict | |||||
- **indexer_configuration_id**: tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
""" | |||||
if not all(isinstance(x['id'], bytes) for x in metadata): | if not all(isinstance(x['id'], bytes) for x in metadata): | ||||
raise TypeError('identifiers must be bytes.') | raise TypeError('identifiers must be bytes.') | ||||
self._revision_intrinsic_metadata.add(metadata, conflict_update) | self._revision_intrinsic_metadata.add(metadata, conflict_update) | ||||
def revision_intrinsic_metadata_delete(self, entries): | def revision_intrinsic_metadata_delete(self, entries): | ||||
"""Remove revision metadata from the storage. | |||||
Args: | |||||
entries (dict): dictionaries with the following keys: | |||||
- **id** (bytes): revision identifier | |||||
- **indexer_configuration_id** (int): tool used to compute metadata | |||||
""" | |||||
self._revision_intrinsic_metadata.delete(entries) | self._revision_intrinsic_metadata.delete(entries) | ||||
def origin_intrinsic_metadata_get(self, ids): | def origin_intrinsic_metadata_get(self, ids): | ||||
"""Retrieve origin metadata per id. | |||||
Args: | |||||
ids (iterable): origin identifiers | |||||
Yields: | |||||
list: dictionaries with the following keys: | |||||
- **id** (str): origin url | |||||
- **from_revision** (bytes): which revision this metadata | |||||
was extracted from | |||||
- **metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
""" | |||||
yield from self._origin_intrinsic_metadata.get(ids) | yield from self._origin_intrinsic_metadata.get(ids) | ||||
def origin_intrinsic_metadata_add(self, metadata, | def origin_intrinsic_metadata_add(self, metadata, | ||||
conflict_update=False): | conflict_update=False): | ||||
"""Add origin metadata not present in storage. | |||||
Args: | |||||
metadata (iterable): dictionaries with keys: | |||||
- **id**: origin url | |||||
- **from_revision**: sha1 id of the revision used to generate | |||||
these metadata. | |||||
- **metadata**: arbitrary dict | |||||
- **indexer_configuration_id**: tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
""" | |||||
self._origin_intrinsic_metadata.add(metadata, conflict_update) | self._origin_intrinsic_metadata.add(metadata, conflict_update) | ||||
def origin_intrinsic_metadata_delete(self, entries): | def origin_intrinsic_metadata_delete(self, entries): | ||||
"""Remove origin metadata from the storage. | |||||
Args: | |||||
entries (dict): dictionaries with the following keys: | |||||
- **id** (str): origin url | |||||
- **indexer_configuration_id** (int): tool used to compute | |||||
metadata | |||||
""" | |||||
self._origin_intrinsic_metadata.delete(entries) | self._origin_intrinsic_metadata.delete(entries) | ||||
def origin_intrinsic_metadata_search_fulltext( | def origin_intrinsic_metadata_search_fulltext( | ||||
self, conjunction, limit=100): | self, conjunction, limit=100): | ||||
"""Returns the list of origins whose metadata contain all the terms. | |||||
Args: | |||||
conjunction (List[str]): List of terms to be searched for. | |||||
limit (int): The maximum number of results to return | |||||
Yields: | |||||
list: dictionaries with the following keys: | |||||
- **id** (str): origin url | |||||
- **from_revision** (bytes): which revision this metadata | |||||
was extracted from | |||||
- **metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
""" | |||||
# A very crude fulltext search implementation, but that's enough | # A very crude fulltext search implementation, but that's enough | ||||
# to work on English metadata | # to work on English metadata | ||||
tokens_re = re.compile('[a-zA-Z0-9]+') | tokens_re = re.compile('[a-zA-Z0-9]+') | ||||
search_tokens = list(itertools.chain( | search_tokens = list(itertools.chain( | ||||
*map(tokens_re.findall, conjunction))) | *map(tokens_re.findall, conjunction))) | ||||
def rank(data): | def rank(data): | ||||
# Tokenize the metadata | # Tokenize the metadata | ||||
Show All 17 Lines | def origin_intrinsic_metadata_search_fulltext( | ||||
results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | results = [(rank_, data) for (rank_, data) in results if rank_ > 0] | ||||
results.sort(key=operator.itemgetter(0), # Don't try to order 'data' | results.sort(key=operator.itemgetter(0), # Don't try to order 'data' | ||||
reverse=True) | reverse=True) | ||||
for (rank_, result) in results[:limit]: | for (rank_, result) in results[:limit]: | ||||
yield result | yield result | ||||
def origin_intrinsic_metadata_search_by_producer( | def origin_intrinsic_metadata_search_by_producer( | ||||
self, page_token='', limit=100, ids_only=False, | self, page_token='', limit=100, ids_only=False, | ||||
mappings=None, tool_ids=None, | mappings=None, tool_ids=None): | ||||
db=None, cur=None): | |||||
"""Returns the list of origins whose metadata contain all the terms. | |||||
Args: | |||||
page_token (str): Opaque token used for pagination. | |||||
limit (int): The maximum number of results to return | |||||
ids_only (bool): Determines whether only origin ids are returned | |||||
or the content as well | |||||
mappings (List[str]): Returns origins whose intrinsic metadata | |||||
were generated using at least one of these mappings. | |||||
Returns: | |||||
dict: dict with the following keys: | |||||
- **next_page_token** (str, optional): opaque token to be used as | |||||
`page_token` for retrieving the next page. | |||||
- **origins** (list): list of origin url (str) if `ids_only=True` | |||||
else dictionaries with the following keys: | |||||
- **id** (str): origin urls | |||||
- **from_revision**: sha1 id of the revision used to generate | |||||
these metadata. | |||||
- **metadata** (str): associated metadata | |||||
- **tool** (dict): tool used to compute metadata | |||||
- **mappings** (List[str]): list of mappings used to translate | |||||
these metadata | |||||
""" | |||||
assert isinstance(page_token, str) | assert isinstance(page_token, str) | ||||
nb_results = 0 | nb_results = 0 | ||||
if mappings is not None: | if mappings is not None: | ||||
mappings = frozenset(mappings) | mappings = frozenset(mappings) | ||||
if tool_ids is not None: | if tool_ids is not None: | ||||
tool_ids = frozenset(tool_ids) | tool_ids = frozenset(tool_ids) | ||||
origins = [] | origins = [] | ||||
Show All 16 Lines | def origin_intrinsic_metadata_search_by_producer( | ||||
origins = origins[:limit] | origins = origins[:limit] | ||||
result['next_page_token'] = origins[-1]['id'] | result['next_page_token'] = origins[-1]['id'] | ||||
if ids_only: | if ids_only: | ||||
origins = [origin['id'] for origin in origins] | origins = [origin['id'] for origin in origins] | ||||
result['origins'] = origins | result['origins'] = origins | ||||
return result | return result | ||||
def origin_intrinsic_metadata_stats(self): | def origin_intrinsic_metadata_stats(self): | ||||
"""Returns statistics on stored intrinsic metadata. | |||||
Returns: | |||||
dict: dictionary with keys: | |||||
- total (int): total number of origins that were indexed | |||||
(possibly yielding an empty metadata dictionary) | |||||
- non_empty (int): total number of origins that we extracted | |||||
a non-empty metadata dictionary from | |||||
- per_mapping (dict): a dictionary with mapping names as | |||||
keys and number of origins whose indexing used this | |||||
mapping. Note that indexing a given origin may use | |||||
0, 1, or many mappings. | |||||
""" | |||||
mapping_count = {m: 0 for m in MAPPING_NAMES} | mapping_count = {m: 0 for m in MAPPING_NAMES} | ||||
total = non_empty = 0 | total = non_empty = 0 | ||||
for data in self._origin_intrinsic_metadata.get_all(): | for data in self._origin_intrinsic_metadata.get_all(): | ||||
total += 1 | total += 1 | ||||
if set(data['metadata']) - {'@context'}: | if set(data['metadata']) - {'@context'}: | ||||
non_empty += 1 | non_empty += 1 | ||||
for mapping in data['mappings']: | for mapping in data['mappings']: | ||||
mapping_count[mapping] += 1 | mapping_count[mapping] += 1 | ||||
return { | return { | ||||
'per_mapping': mapping_count, | 'per_mapping': mapping_count, | ||||
'total': total, | 'total': total, | ||||
'non_empty': non_empty | 'non_empty': non_empty | ||||
} | } | ||||
def indexer_configuration_add(self, tools): | def indexer_configuration_add(self, tools): | ||||
"""Add new tools to the storage. | |||||
Args: | |||||
tools ([dict]): List of dictionary representing tool to | |||||
insert in the db. Dictionary with the following keys: | |||||
- **tool_name** (str): tool's name | |||||
- **tool_version** (str): tool's version | |||||
- **tool_configuration** (dict): tool's configuration | |||||
(free form dict) | |||||
Returns: | |||||
list: List of dict inserted in the db (holding the id key as | |||||
well). The order of the list is not guaranteed to match | |||||
the order of the initial list. | |||||
""" | |||||
inserted = [] | inserted = [] | ||||
for tool in tools: | for tool in tools: | ||||
tool = tool.copy() | tool = tool.copy() | ||||
id_ = self._tool_key(tool) | id_ = self._tool_key(tool) | ||||
tool['id'] = id_ | tool['id'] = id_ | ||||
self._tools[id_] = tool | self._tools[id_] = tool | ||||
inserted.append(tool) | inserted.append(tool) | ||||
return inserted | return inserted | ||||
def indexer_configuration_get(self, tool): | def indexer_configuration_get(self, tool): | ||||
"""Retrieve tool information. | |||||
Args: | |||||
tool (dict): Dictionary representing a tool with the | |||||
following keys: | |||||
- **tool_name** (str): tool's name | |||||
- **tool_version** (str): tool's version | |||||
- **tool_configuration** (dict): tool's configuration | |||||
(free form dict) | |||||
Returns: | |||||
The same dictionary with an `id` key, None otherwise. | |||||
""" | |||||
return self._tools.get(self._tool_key(tool)) | return self._tools.get(self._tool_key(tool)) | ||||
def _tool_key(self, tool): | def _tool_key(self, tool): | ||||
return hash((tool['tool_name'], tool['tool_version'], | return hash((tool['tool_name'], tool['tool_version'], | ||||
json.dumps(tool['tool_configuration'], sort_keys=True))) | json.dumps(tool['tool_configuration'], sort_keys=True))) |