Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/in_memory.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import bisect | |||||
from collections import defaultdict | from collections import defaultdict | ||||
import json | import json | ||||
SHA1_DIGEST_SIZE = 160 | SHA1_DIGEST_SIZE = 160 | ||||
def _transform_tool(tool): | def _transform_tool(tool): | ||||
return { | return { | ||||
'id': tool['id'], | 'id': tool['id'], | ||||
'name': tool['tool_name'], | 'name': tool['tool_name'], | ||||
'version': tool['tool_version'], | 'version': tool['tool_version'], | ||||
'configuration': tool['tool_configuration'], | 'configuration': tool['tool_configuration'], | ||||
} | } | ||||
class SubStorage: | class SubStorage: | ||||
"""Implements common missing/get/add logic for each indexer type.""" | """Implements common missing/get/add logic for each indexer type.""" | ||||
def __init__(self, tools): | def __init__(self, tools): | ||||
self._tools = tools | self._tools = tools | ||||
self._sorted_ids = [] | |||||
self._data = {} # map (id_, tool_id) -> metadata_dict | self._data = {} # map (id_, tool_id) -> metadata_dict | ||||
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] | ||||
def missing(self, ids): | def missing(self, ids): | ||||
"""List data missing from storage. | """List data missing from storage. | ||||
Args: | Args: | ||||
data (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
Show All 30 Lines | def get(self, ids): | ||||
for tool_id in self._tools_per_id.get(id_, set()): | for tool_id in self._tools_per_id.get(id_, set()): | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': _transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
**self._data[key], | **self._data[key], | ||||
} | } | ||||
def get_range(self, start, end, indexer_configuration_id, limit): | |||||
"""Retrieve data within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result | |||||
Raises: | |||||
ValueError for limit to None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
if limit is None: | |||||
raise ValueError('Development error: limit should not be None') | |||||
from_index = bisect.bisect_left(self._sorted_ids, start) | |||||
to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) | |||||
if to_index - from_index >= limit: | |||||
return { | |||||
'ids': self._sorted_ids[from_index:from_index+limit], | |||||
'next': self._sorted_ids[from_index+limit], | |||||
} | |||||
else: | |||||
return { | |||||
'ids': self._sorted_ids[from_index:to_index], | |||||
'next': None, | |||||
} | |||||
def add(self, data, conflict_update): | def add(self, data, conflict_update): | ||||
"""Add data not present in storage. | """Add data not present in storage. | ||||
Args: | Args: | ||||
data (iterable): dictionaries with keys: | data (iterable): dictionaries with keys: | ||||
- **id**: sha1 | - **id**: sha1 | ||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
Show All 11 Lines | def add(self, data, conflict_update): | ||||
data = item | data = item | ||||
if not conflict_update and \ | if not conflict_update and \ | ||||
tool_id in self._tools_per_id.get(id_, set()): | tool_id in self._tools_per_id.get(id_, set()): | ||||
# Duplicate, should not be updated | # Duplicate, should not be updated | ||||
continue | continue | ||||
key = (id_, tool_id) | key = (id_, tool_id) | ||||
self._data[key] = data | self._data[key] = data | ||||
self._tools_per_id[id_].add(tool_id) | self._tools_per_id[id_].add(tool_id) | ||||
if id_ not in self._sorted_ids: | |||||
bisect.insort(self._sorted_ids, id_) | |||||
def add_merge(self, new_data, conflict_update, merged_key): | |||||
for new_item in new_data: | |||||
id_ = new_item['id'] | |||||
tool_id = new_item['indexer_configuration_id'] | |||||
if conflict_update: | |||||
all_subitems = [] | |||||
else: | |||||
existing = list(self.get([id_])) | |||||
all_subitems = [ | |||||
old_subitem | |||||
for existing_item in existing | |||||
if existing_item['tool']['id'] == tool_id | |||||
for old_subitem in existing_item[merged_key] | |||||
] | |||||
for new_subitem in new_item[merged_key]: | |||||
if new_subitem not in all_subitems: | |||||
all_subitems.append(new_subitem) | |||||
self.add([ | |||||
{ | |||||
'id': id_, | |||||
'indexer_configuration_id': tool_id, | |||||
merged_key: all_subitems, | |||||
} | |||||
], conflict_update=True) | |||||
if id_ not in self._sorted_ids: | |||||
bisect.insort(self._sorted_ids, id_) | |||||
ardumont: That's not supposed to be merge.
As i mention early in our discussion [1]
Another point to… | |||||
class IndexerStorage: | class IndexerStorage: | ||||
"""In-memory SWH indexer storage.""" | """In-memory SWH indexer storage.""" | ||||
def __init__(self): | def __init__(self): | ||||
Not Done Inline ActionsMy point is, if you add a simple add scenario here, everything should still be green. ardumont: My point is, if you add a simple add scenario here, everything should still be green. | |||||
Not Done Inline ActionsOk, so everything is fine in the end. ardumont: Ok, so everything is fine in the end.
It's also an implementation detail of the in-memory… | |||||
self._tools = {} | self._tools = {} | ||||
self._mimetypes = SubStorage(self._tools) | self._mimetypes = SubStorage(self._tools) | ||||
self._content_ctags = SubStorage(self._tools) | self._content_ctags = SubStorage(self._tools) | ||||
self._licenses = SubStorage(self._tools) | |||||
self._content_metadata = SubStorage(self._tools) | self._content_metadata = SubStorage(self._tools) | ||||
self._revision_metadata = SubStorage(self._tools) | self._revision_metadata = SubStorage(self._tools) | ||||
def content_mimetype_missing(self, mimetypes): | def content_mimetype_missing(self, mimetypes): | ||||
"""Generate mimetypes missing from storage. | """Generate mimetypes missing from storage. | ||||
Args: | Args: | ||||
mimetypes (iterable): iterable of dict with keys: | mimetypes (iterable): iterable of dict with keys: | ||||
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines | def content_ctags_get(self, ids): | ||||
} | } | ||||
def content_ctags_add(self, ctags, conflict_update=False): | def content_ctags_add(self, ctags, conflict_update=False): | ||||
"""Add ctags not present in storage | """Add ctags not present in storage | ||||
Args: | Args: | ||||
ctags (iterable): dictionaries with keys: | ctags (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 | - **id** (bytes): sha1 | ||||
Not Done Inline ActionsThat's actually the other way around as you mentioned in T1443. ardumont: That's actually the other way around as you mentioned in T1443. | |||||
Not Done Inline Actionsfixed in another diff. ardumont: fixed in another diff. | |||||
- **ctags** ([list): List of dictionary with keys: name, kind, | - **ctags** ([list): List of dictionary with keys: name, kind, | ||||
line, lang | line, lang | ||||
- **indexer_configuration_id**: tool used to compute the | - **indexer_configuration_id**: tool used to compute the | ||||
results | results | ||||
""" | """ | ||||
for item in ctags: | self._content_ctags.add_merge(ctags, conflict_update, 'ctags') | ||||
tool_id = item['indexer_configuration_id'] | |||||
if conflict_update: | |||||
item_ctags = [] | |||||
else: | |||||
# merge old ctags with new ctags | |||||
existing = list(self._content_ctags.get([item['id']])) | |||||
item_ctags = [ | |||||
{ | |||||
key: ctags_item[key] | |||||
for key in ('name', 'kind', 'line', 'lang') | |||||
} | |||||
for existing_item in existing | |||||
if existing_item['tool']['id'] == tool_id | |||||
for ctags_item in existing_item['ctags'] | |||||
] | |||||
for new_item_ctags in item['ctags']: | |||||
if new_item_ctags not in item_ctags: | |||||
item_ctags.append(new_item_ctags) | |||||
self._content_ctags.add([ | |||||
{ | |||||
'id': item['id'], | |||||
'indexer_configuration_id': tool_id, | |||||
'ctags': item_ctags, | |||||
} | |||||
], conflict_update=True) | |||||
def content_ctags_search(self, expression, | def content_ctags_search(self, expression, | ||||
limit=10, last_sha1=None, db=None, cur=None): | limit=10, last_sha1=None, db=None, cur=None): | ||||
"""Search through content's raw ctags symbols. | """Search through content's raw ctags symbols. | ||||
Args: | Args: | ||||
expression (str): Expression to search for | expression (str): Expression to search for | ||||
limit (int): Number of rows to return (default to 10). | limit (int): Number of rows to return (default to 10). | ||||
Show All 15 Lines | def content_ctags_search(self, expression, | ||||
yield { | yield { | ||||
'id': id_, | 'id': id_, | ||||
'tool': _transform_tool(self._tools[tool_id]), | 'tool': _transform_tool(self._tools[tool_id]), | ||||
**ctags_item | **ctags_item | ||||
} | } | ||||
if nb_matches >= limit: | if nb_matches >= limit: | ||||
return | return | ||||
def content_fossology_license_get(self, ids): | |||||
"""Retrieve licenses per id. | |||||
Args: | |||||
ids (iterable): sha1 checksums | |||||
Yields: | |||||
`{id: facts}` where `facts` is a dict with the following keys: | |||||
- **licenses** ([str]): associated licenses for that content | |||||
- **tool** (dict): Tool used to compute the license | |||||
""" | |||||
# TODO: remove this reformatting in order to yield items with the | |||||
# same format as other _get methods. | |||||
res = {} | |||||
for d in self._licenses.get(ids): | |||||
res.setdefault(d.pop('id'), []).append(d) | |||||
for (id_, facts) in res.items(): | |||||
yield {id_: facts} | |||||
def content_fossology_license_add(self, licenses, conflict_update=False): | |||||
"""Add licenses not present in storage. | |||||
Args: | |||||
licenses (iterable): dictionaries with keys: | |||||
- **id**: sha1 | |||||
- **licenses** ([bytes]): List of licenses associated to sha1 | |||||
- **tool** (str): nomossa | |||||
conflict_update: Flag to determine if we want to overwrite (true) | |||||
or skip duplicates (false, the default) | |||||
Returns: | |||||
list: content_license entries which failed due to unknown licenses | |||||
""" | |||||
self._licenses.add_merge(licenses, conflict_update, 'licenses') | |||||
def content_fossology_license_get_range( | |||||
self, start, end, indexer_configuration_id, limit=1000): | |||||
"""Retrieve licenses within range [start, end] bound by limit. | |||||
Args: | |||||
**start** (bytes): Starting identifier range (expected smaller | |||||
than end) | |||||
**end** (bytes): Ending identifier range (expected larger | |||||
than start) | |||||
**indexer_configuration_id** (int): The tool used to index data | |||||
**limit** (int): Limit result (default to 1000) | |||||
Raises: | |||||
ValueError for limit to None | |||||
Returns: | |||||
a dict with keys: | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | |||||
return self._licenses.get_range( | |||||
start, end, indexer_configuration_id, limit) | |||||
def content_metadata_missing(self, metadata): | def content_metadata_missing(self, metadata): | ||||
"""List metadata missing from storage. | """List metadata missing from storage. | ||||
Args: | Args: | ||||
metadata (iterable): dictionaries with keys: | metadata (iterable): dictionaries with keys: | ||||
- **id** (bytes): sha1 identifier | - **id** (bytes): sha1 identifier | ||||
- **indexer_configuration_id** (int): tool used to compute | - **indexer_configuration_id** (int): tool used to compute | ||||
▲ Show 20 Lines • Show All 137 Lines • Show Last 20 Lines |
That's not supposed to be a merge.
As I mentioned earlier in our discussion [1].
Another point, in addition to what I said earlier: there is no merge scenario in the indexer storage tests.
Knowing me, I would have added one if I had initially added this behavior.
[1] D783#inline-4276