diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index c166dfd..e2f52ea 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,365 +1,417 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import json SHA1_DIGEST_SIZE = 160 def _transform_tool(tool): return { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } class SubStorage: """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] def missing(self, ids): """List data missing from storage. Args: data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): """Retrieve data per id. Args: ids (iterable): sha1 checksums Yields: dict: dictionaries with the following keys: - **id** (bytes) - **tool** (dict): tool used to compute metadata - arbitrary data (as provided to `add`) """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **self._data[key], } def add(self, data, conflict_update): """Add data not present in storage. 
Args: data (iterable): dictionaries with keys: - **id**: sha1 - **indexer_configuration_id**: tool used to compute the results - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') id_ = item.pop('id') data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) self._data[key] = data self._tools_per_id[id_].add(tool_id) class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} + self._mimetypes = SubStorage(self._tools) self._content_ctags = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) self._revision_metadata = SubStorage(self._tools) + def content_mimetype_missing(self, mimetypes): + """Generate mimetypes missing from storage. + + Args: + mimetypes (iterable): iterable of dict with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute the + results + + Yields: + bytes: sha1 identifiers with no mimetype stored for the given tool + + """ + yield from self._mimetypes.missing(mimetypes) + + def content_mimetype_add(self, mimetypes, conflict_update=False): + """Add mimetypes not present in storage. + + Args: + mimetypes (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **mimetype** (bytes): raw content's mimetype + - **encoding** (bytes): raw content's encoding + - **indexer_configuration_id** (int): tool's id used to + compute the results + conflict_update (bool): Flag to determine if we want to + overwrite (``True``) or skip duplicates (``False``, the + default) + + """ + self._mimetypes.add(mimetypes, conflict_update) + + def content_mimetype_get(self, ids, db=None, cur=None): + """Retrieve full content mimetype per ids. 
+ + Args: + ids (iterable): sha1 identifier + + Yields: + mimetypes (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **mimetype** (bytes): raw content's mimetype + - **encoding** (bytes): raw content's encoding + - **tool** (dict): Tool used to compute the mimetype + + """ + yield from self._mimetypes.get(ids) + def content_ctags_missing(self, ctags): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ yield from self._content_ctags.missing(ctags) def content_ctags_get(self, ids): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ for item in self._content_ctags.get(ids): for item_ctags_item in item['ctags']: yield { 'id': item['id'], 'tool': item['tool'], **item_ctags_item } def content_ctags_add(self, ctags, conflict_update=False): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, line, lang - **indexer_configuration_id**: tool used to compute the results """ for item in ctags: tool_id = item['indexer_configuration_id'] if conflict_update: item_ctags = [] else: # merge old ctags with new ctags existing = list(self._content_ctags.get([item['id']])) item_ctags = [ { key: ctags_item[key] for key in ('name', 'kind', 'line', 'lang') } for existing_item in existing if existing_item['tool']['id'] == tool_id for ctags_item in existing_item['ctags'] ] for new_item_ctags in item['ctags']: if new_item_ctags not in item_ctags: item_ctags.append(new_item_ctags) 
self._content_ctags.add([ { 'id': item['id'], 'indexer_configuration_id': tool_id, 'ctags': item_ctags, } ], conflict_update=True) def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... """ nb_matches = 0 for ((id_, tool_id), item) in \ sorted(self._content_ctags._data.items()): if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): continue nb_matches += 1 for ctags_item in item['ctags']: if ctags_item['name'] != expression: continue yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **ctags_item } if nb_matches >= limit: return def content_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): """Retrieve metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. 
Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute the results conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._content_metadata.add(metadata, conflict_update) def revision_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ yield from self._revision_metadata.missing(metadata) def revision_metadata_get(self, ids): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ yield from self._revision_metadata.get(ids) def revision_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._revision_metadata.add(metadata, conflict_update) def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: list: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. 
""" inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return (tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True)) diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py index ff18b6e..d5e69a5 100644 --- a/swh/indexer/tests/storage/test_in_memory.py +++ b/swh/indexer/tests/storage/test_in_memory.py @@ -1,119 +1,103 @@ from unittest import TestCase import pytest from .test_storage import CommonTestStorage class IndexerTestInMemoryStorage(CommonTestStorage, TestCase): def setUp(self): self.storage_config = { 'cls': 'memory', 'args': { }, } super().setUp() @pytest.mark.xfail def test_check_config(self): pass - @pytest.mark.xfail - def test_content_mimetype_missing(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_add__drop_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_add__update_in_place_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_get(self): - pass - @pytest.mark.xfail def test_content_language_missing(self): pass @pytest.mark.xfail def test_content_language_get(self): pass @pytest.mark.xfail def test_content_language_add__drop_duplicate(self): pass @pytest.mark.xfail def test_content_language_add__update_in_place_duplicate(self): pass @pytest.mark.xfail def test_content_fossology_license_get(self): pass @pytest.mark.xfail def 
test_content_fossology_license_add__new_license_added(self): pass @pytest.mark.xfail def test_content_fossology_license_add__update_in_place_duplicate(self): pass @pytest.mark.xfail def test_origin_intrinsic_metadata_get(self): pass @pytest.mark.xfail def test_origin_intrinsic_metadata_add_drop_duplicate(self): pass @pytest.mark.xfail def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): pass @pytest.mark.xfail def test_origin_intrinsic_metadata_search_fulltext(self): pass @pytest.mark.xfail def test_origin_intrinsic_metadata_search_fulltext_rank(self): pass @pytest.mark.xfail def test_indexer_configuration_metadata_get_missing_context(self): pass @pytest.mark.xfail def test_indexer_configuration_metadata_get(self): pass @pytest.mark.xfail def test_generate_content_mimetype_get_range_limit_none(self): pass @pytest.mark.xfail def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): pass @pytest.mark.xfail def test_generate_content_mimetype_get_range_limit(self, mimetypes): pass @pytest.mark.xfail def test_generate_content_fossology_license_get_range_limit_none(self): pass @pytest.mark.xfail def test_generate_content_fossology_license_get_range_no_limit(self): pass @pytest.mark.xfail def test_generate_content_fossology_license_get_range_no_limit_with_filter( self): pass @pytest.mark.xfail def test_generate_fossology_license_get_range_limit(self): pass diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index e5223bc..e472149 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,192 +1,190 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest -import logging from unittest.mock import patch from swh.indexer.mimetype import ( MimetypeIndexer, 
MimetypeRangeIndexer, compute_mimetype_encoding ) from swh.indexer.tests.test_utils import ( MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, CommonContentIndexerTest, CommonContentIndexerRangeTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG ) class FakeMagicResult: def __init__(self, mimetype, encoding): self.mime_type = mimetype self.encoding = encoding class BasicTest(unittest.TestCase): @patch('swh.indexer.mimetype.magic') def test_compute_mimetype_encoding(self, mock_magic): """Compute mimetype encoding should return results""" for _input, _mimetype, _encoding in [ (b'some-content', 'text/plain', 'utf-8'), (b'raw-content', 'application/json', 'ascii')]: mock_magic.detect_from_content.return_value = FakeMagicResult( _mimetype, _encoding) actual_result = compute_mimetype_encoding(_input) self.assertEqual(actual_result, { 'mimetype': _mimetype, 'encoding': _encoding }) class MimetypeTestIndexer(MimetypeIndexer): """Specific mimetype indexer instance whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.log = logging.getLogger('swh.indexer') - self.objstorage = MockObjStorage() - class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): """Mimetype indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_mimetype_get(ids) + def setUp(self): self.indexer = MimetypeTestIndexer() + self.idx_storage = self.indexer.idx_storage self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain', 'encoding': 'us-ascii', }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain', 'encoding': 'us-ascii', }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'mimetype': 'application/x-empty', 'encoding': 'binary', } } class MimetypeRangeIndexerTest(MimetypeRangeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, 'write_batch_size': 100, } def prepare(self): super().prepare() self.idx_storage = BasicMockIndexerStorage() # this hardcodes some contents, will use this to setup the storage self.objstorage = MockObjStorage() # sync objstorage and storage contents = [{'sha1': c_id} for c_id in self.objstorage] self.storage = BasicMockStorage(contents) class TestMimetypeRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Mimetype Indexer tests. - new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): self.indexer = MimetypeRangeIndexerTest() # will play along with the objstorage's mocked contents for now self.contents = sorted(self.indexer.objstorage) # FIXME: leverage swh.objstorage.in_memory_storage's # InMemoryObjStorage, swh.storage.tests's gen_contents, and # hypothesis to generate data to actually run indexer on those self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'encoding': 'us-ascii', 'id': self.id0, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain'}, self.id1: { 'encoding': 'us-ascii', 'id': self.id1, 'indexer_configuration_id': tool_id, 'mimetype': 'text/x-python'}, self.id2: { 'encoding': 'us-ascii', 'id': self.id2, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain'} } class MimetypeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeTestIndexer): """Mimetype indexer with wrong configuration""" class MimetypeRangeIndexerUnknownToolTestStorage( 
CommonIndexerNoTool, MimetypeRangeIndexerTest): """Mimetype range indexer with wrong configuration""" class TestMimetypeIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = MimetypeIndexerUnknownToolTestStorage RangeIndexer = MimetypeRangeIndexerUnknownToolTestStorage