diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index 3cca457..c166dfd 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,366 +1,365 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import json SHA1_DIGEST_SIZE = 160 def _transform_tool(tool): return { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } class SubStorage: """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] def missing(self, ids): """List data missing from storage. Args: data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): """Retrieve data per id. Args: ids (iterable): sha1 checksums Yields: dict: dictionaries with the following keys: - **id** (bytes) - **tool** (dict): tool used to compute metadata - arbitrary data (as provided to `add`) """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **self._data[key], } def add(self, data, conflict_update): """Add data not present in storage. Args: data (iterable): dictionaries with keys: - **id**: sha1 - **indexer_configuration_id**: tool used to compute the results - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') id_ = item.pop('id') data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) self._data[key] = data self._tools_per_id[id_].add(tool_id) class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} self._content_ctags = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) self._revision_metadata = SubStorage(self._tools) def content_ctags_missing(self, ctags): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ yield from self._content_ctags.missing(ctags) def content_ctags_get(self, ids): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ for item in self._content_ctags.get(ids): for item_ctags_item in item['ctags']: yield { 'id': item['id'], 'tool': item['tool'], **item_ctags_item } def content_ctags_add(self, ctags, conflict_update=False): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, line, lang - **indexer_configuration_id**: tool used to compute the results """ for item in ctags: tool_id = item['indexer_configuration_id'] if conflict_update: item_ctags = [] else: - # TODO: this merges old ctags with new ctags. This is - # pointless, new ctags should replace the old ones. + # merge old ctags with new ctags existing = list(self._content_ctags.get([item['id']])) item_ctags = [ { key: ctags_item[key] for key in ('name', 'kind', 'line', 'lang') } for existing_item in existing if existing_item['tool']['id'] == tool_id for ctags_item in existing_item['ctags'] ] for new_item_ctags in item['ctags']: if new_item_ctags not in item_ctags: item_ctags.append(new_item_ctags) self._content_ctags.add([ { 'id': item['id'], 'indexer_configuration_id': tool_id, 'ctags': item_ctags, } ], conflict_update=True) def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... """ nb_matches = 0 for ((id_, tool_id), item) in \ sorted(self._content_ctags._data.items()): if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): continue nb_matches += 1 for ctags_item in item['ctags']: if ctags_item['name'] != expression: continue yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **ctags_item } if nb_matches >= limit: return def content_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): """Retrieve metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute the results conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._content_metadata.add(metadata, conflict_update) def revision_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ yield from self._revision_metadata.missing(metadata) def revision_metadata_get(self, ids): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ yield from self._revision_metadata.get(ids) def revision_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._revision_metadata.add(metadata, conflict_update) def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: list: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return (tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True)) diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py index 1aa80c8..ed7186d 100644 --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -1,146 +1,151 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from unittest.mock import patch from swh.indexer.ctags import ( CtagsIndexer, run_ctags ) from swh.indexer.tests.test_utils import ( CommonContentIndexerTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG ) class BasicTest(unittest.TestCase): @patch('swh.indexer.ctags.subprocess') def test_run_ctags(self, mock_subprocess): """Computing licenses from a raw content should return results """ output0 = """ {"name":"defun","kind":"function","line":1,"language":"scheme"} {"name":"name","kind":"symbol","line":5,"language":"else"}""" output1 = """ {"name":"let","kind":"var","line":10,"language":"something"}""" expected_result0 = [ { 'name': 'defun', 'kind': 'function', 'line': 1, 'lang': 'scheme' }, { 'name': 'name', 'kind': 'symbol', 'line': 5, 'lang': 'else' } ] expected_result1 = [ { 'name': 'let', 'kind': 'var', 'line': 10, 'lang': 'something' } ] for path, lang, intermediary_result, expected_result in [ (b'some/path', 'lisp', output0, expected_result0), (b'some/path/2', 'markdown', output1, expected_result1) ]: mock_subprocess.check_output.return_value = intermediary_result actual_result = list(run_ctags(path, lang=lang)) self.assertEqual(actual_result, expected_result) class InjectCtagsIndexer: """Override ctags computations. """ def compute_ctags(self, path, lang): """Inject fake ctags given path (sha1 identifier). """ return { 'lang': lang, **SHA1_TO_CTAGS.get(path) } class CtagsIndexerTest(NoDiskIndexer, InjectCtagsIndexer, CtagsIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no ''' ''' --links=no ''', 'max_content_size': 1000, }, }, 'languages': { 'python': 'python', 'haskell': 'haskell', 'bar': 'bar', }, 'workdir': '/nowhere', } class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): """Ctags indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_ctags_get(ids) + def setUp(self): self.indexer = CtagsIndexerTest() + self.idx_storage = self.indexer.idx_storage # Prepare test input self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = 'd4c647f0fc257591cc9ba1722484229780d1c607' self.id2 = '688a5ef812c53907562fe379d4b3851e69c7cb15' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'ctags': SHA1_TO_CTAGS[self.id0], }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'ctags': SHA1_TO_CTAGS[self.id1], }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'ctags': SHA1_TO_CTAGS[self.id2], } } class CtagsIndexerUnknownToolTestStorage( CommonIndexerNoTool, CtagsIndexerTest): """Fossology license indexer with wrong configuration""" class TestCtagsIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = CtagsIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 6abdd19..4784fc7 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,806 +1,808 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from swh.objstorage.exc import ObjNotFoundError from swh.model import hashutil from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'target_type': 'revision'}}} } REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' DIRECTORY = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'cde' }, { 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', } CONTENT_METADATA = [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'NpmMapping' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'schema:author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }] def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) class MockObjStorage: """Mock an swh-objstorage objstorage with predefined contents. """ data = {} def __init__(self): self.data = OBJ_STORAGE_DATA.copy() def __iter__(self): yield from self.data.keys() def __contains__(self, sha1): return self.data.get(sha1) is not None def get(self, sha1): raw_content = self.data.get(sha1) if raw_content is None: raise ObjNotFoundError(sha1) return raw_content class MockIndexerStorage(): """Mock an swh-indexer storage. """ added_data = [] revision_metadata = {} tools = {} def indexer_configuration_add(self, tools): results = [] for tool in tools: results.append(self._indexer_configuration_add_one(tool)) return results def _indexer_configuration_add_one(self, tool): if tool['tool_name'] == 'swh-metadata-translator': tool2 = { 'id': 30, 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'NpmMapping' }, } elif tool['tool_name'] == 'swh-metadata-detector': tool2 = { 'id': 7, 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'NpmMapping' }, } elif tool['tool_name'] == 'origin-metadata': tool2 = { 'id': 8, 'tool_name': 'origin-metadata', 'tool_version': '0.0.1', 'tool_configuration': {}, } else: assert False, 'Unknown tool {tool_name}'.format(**tool) self.tools[tool2['id']] = tool2 return tool2 def content_metadata_missing(self, sha1s): yield from [] def content_metadata_add(self, metadata, conflict_update=None): self.added_data.append( ('content_metadata', conflict_update, metadata)) def revision_metadata_add(self, metadata, conflict_update=None): assert conflict_update self.added_data.append( ('revision_metadata', conflict_update, metadata)) for item in metadata: assert isinstance(item['id'], bytes) self.revision_metadata.setdefault(item['id'], []).append(item) def revision_metadata_get(self, ids): for id_ in ids: assert isinstance(id_, bytes) for item in self.revision_metadata.get(id_): item = item.copy() tool_id = item.pop('indexer_configuration_id') if tool_id in self.tools: item['tool'] = self.tools[tool_id].copy() else: # HACK: this needs to be removed altogether item['tool'] = { 'id': tool_id, 'name': tool_id[0], 'version': tool_id[1], 'configuration': tool_id[2], } yield item def origin_intrinsic_metadata_add(self, metadata, conflict_update=None): self.added_data.append( ('origin_intrinsic_metadata', conflict_update, metadata)) def content_metadata_get(self, sha1s): assert sha1s == [b'cde'] return CONTENT_METADATA def fill_storage(storage): for origin in ORIGINS: origin = origin.copy() del origin['id'] last_origin_id = storage.origin_add_one(origin) visit = storage.origin_visit_add(last_origin_id, datetime.datetime.now()) for (snap_id, snap_branches) in SNAPSHOTS.items(): storage.snapshot_add(last_origin_id, visit['visit'], { 'id': snap_id, 'branches': snap_branches }) storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID, 'entries': DIRECTORY, }]) class MockStorage(): """Mock a real swh-storage storage to simplify reading indexers' outputs. """ def origin_get(self, id_): for origin in ORIGINS: for (k, v) in id_.items(): if origin[k] != v: break else: # This block is run iff we didn't break, ie. if all supplied # parts of the id are set to the expected value. return origin assert False, id_ def snapshot_get_latest(self, origin_id): if origin_id in SNAPSHOTS: return SNAPSHOTS[origin_id] else: assert False, origin_id def revision_get(self, revisions): return REVISIONS.copy() def directory_ls(self, directory, recursive=False, cur=None): assert directory == DIRECTORY_ID return DIRECTORY class BasicMockStorage(): """In memory implementation to fake the content_get_range api. FIXME: To remove when the actual in-memory lands. """ contents = [] def __init__(self, contents): self.contents = contents def content_get_range(self, start, end, limit=1000): # to make input test data consilient with actual runtime the # other way of doing properly things would be to rewrite all # tests (that's another task entirely so not right now) if isinstance(start, bytes): start = hashutil.hash_to_hex(start) if isinstance(end, bytes): end = hashutil.hash_to_hex(end) results = [] _next_id = None counter = 0 for c in self.contents: _id = c['sha1'] if start <= _id and _id <= end: results.append(c) if counter >= limit: break counter += 1 return { 'contents': results, 'next': _next_id } class BasicMockIndexerStorage(): """Mock Indexer storage to simplify reading indexers' outputs. """ state = [] def _internal_add(self, data, conflict_update=None): """All content indexer have the same structure. So reuse `data` as the same data. It's either mimetype, language, fossology_license, etc... """ self.state = data self.conflict_update = conflict_update def content_mimetype_add(self, data, conflict_update=None): self._internal_add(data, conflict_update=conflict_update) def content_fossology_license_add(self, data, conflict_update=None): self._internal_add(data, conflict_update=conflict_update) def content_language_add(self, data, conflict_update=None): self._internal_add(data, conflict_update=conflict_update) def content_ctags_add(self, data, conflict_update=None): self._internal_add(data, conflict_update=conflict_update) def _internal_get_range(self, start, end, indexer_configuration_id, limit=1000): """Same logic as _internal_add, we retrieve indexed data given an identifier. So the code here does not change even though the underlying data does. """ # to make input test data consilient with actual runtime the # other way of doing properly things would be to rewrite all # tests (that's another task entirely so not right now) if isinstance(start, bytes): start = hashutil.hash_to_hex(start) if isinstance(end, bytes): end = hashutil.hash_to_hex(end) results = [] _next = None counter = 0 for m in self.state: _id = m['id'] _tool_id = m['indexer_configuration_id'] if (start <= _id and _id <= end and _tool_id == indexer_configuration_id): results.append(_id) if counter >= limit: break counter += 1 return { 'ids': results, 'next': _next } def content_mimetype_get_range( self, start, end, indexer_configuration_id, limit=1000): return self._internal_get_range( start, end, indexer_configuration_id, limit=limit) def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000): return self._internal_get_range( start, end, indexer_configuration_id, limit=limit) def indexer_configuration_add(self, tools): return [{ 'id': 10, }] class CommonIndexerNoTool: """Mixin to wronly initialize content indexer""" def prepare(self): super().prepare() self.tools = None class CommonIndexerWithErrorsTest: """Test indexer configuration checks. """ Indexer = None RangeIndexer = None def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool fails check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): print('indexer: %s' % self.Indexer) self.Indexer() def test_wrong_unknown_configuration_tool_range(self): """Range Indexer with unknown configuration tool fails check""" if self.RangeIndexer is not None: with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): self.RangeIndexer() class CommonContentIndexerTest: - def assert_results_ok(self, actual_results, expected_results=None): + def get_indexer_results(self, ids): + """Override this for indexers that don't have a mock storage.""" + return self.indexer.idx_storage.state + + def assert_results_ok(self, sha1s, expected_results=None): + actual_results = self.get_indexer_results(sha1s) + if expected_results is None: expected_results = self.expected_results for indexed_data in actual_results: _id = indexed_data['id'] self.assertEqual(indexed_data, expected_results[_id]) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state - self.assertTrue(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') - self.assertFalse(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } - self.assert_results_ok(actual_results, expected_results) + self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. """ def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results for indexed_data in actual_results: _id = indexed_data['id'] self.assertEqual(indexed_data, expected_results[_id]) self.assertTrue(start <= _id and _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(data_indexed)) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" start, end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results) class NoDiskIndexer: """Mixin to override the DiskIndexer behavior avoiding side-effects in tests. """ def write_to_temp(self, filename, data): # noop return filename def cleanup(self, content_path): # noop return None