diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index b37732e..dbf7e15 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,149 +1,146 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil from .language import compute_language -from .indexer import ContentIndexer, DiskIndexer +from .indexer import ContentIndexer, write_to_temp # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # do not sort the output '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. Args: path: path to the file lang: language for that path (optional) Yields: dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } -class CtagsIndexer(ContentIndexer, DiskIndexer): +class CtagsIndexer(ContentIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index the sha1's content and store the result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: a dict representing a content_ctags with keys: - **id** (bytes): content's identifier (sha1) - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) - content_path = self.write_to_temp( - filename=filename, - data=data) - - result = run_ctags(content_path, lang=ctags_lang) - ctags.update({ - 'ctags': list(result), - 'indexer_configuration_id': self.tool['id'], - }) - - self.cleanup(content_path) + with write_to_temp( + filename=filename, data=data, + working_directory=self.working_directory) as content_path: + result = run_ctags(content_path, lang=ctags_lang) + ctags.update({ + 'ctags': list(result), + 'indexer_configuration_id': self.tool['id'], + }) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage.
Args: results ([dict]): list of content_ctags, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 0860a40..017d918 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,176 +1,172 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess from swh.model import hashutil -from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer +from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp def compute_license(path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: dict: A dict with the following keys: - licenses ([str]): licenses detected for that path - path (bytes): content filepath """ try: properties = subprocess.check_output(['nomossa', path], universal_newlines=True) if properties: res = properties.rstrip().split(' contains license(s) ') licenses = res[1].split(',') else: licenses = [] return { 'licenses': licenses, 'path': path, } except subprocess.CalledProcessError: if log: from os import path as __path log.exception('Problem during license detection for sha1 %s' % __path.basename(path)) return { 'licenses': [], 'path': path, } class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. See :class:`FossologyLicenseIndexer` and :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] def index(self, id, data): """Index the sha1's content and store the result. Args: id (bytes): content's identifier data (bytes): raw content associated with the content id Returns: dict: A dict, representing a content_license, with keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path - indexer_configuration_id (int): tool used to compute the output """ assert isinstance(id, bytes) - content_path = self.write_to_temp( - filename=hashutil.hash_to_hex(id), # use the id as pathname - data=data) - - try: + with write_to_temp( + filename=hashutil.hash_to_hex(id), # use the id as pathname + data=data, + working_directory=self.working_directory) as content_path: properties = compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) - finally: - self.cleanup(content_path) - return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage.
Args: results ([dict]): list of content_license, dict with the following keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) class FossologyLicenseIndexer( - MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): + MixinFossologyLicenseIndexer, ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing the license from that content - storing the result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class FossologyLicenseRangeIndexer( - MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): + MixinFossologyLicenseIndexer, ContentRangeIndexer): """FossologyLicense Range Indexer working on a range of content identifiers. - filters out non-textual content - (optionally) filters out content already indexed (cf :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes the license from that content - stores the result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content ids within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Returns: dict: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 766d278..a779951 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,637 +1,619 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import ast import os import logging import shutil import tempfile import datetime from copy import deepcopy +from contextlib import contextmanager from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils -class DiskIndexer: - """Mixin intended to be used with other SomethingIndexer classes. +@contextmanager +def write_to_temp(filename, data, working_directory): + """Write the sha1's content in a temporary file. - Indexers inheriting from this class are a category of indexers - which needs the disk for their computations. + Args: + filename (str): one of sha1's many filenames + data (bytes): the sha1's content to write in the temporary + file + working_directory (str): directory in which the temporary + file is created - Note: - This expects `self.working_directory` variable defined at - runtime. + Yields: + The path to the temporary file, filled in with the raw + content's data. The enclosing temporary directory is removed + when the context manager exits.
""" - def write_to_temp(self, filename, data): - """Write the sha1's content in a temporary file. + os.makedirs(working_directory, exist_ok=True) + temp_dir = tempfile.mkdtemp(dir=working_directory) + content_path = os.path.join(temp_dir, filename) - Args: - filename (str): one of sha1's many filenames - data (bytes): the sha1's content to write in temporary - file - - Returns: - The path to the temporary file created. That file is - filled in with the raw content's data. - - """ - os.makedirs(self.working_directory, exist_ok=True) - temp_dir = tempfile.mkdtemp(dir=self.working_directory) - content_path = os.path.join(temp_dir, filename) - - with open(content_path, 'wb') as f: - f.write(data) + with open(content_path, 'wb') as f: + f.write(data) - return content_path - - def cleanup(self, content_path): - """Remove content_path from working directory. - - Args: - content_path (str): the file to remove - - """ - temp_dir = os.path.dirname(content_path) - shutil.rmtree(temp_dir) + yield content_path + shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :meth:`~BaseIndexer.run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :meth:`~BaseIndexer.filter`: filter out data already indexed (in storage). :meth:`~BaseIndexer.index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :meth:`~BaseIndexer.persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :meth:`~BaseIndexer.prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :meth:`~BaseIndexer.check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :meth:`~BaseIndexer.register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} USE_TOOLS = True def __init__(self, config=None, **kw): """Prepare and check that the indexer is ready to run. 
""" super().__init__() if config is not None: self.config = config else: config_keys = ('base_filename', 'config_filename', 'additional_configs', 'global_config') config_args = {k: v for k, v in kw.items() if k in config_keys} self.config = self.parse_config_file(**config_args) self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') if self.USE_TOOLS: self.tools = list(self.register_tools( self.config.get('tools', []))) self.results = [] @property def tool(self): return self.tools[0] def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if self.USE_TOOLS and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: list: List of dicts with additional id key. Raises: ValueError: if not a list nor a dict. """ if isinstance(tools, list): tools = list(map(self._prepare_tool, tools)) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') if tools: return self.idx_storage.indexer_configuration_add(tools) else: return [] @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: dict: a dict that makes sense for the :meth:`.persist_index_computations` method. """ pass def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ yield from ids @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. 
Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name', None) task['next_run'] = datetime.datetime.now() if result_name: task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids (Iterable[Union[bytes, str]]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. **kwargs: passed to the `index` method """ ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: bytes: Identifier of contents to index. 
""" if not isinstance(start, bytes) or not isinstance(end, bytes): raise TypeError('identifiers must be bytes, not %r and %r.' % (start, end)) while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = hashutil.hash_to_bytes(c['sha1']) if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: dict: Data indexed to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: if not isinstance(res['id'], bytes): raise TypeError( '%r.index should return ids as bytes, not %r' % (self.__class__.__name__, res['id'])) yield res def _index_with_skipping_already_done(self, start, end): """Index not already indexed contents in range [start, end]. Args: start** (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` which are not already indexed. """ while start: indexed_page = self.indexed_contents_in_range(start, end) contents = indexed_page['ids'] _end = contents[-1] if contents else end yield from self._index_contents( start, _end, contents) start = indexed_page['next'] def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). Args: start (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier skip_existing (bool): Skip existing indexed data (default) or not **kwargs: passed to the `index` method Returns: bool: True if data was indexed, False otherwise. """ with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: gen = self._index_with_skipping_already_done(start, end) else: gen = self._index_contents(start, end, indexed=[]) for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True except Exception: self.log.exception( 'Problem when computing metadata.') finally: return with_indexed_data class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update='update-dups', parse_ids=True, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. 
policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ if parse_ids: ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, str): # Data coming from JSON, which requires string keys, so # one extra level of deserialization is needed id_ = ast.literal_eval(id_) if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origin %s not found in storage' % list(id_)) continue try: res = self.index(origin, **kwargs) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % (id_,)) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py index 6a02b9e..aa1b175 100644 --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -1,192 +1,192 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from unittest.mock import patch import swh.indexer.ctags from swh.indexer.ctags import ( CtagsIndexer, run_ctags ) from swh.indexer.tests.utils import ( CommonContentIndexerTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, - SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG, + SHA1_TO_CTAGS, BASE_TEST_CONFIG, OBJ_STORAGE_DATA, fill_storage, fill_obj_storage ) class BasicTest(unittest.TestCase): @patch('swh.indexer.ctags.subprocess') def 
test_run_ctags(self, mock_subprocess): """Computing licenses from a raw content should return results """ output0 = """ {"name":"defun","kind":"function","line":1,"language":"scheme"} {"name":"name","kind":"symbol","line":5,"language":"else"}""" output1 = """ {"name":"let","kind":"var","line":10,"language":"something"}""" expected_result0 = [ { 'name': 'defun', 'kind': 'function', 'line': 1, 'lang': 'scheme' }, { 'name': 'name', 'kind': 'symbol', 'line': 5, 'lang': 'else' } ] expected_result1 = [ { 'name': 'let', 'kind': 'var', 'line': 10, 'lang': 'something' } ] for path, lang, intermediary_result, expected_result in [ (b'some/path', 'lisp', output0, expected_result0), (b'some/path/2', 'markdown', output1, expected_result1) ]: mock_subprocess.check_output.return_value = intermediary_result actual_result = list(run_ctags(path, lang=lang)) self.assertEqual(actual_result, expected_result) class InjectCtagsIndexer: """Override ctags computations. """ def compute_ctags(self, path, lang): """Inject fake ctags given path (sha1 identifier). """ return { 'lang': lang, **SHA1_TO_CTAGS.get(path) } -class CtagsIndexerTest(NoDiskIndexer, InjectCtagsIndexer, CtagsIndexer): +class CtagsIndexerTest(InjectCtagsIndexer, CtagsIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no ''' ''' --links=no ''', 'max_content_size': 1000, }, }, 'languages': { 'python': 'python', 'haskell': 'haskell', 'bar': 'bar', }, - 'workdir': '/nowhere', + 'workdir': '/tmp', } class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): """Ctags indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ legacy_get_format = True def get_indexer_results(self, ids): yield from self.idx_storage.content_ctags_get(ids) def setUp(self): super().setUp() self.indexer = CtagsIndexerTest() self.idx_storage = self.indexer.idx_storage fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) # Prepare test input self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = 'd4c647f0fc257591cc9ba1722484229780d1c607' self.id2 = '688a5ef812c53907562fe379d4b3851e69c7cb15' tool = {k.replace('tool_', ''): v for (k, v) in self.indexer.tool.items()} self.expected_results = { self.id0: { 'id': self.id0, 'tool': tool, **SHA1_TO_CTAGS[self.id0][0], }, self.id1: { 'id': self.id1, 'tool': tool, **SHA1_TO_CTAGS[self.id1][0], }, self.id2: { 'id': self.id2, 'tool': tool, **SHA1_TO_CTAGS[self.id2][0], } } self._set_mocks() def _set_mocks(self): def find_ctags_for_content(raw_content): for (sha1, ctags) in SHA1_TO_CTAGS.items(): if OBJ_STORAGE_DATA[sha1] == raw_content: return ctags else: raise ValueError(('%r not found in objstorage, can\'t mock ' 'its ctags.') % raw_content) def fake_language(raw_content, *args, **kwargs): ctags = find_ctags_for_content(raw_content) return {'lang': ctags[0]['lang']} self._real_compute_language = swh.indexer.ctags.compute_language swh.indexer.ctags.compute_language = fake_language def fake_check_output(cmd, *args, **kwargs): print(cmd) - id_ = cmd[-1] # when using NoDiskIndexer, path is replaced by id + id_ = cmd[-1].split('/')[-1] return '\n'.join( json.dumps({'language': ctag['lang'], **ctag}) for ctag in SHA1_TO_CTAGS[id_]) self._real_check_output = 
swh.indexer.ctags.subprocess.check_output swh.indexer.ctags.subprocess.check_output = fake_check_output def tearDown(self): swh.indexer.ctags.compute_language = self._real_compute_language swh.indexer.ctags.subprocess.check_output = self._real_check_output super().tearDown() class CtagsIndexerUnknownToolTestStorage( CommonIndexerNoTool, CtagsIndexerTest): """Fossology license indexer with wrong configuration""" class TestCtagsIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = CtagsIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index c47be8b..697542b 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,208 +1,208 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from unittest.mock import patch from swh.indexer import fossology_license from swh.indexer.fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer, compute_license ) from swh.indexer.tests.utils import ( SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest, - CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer, + CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) class BasicTest(unittest.TestCase): @patch('swh.indexer.fossology_license.subprocess') def test_compute_license(self, mock_subprocess): """Computing licenses from a raw content should return results """ for path, intermediary_result, output in [ (b'some/path', None, []), (b'some/path/2', [], []), (b'other/path', ' contains license(s) GPL,AGPL', ['GPL', 'AGPL'])]: mock_subprocess.check_output.return_value = intermediary_result actual_result = compute_license(path, log=None) self.assertEqual(actual_result, { 'licenses': output, 'path': path, }) def mock_compute_license(path, log=None): """path is the content identifier """ if isinstance(id, bytes): path = path.decode('utf-8') + # path is something like /tmp/tmpXXX/ so we keep only the sha1 part + path = path.split('/')[-1] return { 'licenses': SHA1_TO_LICENSES.get(path) } -class FossologyLicenseTestIndexer( - NoDiskIndexer, FossologyLicenseIndexer): +class FossologyLicenseTestIndexer(FossologyLicenseIndexer): """Specific fossology license whose configuration is enough to satisfy the indexing checks. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, - 'workdir': '/nowhere', + 'workdir': '/tmp', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, } class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): """Language indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ def get_indexer_results(self, ids): yield from self.idx_storage.content_fossology_license_get(ids) def setUp(self): super().setUp() # replace actual license computation with a mock self.orig_compute_license = fossology_license.compute_license fossology_license.compute_license = mock_compute_license self.indexer = FossologyLicenseTestIndexer() self.idx_storage = self.indexer.idx_storage fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content tool = {k.replace('tool_', ''): v for (k, v) in self.indexer.tool.items()} # then self.expected_results = { self.id0: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id0], }, self.id1: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id1], }, self.id2: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id2], } } def tearDown(self): super().tearDown() fossology_license.compute_license = self.orig_compute_license -class FossologyLicenseRangeIndexerTest( - NoDiskIndexer, FossologyLicenseRangeIndexer): +class FossologyLicenseRangeIndexerTest(FossologyLicenseRangeIndexer): """Testing the range indexer on fossology license. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, - 'workdir': '/nowhere', + 'workdir': '/tmp', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, 'write_batch_size': 100, } class TestFossologyLicenseRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Fossology License Indexer tests. 
- new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): super().setUp() # replace actual license computation with a mock self.orig_compute_license = fossology_license.compute_license fossology_license.compute_license = mock_compute_license self.indexer = FossologyLicenseRangeIndexerTest() fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id0] }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id1] }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id2] } } def tearDown(self): super().tearDown() fossology_license.compute_license = self.orig_compute_license class FossologyLicenseIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseTestIndexer): """Fossology license indexer with wrong configuration""" class FossologyLicenseRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseRangeIndexerTest): """Fossology license range indexer with wrong configuration""" class TestFossologyLicenseIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = FossologyLicenseIndexerUnknownToolTestStorage RangeIndexer = FossologyLicenseRangeIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index cf5e85b..63fce88 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,673 +1,660 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import hashlib import random from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': 
b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'target_type': 'revision'}}} } REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'author': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' DIRECTORY = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'cde' }, { 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { 
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', '636465': b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """ } CONTENT_METADATA = [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'NpmMapping' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'schema:author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }] def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) 
in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): for origin in ORIGINS: origin = origin.copy() del origin['id'] storage.origin_add_one(origin) for (orig_pseudo_id, snap) in SNAPSHOTS.items(): for orig in ORIGINS: if orig_pseudo_id == orig['id']: origin_id = storage.origin_get( {'type': orig['type'], 'url': orig['url']})['id'] break else: assert False visit = storage.origin_visit_add(origin_id, datetime.datetime.now()) snap_id = snap.get('id') or \ bytes([random.randint(0, 255) for _ in range(32)]) storage.snapshot_add(origin_id, visit['visit'], { 'id': snap_id, 'branches': snap['branches'] }) storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID, 'entries': DIRECTORY, }]) for (obj_id, content) in OBJ_STORAGE_DATA.items(): # TODO: use MultiHash if hasattr(hashlib, 'blake2s'): blake2s256 = hashlib.blake2s(content, digest_size=32).digest() else: # fallback for Python <3.6 blake2s256 = bytes([random.randint(0, 255) for _ in range(32)]) storage.content_add([{ 'data': content, 'length': len(content), 'status': 'visible', 'sha1': hash_to_bytes(obj_id), 'sha1_git': hash_to_bytes(obj_id), 'sha256': hashlib.sha256(content).digest(), 'blake2s256': blake2s256 }]) class CommonIndexerNoTool: """Mixin to wronly initialize content indexer""" def prepare(self): super().prepare() self.tools = None class CommonIndexerWithErrorsTest: """Test indexer configuration checks. """ Indexer = None RangeIndexer = None def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool fails check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): print('indexer: %s' % self.Indexer) self.Indexer() def test_wrong_unknown_configuration_tool_range(self): """Range Indexer with unknown configuration tool fails check""" if self.RangeIndexer is not None: with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): self.RangeIndexer() class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True iff the tested indexer uses the legacy format. 
see: https://forge.softwareheritage.org/T1433""" def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: _id = indexed_data['id'] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data['id'] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data['id'] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data['id'] = hash_to_hex(indexed_data['id']) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed))) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results) - - -class NoDiskIndexer: - """Mixin to override the DiskIndexer behavior avoiding side-effects in - tests. - - """ - - def write_to_temp(self, filename, data): # noop - return filename - - def cleanup(self, content_path): # noop - return None