diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 7831b89..1bf3263 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,563 +1,563 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.model import hashutil from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: sha1 (str): the sha1 name filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :func:`run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :func:`filter`: filter out data already indexed (in storage). :func:`index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :func:`persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :func:`prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :func:`check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :func:`register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. """ super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) if self.config['storage']: self.storage = get_storage(**self.config['storage']) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: List of dict with additional id key. Raises: ValueError if not a list nor a dict. """ tools = self.config['tools'] if isinstance(tools, list): tools = map(self._prepare_tool, tools) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') return self.idx_storage.indexer_configuration_add(tools) @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: a dict that makes sense for the persist_index_computations function. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name') task['next_run'] = datetime.datetime.now() task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def range(self, start, end): """Retrieve indexed contents within range [start, end]. Args **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier Yields: Content identifier (bytes) present in the range [start, end] """ pass def _new(self, start, end, indexed): """Compute new contents in range [start, end] to index given a set of already indexed ids. Args: **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier **indexed** (Set(bytes)): Set of content already indexed. """ while True: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = c['sha1'] if _id in indexed: continue yield _id - next_id = result['next_id'] + next_id = result['next'] if next_id is None: break def run(self, ids, policy_update, **kwargs): """Given a range of content ids, compute the indexing computation on the contents within. Either only new ones (policy_update to 'update-dups') or all (policy_update to 'ignore-dups'. Args: **ids** ([bytes]): a list of 2 elements represeting a range **policy_update** (str): either 'update-dups' to do all contents, or 'ignore-dups' to only compute new ones **kwargs: passed to the `index` method """ if len(ids) != 2: # range raise ValueError('Range of ids expected') results = [] try: [start, end] = ids if policy_update == 'update-dups': # incremental indexed = set(self.range(start, end)) else: indexed = {} for sha1 in self._new(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.results except Exception: self.log.exception( 'Problem when reading contents metadata.') class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, parse_ids=False, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them parse_ids (bool: If `True`, will try to convert `ids` from a human input to the valid type. next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ if parse_ids: ids = [ o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origins %s not found in storage' % list(ids)) continue try: res = self.index(origin, **kwargs) if origin: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % id_) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [id_.encode() if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 2b00afe..bd3a252 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,134 +1,179 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import magic from swh.model import hashutil from swh.scheduler import get_scheduler -from .indexer import ContentIndexer +from .indexer import ContentIndexer, ContentRangeIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: A dict with mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type.encode('utf-8'), 'encoding': r.encoding.encode('utf-8'), } -class ContentMimetypeIndexer(ContentIndexer): - """Indexer in charge of: +class MixinMimetypeIndexer: + """Mixin mimetype indexer. - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {mimetype, encoding} from that content - - store result in storage + See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'scheduler': { 'cls': 'remote', 'args': { 'url': 'http://localhost:5008', }, }, 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() self.scheduler = get_scheduler(**self.config['scheduler']) self.tool = self.tools[0] - def filter(self, ids): - """Filter out known sha1s and return only missing ones. - - """ - yield from self.idx_storage.content_mimetype_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'], - } for sha1 in ids - )) - def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) +class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): + """Mimetype Indexer working on list of content identifiers. + + It: + - (optionally) filters out content already indexed (cf. :callable:`filter`) + - reads content from objstorage per the content's id (sha1) + - computes {mimetype, encoding} from that content + - stores result in storage + + FIXME: + - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer + - 2. Do we keep it afterwards? ~> i think this can be used with the journal + + """ + def filter(self, ids): + """Filter out known sha1s and return only missing ones. + + """ + yield from self.idx_storage.content_mimetype_missing(( + { + 'id': sha1, + 'indexer_configuration_id': self.tool['id'], + } for sha1 in ids + )) + + +class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): + """Mimetype Range Indexer working on range of content identifiers. + + It: + - (optionally) filters out content already indexed (cf :callable:`range`) + - reads content from objstorage per the content's id (sha1) + - computes {mimetype, encoding} from that content + - stores result in storage + + """ + def range(self, start, end): + """Retrieve indexed content id within range [start, end]. + + Args + **start** (bytes): Starting bound from range identifier + **end** (bytes): End range identifier + + Yields: + Content identifier (bytes) present in the range [start, end] + + """ + while True: + result = self.idx_storage.content_mimetype_get_range( + start, end, self.tool['id']) + print('#### result; %s' % result) + contents = result['ids'] + for _id in contents: + yield _id + start = result['next'] + if start is None: + break + + @click.command() @click.option('--path', help="Path to execute index on") def main(path): with open(path, 'rb') as f: raw_content = f.read() print(compute_mimetype_encoding(raw_content)) if __name__ == '__main__': main() diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py index f440d9c..f2c5f52 100644 --- a/swh/indexer/tests/storage/__init__.py +++ b/swh/indexer/tests/storage/__init__.py @@ -1,84 +1,83 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os import path import swh.storage from swh.model.hashutil import MultiHash from hypothesis.strategies import (composite, sets, one_of, uuids, tuples, sampled_from) - SQL_DIR = path.join(path.dirname(swh.indexer.__file__), 'sql') MIMETYPES = [ b'application/json', b'application/octet-stream', b'application/xml', b'text/plain', ] ENCODINGS = [ b'iso8859-1', b'iso8859-15', b'latin1', b'utf-8', ] def gen_mimetype(): """Generate one mimetype strategy. """ return one_of(sampled_from(MIMETYPES)) def gen_encoding(): """Generate one encoding strategy. """ return one_of(sampled_from(ENCODINGS)) @composite def gen_content_mimetypes(draw, *, min_size=0, max_size=100): """Generate valid and consistent content_mimetypes. Context: Test purposes Args: **draw** (callable): Used by hypothesis to generate data **min_size** (int): Minimal number of elements to generate (default: 0) **max_size** (int): Maximal number of elements to generate (default: 100) Returns: List of content_mimetypes as expected by the content_mimetype_add api endpoint. """ _ids = draw( sets( tuples( uuids(), gen_mimetype(), gen_encoding() ), min_size=min_size, max_size=max_size ) ) content_mimetypes = [] for uuid, mimetype, encoding in _ids: content_id = MultiHash.from_data(uuid.bytes, {'sha1'}).digest()['sha1'] content_mimetypes.append({ 'id': content_id, 'mimetype': mimetype, 'encoding': encoding, 'indexer_configuration_id': 1, }) return content_mimetypes diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index 2b46438..0d09bd4 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,148 +1,272 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging -from swh.indexer.mimetype import ContentMimetypeIndexer +from swh.indexer.mimetype import ( + ContentMimetypeIndexer, MimetypeRangeIndexer +) from swh.indexer.tests.test_utils import MockObjStorage +class _MockStorage(): + """In memory implementation to fake the content_get_range api. + + FIXME: To remove when the actual in-memory lands. + + """ + contents = [] + + def __init__(self, contents): + self.contents = contents + + def content_get_range(self, start, end, limit=1000): + results = [] + _next_id = None + counter = 0 + for c in self.contents: + _id = c['sha1'] + if start <= _id and _id <= end: + results.append(c) + if counter >= limit: + break + counter += 1 + + return { + 'contents': results, + 'next': _next_id + } + + class _MockIndexerStorage(): """Mock storage to simplify reading indexers' outputs. """ + state = [] + def content_mimetype_add(self, mimetypes, conflict_update=None): self.state = mimetypes self.conflict_update = conflict_update + def content_mimetype_get_range(self, start, end, indexer_configuration_id, + limit=1000): + """Basic in-memory implementation (limit is unused). + + """ + results = [] + _next = None + counter = 0 + for m in self.state: + _id = m['id'] + _tool_id = m['indexer_configuration_id'] + if start <= _id and _id <= end and \ + _tool_id == indexer_configuration_id: + results.append(_id) + if counter >= limit: + break + counter += 1 + + return { + 'ids': results, + 'next': _next + } + def indexer_configuration_add(self, tools): return [{ 'id': 10, }] class MimetypeTestIndexer(ContentMimetypeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } self.idx_storage = _MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class MimetypeIndexerUnknownToolTestStorage(MimetypeTestIndexer): """Specific mimetype whose configuration is not enough to satisfy the indexing tests. """ def prepare(self): super().prepare() self.tools = None class TestMimetypeIndexerWithErrors(unittest.TestCase): def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool should fail the check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): MimetypeIndexerUnknownToolTestStorage() class TestMimetypeIndexer(unittest.TestCase): def setUp(self): self.indexer = MimetypeTestIndexer() def test_index_no_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', ] # when self.indexer.run(sha1s, policy_update='ignore-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertFalse(self.indexer.idx_storage.conflict_update) self.assertEqual(expected_results, self.indexer.idx_storage.state) def test_index_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content ] # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'indexer_configuration_id': 10, 'mimetype': b'application/x-empty', 'encoding': b'binary', }] self.assertTrue(self.indexer.idx_storage.conflict_update) self.assertEqual(expected_results, self.indexer.idx_storage.state) def test_index_one_unknown_sha1(self): # given sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15', '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertTrue(self.indexer.idx_storage.conflict_update) self.assertEqual(expected_results, self.indexer.idx_storage.state) + + +class MimetypeRangeIndexerTest(MimetypeRangeIndexer): + """Specific mimetype whose configuration is enough to satisfy the + indexing tests. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'file', + 'version': '1:5.30-1+deb9u1', + 'configuration': { + "type": "library", + "debian-package": "python3-magic" + }, + }, + } + self.idx_storage = _MockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + # this hardcodes some contents, will use this to setup the storage + self.objstorage = MockObjStorage() + # sync objstorage and storage + contents = [{'sha1': c_id} for c_id in self.objstorage] + self.storage = _MockStorage(contents) + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class TestMimetypeRangeIndexer(unittest.TestCase): + def setUp(self): + self.indexer = MimetypeRangeIndexerTest() + # will play along with the objstorage's mocked contents for now + self.contents = sorted(self.indexer.objstorage) + # FIXME: leverage swh.objstorage.in_memory_storage's + # InMemoryObjStorage, swh.storage.tests's gen_contents, and + # hypothesis to generate data to actually run indexer on those + + def test_generate_content_mimetype_get_range_no_limit(self): + start, end = [self.contents[0], self.contents[2]] + + # given + actual_results = self.indexer.run( + [start, end], policy_update='update-dups') + + # then + expected_results = [ + {'encoding': b'us-ascii', + 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', + 'indexer_configuration_id': 10, + 'mimetype': b'text/plain'}, + {'encoding': b'us-ascii', + 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', + 'indexer_configuration_id': 10, + 'mimetype': b'text/x-python'}, + {'encoding': b'us-ascii', + 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', + 'indexer_configuration_id': 10, + 'mimetype': b'text/plain'} + ] + + self.assertEqual(expected_results, actual_results) + + for m in actual_results: + _id = m['id'] + self.assertTrue(start <= _id and _id <= end) + _tool_id = m['indexer_configuration_id'] + self.assertEqual(_tool_id, self.indexer.tool['id'])