diff --git a/docs/dev-info.rst b/docs/dev-info.rst index b8cf648..493b102 100644 --- a/docs/dev-info.rst +++ b/docs/dev-info.rst @@ -1,206 +1,206 @@ Hacking on swh-indexer ====================== This tutorial will guide you through the hacking on the swh-indexer. If you do not have a local copy of the Software Heritage archive, go to the `getting started tutorial `_ Configuration files ------------------- You will need the following YAML configuration files to run the swh-indexer commands: - Orchestrator at ``~/.config/swh/indexer/orchestrator.yml`` .. code-block:: yaml indexers: mimetype: check_presence: false batch_size: 100 - Orchestrator-text at ``~/.config/swh/indexer/orchestrator-text.yml`` .. code-block:: yaml indexers: # language: # batch_size: 10 # check_presence: false fossology_license: batch_size: 10 check_presence: false # ctags: # batch_size: 2 # check_presence: false - Mimetype indexer at ``~/.config/swh/indexer/mimetype.yml`` .. code-block:: yaml # storage to read sha1's metadata (path) # storage: # cls: local # args: # db: "service=swh-dev" # objstorage: # cls: pathslicing # args: # root: /home/storage/swh-storage/ # slicing: 0:1/1:5 storage: cls: remote args: url: http://localhost:5002/ indexer_storage: cls: remote args: url: http://localhost:5007/ # storage to read sha1's content # adapt this to your need # locally: this needs to match your storage's setup objstorage: cls: pathslicing args: slicing: 0:1/1:5 root: /home/storage/swh-storage/ - destination_queue: swh.indexer.tasks.SWHOrchestratorTextContentsTask + destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask - Fossology indexer at ``~/.config/swh/indexer/fossology_license.yml`` .. 
code-block:: yaml # storage to read sha1's metadata (path) # storage: # cls: local # args: # db: "service=swh-dev" # objstorage: # cls: pathslicing # args: # root: /home/storage/swh-storage/ # slicing: 0:1/1:5 storage: cls: remote args: url: http://localhost:5002/ indexer_storage: cls: remote args: url: http://localhost:5007/ # storage to read sha1's content # adapt this to your need # locally: this needs to match your storage's setup objstorage: cls: pathslicing args: slicing: 0:1/1:5 root: /home/storage/swh-storage/ workdir: /tmp/swh/worker.indexer/license/ tools: name: 'nomos' version: '3.1.0rc2-31-ga2cbb8c' configuration: command_line: 'nomossa ' - Worker at ``~/.config/swh/worker.yml`` .. code-block:: yaml task_broker: amqp://guest@localhost// task_modules: - swh.loader.svn.tasks - swh.loader.tar.tasks - swh.loader.git.tasks - swh.storage.archiver.tasks - swh.indexer.tasks - swh.indexer.orchestrator task_queues: - swh_loader_svn - swh_loader_tar - swh_reader_git_to_azure_archive - swh_storage_archive_worker_to_backend - swh_indexer_orchestrator_content_all - swh_indexer_orchestrator_content_text - swh_indexer_content_mimetype - swh_indexer_content_language - swh_indexer_content_ctags - swh_indexer_content_fossology_license - swh_loader_svn_mount_and_load - swh_loader_git_express - swh_loader_git_archive - swh_loader_svn_archive task_soft_time_limit: 0 Database -------- swh-indexer uses a database to store the indexed content. The default db is expected to be called swh-indexer-dev. Create or add ``swh-dev`` and ``swh-indexer-dev`` to the ``~/.pg_service.conf`` and ``~/.pgpass`` files, which are PostgreSQL's configuration files. 
Add data to local DB -------------------- From within the ``swh-environment``, run the following command:: make rebuild-testdata and fetch some real data to work with, using:: python3 -m swh.loader.git.updater --origin-url Then you can list all content files using this script:: #!/usr/bin/env bash psql service=swh-dev -c "copy (select sha1 from content) to stdin" | sed -e 's/^\\\\x//g' Run the indexers ----------------- Use the list of contents to feed the indexers with the following command:: ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all Activate the workers -------------------- To send messages to different queues using rabbitmq (which should already be installed through dependencies installation), run the following command in a dedicated terminal:: python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \ --pool=prefork \ --concurrency=1 \ -Ofair \ --loglevel=info \ --without-gossip \ --without-mingle \ --without-heartbeat 2>&1 With this command rabbitmq will consume messages using the worker configuration file. Note: for the fossology_license indexer, you need a package fossology-nomossa which is in our `public debian repository `_. diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 57bcd3a..858e75a 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,158 +1,158 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import magic from swh.model import hashutil from swh.scheduler import utils from .indexer import ContentIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: A dict with mimetype and encoding key and corresponding values (as bytes). 
""" r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type.encode('utf-8'), 'encoding': r.encoding.encode('utf-8'), } class ContentMimetypeIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ ADDITIONAL_CONFIG = { - 'destination_queue': ('str', None), + 'destination_task': ('str', None), 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() - destination_queue = self.config.get('destination_queue') - if destination_queue: - self.task_destination = utils.get_task(destination_queue) + destination_task = self.config.get('destination_task') + if destination_task: + self.destination_task = utils.get_task(destination_task) else: - self.task_destination = None + self.destination_task = None self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) def _filter_text(self, results): """Filter sha1 whose raw content is text. """ for result in results: if b'binary' in result['encoding']: continue yield result['id'] def next_step(self, results): """When the computations is done, we'd like to send over only text contents to the text content orchestrator. Args: results ([dict]): List of content_mimetype results, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ - if self.task_destination: - self.task_destination.delay(list(self._filter_text(results))) + if self.destination_task: + self.destination_task.delay(list(self._filter_text(results))) @click.command() @click.option('--path', help="Path to execute index on") def main(path): with open(path, 'rb') as f: raw_content = f.read() print(compute_mimetype_encoding(raw_content)) if __name__ == '__main__': main() diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py index 048f309..3990f77 100644 --- a/swh/indexer/tests/test_language.py +++ b/swh/indexer/tests/test_language.py @@ -1,113 +1,113 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer import language from swh.indexer.language import ContentLanguageIndexer from swh.indexer.tests.test_utils import MockObjStorage 
class _MockIndexerStorage(): """Mock storage to simplify reading indexers' outputs. """ def content_language_add(self, languages, conflict_update=None): self.state = languages self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 20, }] class TestLanguageIndexer(ContentLanguageIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { - 'destination_queue': None, + 'destination_task': None, 'rescheduling_task': None, 'tools': { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, } } self.idx_storage = _MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() - self.task_destination = None + self.destination_task = None self.rescheduling_task = self.config['rescheduling_task'] self.tool_config = self.config['tools']['configuration'] self.max_content_size = self.tool_config['max_content_size'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class Language(unittest.TestCase): """ Tests pygments tool for language detection """ def setUp(self): self.maxDiff = None @istest def test_compute_language_none(self): # given self.content = "" self.declared_language = { 'lang': None } # when result = language.compute_language(self.content) # then self.assertEqual(self.declared_language, result) @istest def test_index_content_language_python(self): # given # testing python sha1s = ['02fb2c89e14f7fab46701478c83779c7beb7b069'] lang_indexer = TestLanguageIndexer() # when lang_indexer.run(sha1s, policy_update='ignore-dups') results = lang_indexer.idx_storage.state expected_results = [{ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', 'indexer_configuration_id': 20, 'lang': 'python' }] # then self.assertEqual(expected_results, results) @istest def test_index_content_language_c(self): # 
given # testing c sha1s = ['103bc087db1d26afc3a0283f38663d081e9b01e6'] lang_indexer = TestLanguageIndexer() # when lang_indexer.run(sha1s, policy_update='ignore-dups') results = lang_indexer.idx_storage.state expected_results = [{ 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', 'indexer_configuration_id': 20, 'lang': 'c' }] # then self.assertEqual('c', results[0]['lang']) self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 51913c7..d32e235 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,366 +1,366 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.metadata_dictionary import compute_metadata from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.metadata import ContentMetadataIndexer from swh.indexer.metadata import RevisionMetadataIndexer from swh.indexer.tests.test_utils import MockObjStorage, MockStorage from swh.indexer.tests.test_utils import MockIndexerStorage class TestContentMetadataIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.config.update({ 'rescheduling_task': None, }) self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() - self.task_destination = None + self.destination_task = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class TestRevisionMetadataIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = TestContentMetadataIndexer def prepare(self): self.config = { 'rescheduling_task': None, 'storage': { 'cls': 'remote', 'args': { 'url': 'http://localhost:9999', } }, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': 'npm' } } } self.storage = MockStorage() self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() - self.task_destination = None + self.destination_task = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.content_tool = { 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': 'npm' } } @istest def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" context = "npm" # None if no metadata was found or an error occurred declared_metadata = None # when result = compute_metadata(context, content) # then self.assertEqual(declared_metadata, result) @istest def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given 
content = b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """ declared_metadata = { 'name': 'test_metadata', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} } # when result = compute_metadata("npm", content) # then self.assertEqual(declared_metadata, result) @istest def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ 'name': 'test_1', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} }, { 'name': 'test_0_1', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} }, { 'name': 'test_metadata', 'version': '0.0.1', 'author': 'moranegg', 'other': {} }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { "developmentStatus": None, "version": ['0.0.1'], "operatingSystem": None, "description": ['Simple package.json test for indexer'], "keywords": None, "issueTracker": None, "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "relatedLink": None, "url": None, "license": None, "maintainer": None, "email": None, "softwareRequirements": None, "identifier": None, "codeRepository": [{ 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }] } self.assertEqual(expected_results, results) @istest def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = 
['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5', 'd4c647f0fc257591cc9ba1722484229780d1c607', '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = TestContentMetadataIndexer( tool=self.content_tool, config={}) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = metadata_indexer.idx_storage.state expected_results = [{ 'indexer_configuration_id': 30, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { 'softwareRequirements': { 'JSONStream': '~1.3.1', 'abbrev': '~1.1.0', 'ansi-regex': '~2.1.1', 'ansicolors': '~0.3.2', 'ansistyles': '~0.1.3' }, 'issueTracker': { 'url': 'https://github.com/npm/npm/issues' }, 'author': 'Isaac Z. 
Schlueter (http://blog.izs.me)', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/npm/npm' }, 'description': 'a package manager for JavaScript', 'softwareSuggestions': { 'tacks': '~1.2.6', 'tap': '~10.3.2' }, 'license': 'Artistic-2.0', 'version': '5.0.3', 'other': { 'preferGlobal': True, 'config': { 'publishtest': False } }, 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { 'indexer_configuration_id': 30, 'translated_metadata': None, 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' }] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) @istest def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'npm': [ b'cde' ] } # then self.assertEqual(expected_results, results) @istest def test_revision_metadata_indexer(self): metadata_indexer = TestRevisionMetadataIndexer() sha1_gits = [ b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', ] metadata_indexer.run(sha1_gits, 'update-dups') results = metadata_indexer.idx_storage.state expected_results = [{ 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'translated_metadata': { 'identifier': None, 'maintainer': None, 'url': [ 'https://github.com/librariesio/yarn-parser#readme' ], 'codeRepository': [{ 'type': 'git', 'url': 'git+https://github.com/librariesio/yarn-parser.git' }], 'author': ['Andrew Nesbitt'], 'license': ['AGPL-3.0'], 'version': ['1.0.0'], 'description': [ 'Tiny web service for parsing yarn.lock files' ], 'relatedLink': None, 
'developmentStatus': None, 'operatingSystem': None, 'issueTracker': [{ 'url': 'https://github.com/librariesio/yarn-parser/issues' }], 'softwareRequirements': [{ 'express': '^4.14.0', 'yarn': '^0.21.0', 'body-parser': '^1.15.2' }], 'name': ['yarn-parser'], 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], 'email': None }, 'indexer_configuration_id': 7 }] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index 63f6044..899780e 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,158 +1,158 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class _MockIndexerStorage(): """Mock storage to simplify reading indexers' outputs. """ def content_mimetype_add(self, mimetypes, conflict_update=None): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] class TestMimetypeIndexer(ContentMimetypeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.config = { - 'destination_queue': None, + 'destination_task': None, 'rescheduling_task': None, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } self.idx_storage = _MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() - self.task_destination = None + self.destination_task = None self.rescheduling_task = self.config['rescheduling_task'] - self.destination_queue = self.config['destination_queue'] + self.destination_task = self.config['destination_task'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestMimetypeIndexerUnknownToolStorage(TestMimetypeIndexer): """Specific mimetype whose configuration is not enough to satisfy the indexing tests. """ def prepare(self): super().prepare() self.tools = None class TestMimetypeIndexerWithErrors(unittest.TestCase): @istest def wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool should fail the check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): TestMimetypeIndexerUnknownToolStorage() class TestMimetypeIndexerTest(unittest.TestCase): def setUp(self): self.indexer = TestMimetypeIndexer() @istest def test_index_no_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', ] # when self.indexer.run(sha1s, policy_update='ignore-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertFalse(self.indexer.idx_storage.conflict_update) self.assertEquals(expected_results, self.indexer.idx_storage.state) @istest def test_index_update(self): # given sha1s = [ 
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content ] # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'indexer_configuration_id': 10, 'mimetype': b'application/x-empty', 'encoding': b'binary', }] self.assertTrue(self.indexer.idx_storage.conflict_update) self.assertEquals(expected_results, self.indexer.idx_storage.state) @istest def test_index_one_unknown_sha1(self): # given sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15', '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertTrue(self.indexer.idx_storage.conflict_update) self.assertEquals(expected_results, self.indexer.idx_storage.state)