diff --git a/README b/README index 75db696..b8c009c 100644 --- a/README +++ b/README @@ -1,71 +1,79 @@ swh-indexer =========== Tools to compute multiple indexes on SWH's raw contents: - mimetype - ctags - language - fossology-license - +- metadata # Context SWH has currently stored around 3B contents. The table `content` holds their checksums. Those contents are physically stored in an object storage (using disks) and replicated in another. Those object storages are not destined for reading yet. We are in the process to copy those contents over to azure's blob storages. As such, we will use that opportunity to trigger the computations on these contents once those have been copied over. # Workers There exists 2 kinds: - orchestrators (orchestrator, orchestrator-text) - indexer (mimetype, language, ctags, fossology-license) ## Orchestrator Orchestrators: - receive batch of sha1s - split those batches - broadcast those to indexers There are 2 sorts: - orchestrator (swh_indexer_orchestrator_content_all): Receives and broadcast sha1 ids (of contents) to indexers (currently only the mimetype indexer) - orchestrator-text (swh_indexer_orchestrator_content_text): Receives batch of sha1 ids (of textual contents) and broadcast those to indexers (currently language, ctags, and fossology-license indexers). 
## Indexers Indexers: -- receive batch of sha1 -- retrieve the associated content from the blob storage -- compute for that content some index +- receive batch of ids +- retrieve the associated data depending on object type +- compute for that object some index - store the result to swh's storage - (and possibly do some broadcast itself) -Current indexers: +Current content indexers: - mimetype (queue swh_indexer_content_mimetype): compute the mimetype, filter out the textual contents and broadcast the list to the orchestrator-text - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): try and compute tags information - fossology-license (queue swh_indexer_fossology_license): try and compute the license + +- metadata: translate a file into a translated_metadata dict + +Current revision indexers: + +- metadata: detect files containing metadata and retrieve their + translated_metadata from the content_metadata table in storage, or run the + content indexer to translate the files. diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index d7a6f1d..aea186a 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,112 +1,113 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.objstorage.exc import ObjNotFoundError class MockStorageWrongConfiguration(): def indexer_configuration_get(self, tool): return None class MockObjStorage(): """Mock objstorage with predefined contents. 
""" def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb50': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_get(self, tool): return { 'id': 10, } """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. 
Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } + """, + 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """ - }
     def get(self, sha1):
         raw_content = self.data.get(sha1)
-        if not raw_content:
+        if raw_content is None:  # b'' is valid (empty) content; only a missing key is "not found"
             raise ObjNotFoundError()
         return raw_content