diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -17,6 +17,53 @@
 from swh.scheduler.utils import get_task
 
 
+class DiskIndexer:
+    """Mixin intended to be used with other *Indexer classes.
+
+    Indexers inheriting from this class are a category of indexers
+    which need the disk for their computations.
+
+    Expects:
+        self.working_directory variable defined at runtime.
+
+    """
+    def __init__(self):
+        super().__init__()
+
+    def write_to_temp(self, filename, data):
+        """Write the sha1's content in a temporary file.
+
+        Args:
+            filename (str): one of the sha1's many filenames
+            data (bytes): the sha1's raw content, to be written in
+            a temporary file inside the indexer's working
+            directory
+
+        Returns:
+            The path to the temporary file created. That file is
+            filled in with the raw content's data.
+
+        """
+        os.makedirs(self.working_directory, exist_ok=True)
+        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
+        content_path = os.path.join(temp_dir, filename)
+
+        with open(content_path, 'wb') as f:
+            f.write(data)
+
+        return content_path
+
+    def cleanup(self, content_path):
+        """Remove content_path from working directory.
+
+        Args:
+            content_path (str): the file to remove
+
+        """
+        temp_dir = os.path.dirname(content_path)
+        shutil.rmtree(temp_dir)
+
+
 class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
     """Base class for indexers to inherit from.
 
@@ -257,50 +304,3 @@
         if self.rescheduling_task:
             self.log.warn('Rescheduling batch')
             self.rescheduling_task.delay(sha1s, policy_update)
-
-
-class DiskIndexer:
-    """Mixin intended to be used with other *Indexer classes.
-
-    Indexer* inheriting from this class are a category of indexers
-    which needs the disk for their computations.
-
-    Expects:
-        self.working_directory variable defined at runtime.
-
-    """
-    def __init__(self):
-        super().__init__()
-
-    def write_to_temp(self, filename, data):
-        """Write the sha1's content in a temporary file.
-
-        Args:
-            sha1 (str): the sha1 name
-            filename (str): one of sha1's many filenames
-            data (bytes): the sha1's content to write in temporary
-            file
-
-        Returns:
-            The path to the temporary file created. That file is
-            filled in with the raw content's data.
-
-        """
-        os.makedirs(self.working_directory, exist_ok=True)
-        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
-        content_path = os.path.join(temp_dir, filename)
-
-        with open(content_path, 'wb') as f:
-            f.write(data)
-
-        return content_path
-
-    def cleanup(self, content_path):
-        """Remove content_path from working directory.
-
-        Args:
-            content_path (str): the file to remove
-
-        """
-        temp_dir = os.path.dirname(content_path)
-        shutil.rmtree(temp_dir)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata.py
@@ -0,0 +1,84 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from .indexer import BaseIndexer
+from swh.indexer.metadata_dictionary import compute_metadata
+
+
+class ContentMetadataIndexer(BaseIndexer):
+    """Indexer in charge of:
+    - filtering out content already indexed
+    - reading content from objstorage with the content's id sha1
+    - computing translated_metadata for the given context
+    - using the metadata dictionary and a tool for each context
+    - storing the result in storage
+    """
+    CONFIG_BASE_FILENAME = 'indexer/metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('dict', {
+            'name': 'hard_mapping_npm',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'test',
+                'debian-package': ''
+            },
+        }),
+    }
+
+    def prepare(self):
+        super().prepare()
+
+    def filter_contents(self, sha1s):
+        """Filter out known sha1s and return only missing ones.
+
+        """
+        yield from self.storage.content_metadata_missing((
+            {
+                'id': sha1,
+                'indexer_configuration_id': self.tools['id'],
+            } for sha1 in sha1s
+        ))
+
+    def index_content(self, sha1, raw_content):
+        """Index the sha1's content and store the result.
+
+        Args:
+            sha1 (bytes): content's identifier
+            raw_content (bytes): raw content in bytes
+
+        Returns:
+            A dict, representing a content_metadata, with keys:
+            TODO
+
+        """
+        result = {
+            'id': sha1,
+            'indexer_configuration_id': self.tools['id'],
+            'translated_metadata': None
+        }
+        try:
+            context = self.tools['name']
+            result['translated_metadata'] = compute_metadata(
+                context, raw_content)
+        except KeyError:
+            self.log.exception(
+                "Problem during tool retrieval of metadata translation")
+        return result
+
+    def persist_index_computations(self, results, policy_update):
+        """Persist the results in storage.
+
+        Args:
+            results ([dict]): list of content_metadata dicts with the
+            following keys:
+                - id (bytes): content's identifier (sha1)
+                - translated_metadata (jsonb): detected metadata
+            policy_update (str): either 'update-dups' or 'ignore-dups' to
+            respectively update duplicates or ignore them
+
+        """
+        self.storage.content_metadata_add(
+            results, conflict_update=(policy_update == 'update-dups'))
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary.py
@@ -0,0 +1,152 @@
+
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import json
+
+npm_mapping = {
+    'repository': 'codeRepository',
+    'os': 'operatingSystem',
+    'cpu': 'processorRequirements',
+    'engines': 'processorRequirements',
+    'dependencies': 'softwareRequirements',
+    'bundleDependencies': 'softwareRequirements',
+    'peerDependencies': 'softwareRequirements',
+    'author': 'author',
+    'contributor': 'contributor',
+    'keywords': 'keywords',
+    'license': 'license',
+    'version': 'version',
+    'description': 'description',
+    'name': 'name',
+    'devDependencies': 'softwareSuggestions',
+    'optionalDependencies': 'softwareSuggestions',
+    'bugs': 'issueTracker',
+    'homepage': 'url'
+}
+
+# TODO complete pom/doap hard mapping / find pom tool (iterate on all contexts)
+pom_mapping = {
+    'license': 'license',
+    'version': 'version',
+    'description': 'description',
+    'name': 'name',
+    'prerequisites': 'softwareRequirements'
+}
+doap_mapping = {
+
+}
+
+
+def convert(raw_content):
+    if isinstance(raw_content, bytes):
+        return convert(raw_content.decode())
+    if isinstance(raw_content, str):
+        try:
+            content = json.loads(raw_content)
+            if content:
+                return content
+            else:
+                return None
+        except json.decoder.JSONDecodeError:
+            return None
+
+    if isinstance(raw_content, dict):
+        return raw_content
+
+
+def compute_metadata(context, raw_content):
+    """
+    First landing method: a dispatcher that sends the content
+    to the right function to carry out the actual parsing of syntax
+    and translation of terms
+    Args:
+        - context (text): defines to which function/tool
+          the content is sent
+        - raw_content (text): the string form of the raw content
+
+    Returns:
+        - translated_metadata (dict): jsonb form needed for the indexer
+          to store in storage
+
+    """
+    content = convert(raw_content)
+    if content is None:
+        return None
+    translated_metadata = mapping_tool_fn[context](content)
+    return translated_metadata
+
+
+def parse_xml(content):
+    """
+    Parses content from xml to a python dict
+    Args:
+        - content (text): the string form of the raw content (in xml)
+
+    Returns:
+        - parsed_xml (dict): a python dict of the content after parsing
+    """
+    # TODO: check that the content is valid xml
+    # TODO: use an xml parser to build the dict
+    return content
+
+
+def translate(content_dict, mapping):
+    """
+    Translates the content_dict by mapping its terms to the
+    vocabulary given by mapping (for now a hard-coded npm mapping)
+    Args:
+        - content_dict (dict): content to translate, as a parsed json dict
+
+    Returns:
+        - translated_metadata (dict): jsonb form needed for the indexer
+    """
+    translated_metadata = {}
+    # TODO: keep mapping out of code (maybe fetch crosswalk from storage?)
+    # if fetched from storage, it should be done once per batch of sha1s
+    default = 'other'
+    translated_metadata['other'] = {}
+    for k, v in content_dict.items():
+        try:
+            term = mapping.get(k, default)
+            if term not in translated_metadata:
+                translated_metadata[term] = v
+                continue
+            if isinstance(translated_metadata[term], str):
+                in_value = translated_metadata[term]
+                translated_metadata[term] = [in_value, v]
+                continue
+            if isinstance(translated_metadata[term], list):
+                translated_metadata[term].append(v)
+                continue
+            if isinstance(translated_metadata[term], dict):
+                translated_metadata[term][k] = v
+                continue
+
+        except KeyError:
+            continue
+    return translated_metadata
+
+
+mapping_tool_fn = {
+    "hard_mapping_npm": lambda content: translate(content, npm_mapping),
+    "pom_xml": lambda content: translate(parse_xml(content), pom_mapping),
+    "doap_xml": lambda content: translate(parse_xml(content), doap_mapping)
+}
+
+
+def main():
+    raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
+    raw_content1 = b"""{"name": "test_name",
+    "unknown_term": "ut",
+    "prerequisites": "packageXYZ"}"""
+    result = compute_metadata("hard_mapping_npm", raw_content)
+    result1 = compute_metadata("pom_xml", raw_content1)
+
+    print(result)
+    print(result1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_metadata.py
@@ -0,0 +1,195 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+from nose.tools import istest
+
+from swh.indexer.metadata_dictionary import compute_metadata
+from swh.indexer.metadata import ContentMetadataIndexer
+from swh.indexer.tests.test_utils import MockObjStorage
+
+
+class MockStorage():
+    """Mock storage to simplify reading indexers' outputs.
+    """
+    def content_metadata_add(self, metadata, conflict_update=None):
+        self.state = metadata
+        self.conflict_update = conflict_update
+
+    def indexer_configuration_get(self, tool):
+        return {
+            'id': 30,
+            'name': 'hard_mapping_npm',
+            'version': '0.1'
+        }
+
+
+class TestMetadataIndexer(ContentMetadataIndexer):
+    """Specific metadata indexer whose configuration is enough to satisfy
+       the indexing tests.
+    """
+    def prepare(self):
+        self.config = {
+            'rescheduling_task': None,
+            'tools': {
+                'name': 'hard_mapping_npm',
+                'version': '0.1',
+                'configuration': {
+                    'type': 'local',
+                    'debian-package': ''
+                }
+            }
+        }
+        self.storage = MockStorage()
+        self.log = logging.getLogger('swh.indexer')
+        self.objstorage = MockObjStorage()
+        self.task_destination = None
+        self.rescheduling_task = self.config['rescheduling_task']
+        self.tools = self.retrieve_tools_information()
+
+
+class Metadata(unittest.TestCase):
+    """
+    Tests the hard_mapping_npm mapping tool for metadata detection
+    """
+    def setUp(self):
+        self.maxDiff = None
+
+    @istest
+    def test_compute_metadata_none(self):
+        """
+        testing compute_metadata with empty content:
+        should return None
+        """
+        # given
+        content = b""
+        tool = "hard_mapping_npm"
+
+        # should it be empty {} or None if no metadata was found?
+        declared_metadata = None
+        # when
+        result = compute_metadata(tool, content)
+        # then
+        self.assertEqual(declared_metadata, result)
+
+    @istest
+    def test_compute_metadata_npm(self):
+        """
+        testing only computation of metadata with hard_mapping_npm
+        """
+        # given
+        content = b"""
+            {
+                "name": "test_metadata",
+                "version": "0.0.1",
+                "description": "Simple package.json test for indexer",
+                "repository": {
+                    "type": "git",
+                    "url": "https://github.com/moranegg/metadata_test"
+                }
+            }
+        """
+        declared_metadata = {
+            'name': 'test_metadata',
+            'version': '0.0.1',
+            'description': 'Simple package.json test for indexer',
+            'codeRepository': {
+                'type': 'git',
+                'url': 'https://github.com/moranegg/metadata_test'
+            },
+            'other': {}
+        }
+
+        # when
+        result = compute_metadata("hard_mapping_npm", content)
+        # then
+        self.assertEqual(declared_metadata, result)
+
+    @istest
+    def test_index_content_metadata_npm(self):
+        """
+        testing NPM metadata indexing with package.json files
+        - one sha1 uses a file that can't be translated to metadata and
+          should return None in the translated metadata
+        """
+        # given
+        sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
+                 'd4c647f0fc257591cc9ba1722484229780d1c607',
+                 '02fb2c89e14f7fab46701478c83779c7beb7b069']
+        # this metadata indexer computes only metadata for package.json
+        # in npm context with a hard mapping
+        metadata_indexer = TestMetadataIndexer()
+
+        # when
+        metadata_indexer.run(sha1s, policy_update='ignore-dups')
+        results = metadata_indexer.storage.state
+
+        expected_results = [{
+            'indexer_configuration_id': 30,
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
+        }, {
+            'indexer_configuration_id': 30,
+            'translated_metadata': {
+                'softwareRequirements': [
+                    'abbrev',
+                    'ansi-regex',
+                    {
+                        'JSONStream': '~1.3.1',
+                        'abbrev': '~1.1.0',
+                        'ansi-regex': '~2.1.1',
+                        'ansicolors': '~0.3.2',
+                        'ansistyles': '~0.1.3'
+                    }
+                ],
+                'issueTracker': {
+                    'url': 'https://github.com/npm/npm/issues'
+                },
+                'author':
+                    'Isaac Z. Schlueter (http://blog.izs.me)',
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/npm/npm'
+                },
+                'description': 'a package manager for JavaScript',
+                'softwareSuggestions': {
+                    'tacks': '~1.2.6',
+                    'tap': '~10.3.2'
+                },
+                'license': 'Artistic-2.0',
+                'version': '5.0.3',
+                'other': {
+                    'preferGlobal': True,
+                    'config': {
+                        'publishtest': False
+                    }
+                },
+                'name': 'npm',
+                'keywords': [
+                    'install',
+                    'modules',
+                    'package manager',
+                    'package.json'
+                ],
+                'url': 'https://docs.npmjs.com/'
+            },
+            'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
+        }, {
+            'indexer_configuration_id': 30,
+            'translated_metadata': None,
+            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
+        }]
+        # then
+        self.assertEqual(expected_results, results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -51,6 +51,60 @@
     '93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
     (should 'pygments (recognize 'lisp 'easily))
 
+    """,
+    '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
+    {
+        "name": "test_metadata",
+        "version": "0.0.1",
+        "description": "Simple package.json test for indexer",
+        "repository": {
+            "type": "git",
+            "url": "https://github.com/moranegg/metadata_test"
+        }
+    }
+    """,
+    'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
+    {
+        "version": "5.0.3",
+        "name": "npm",
+        "description": "a package manager for JavaScript",
+        "keywords": [
+            "install",
+            "modules",
+            "package manager",
+            "package.json"
+        ],
+        "preferGlobal": true,
+        "config": {
+            "publishtest": false
+        },
+        "homepage": "https://docs.npmjs.com/",
+        "author": "Isaac Z. Schlueter (http://blog.izs.me)",
+        "repository": {
+            "type": "git",
+            "url": "https://github.com/npm/npm"
+        },
+        "bugs": {
+            "url": "https://github.com/npm/npm/issues"
+        },
+        "dependencies": {
+            "JSONStream": "~1.3.1",
+            "abbrev": "~1.1.0",
+            "ansi-regex": "~2.1.1",
+            "ansicolors": "~0.3.2",
+            "ansistyles": "~0.1.3"
+        },
+        "bundleDependencies": [
+            "abbrev",
+            "ansi-regex"
+        ],
+        "devDependencies": {
+            "tacks": "~1.2.6",
+            "tap": "~10.3.2"
+        },
+        "license": "Artistic-2.0"
+    }
+    """
 }
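For reference, a minimal usage sketch of the dispatcher added in swh/indexer/metadata_dictionary.py above, assuming the patched module is importable from an swh.indexer checkout; the package.json payload below is illustrative and is not one of the test fixtures:

    # Illustrative sketch only: translate a package.json payload with the
    # npm hard mapping registered as 'hard_mapping_npm' in mapping_tool_fn.
    from swh.indexer.metadata_dictionary import compute_metadata

    raw_content = b'''{
        "name": "example",
        "repository": {"type": "git", "url": "https://example.org/example.git"},
        "unknown_term": "kept under other"
    }'''

    translated = compute_metadata('hard_mapping_npm', raw_content)
    # Known terms are renamed (repository -> codeRepository); unknown terms
    # are collected under the 'other' key.
    print(translated)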