# swh/indexer/indexer.py
class DiskIndexer:
    """Mixin intended to be used with other *Indexer classes.

    Indexers inheriting from this class are a category of indexers
    which need the disk for their computations.

    Expects:
        self.working_directory variable defined at runtime.

    """
    def __init__(self):
        super().__init__()

    def write_to_temp(self, filename, data):
        """Write the content's data in a file inside a fresh temporary
        directory created under self.working_directory.

        Args:
            filename (str): one of the content's many filenames; only its
                            last path component is used
            data (bytes): the content's raw data to write in the
                          temporary file

        Returns:
            The path to the temporary file created. That file is
            filled in with the raw content's data.

        Raises:
            Whatever the underlying filesystem calls raise; the temporary
            directory is removed before the error is propagated.

        """
        os.makedirs(self.working_directory, exist_ok=True)
        temp_dir = tempfile.mkdtemp(dir=self.working_directory)

        try:
            # Keep only the last path component: a name such as '../x' or
            # '/abs/x' would otherwise be written outside temp_dir, and
            # cleanup() (which removes the file's parent directory) would
            # then delete a directory this mixin does not own.
            content_path = os.path.join(temp_dir, os.path.basename(filename))

            with open(content_path, 'wb') as f:
                f.write(data)
        except Exception:
            # Do not leak the freshly created directory on failure.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        return content_path

    def cleanup(self, content_path):
        """Remove the temporary directory holding content_path.

        Args:
            content_path (str): path of a file as returned by
                                write_to_temp; its parent directory is
                                removed recursively

        """
        temp_dir = os.path.dirname(content_path)
        shutil.rmtree(temp_dir)
# --- swh/indexer/metadata.py ---
# Copyright (C) 2016-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging

from .indexer import BaseIndexer
from swh.indexer.metadata_dictionary import MetadataDict

logger = logging.getLogger(__name__)


def compute_metadata(raw_content, context):
    """Translate a raw content into metadata, using the tool named by
    context.

    Args:
        raw_content (bytes): raw content to translate (None is accepted)
        context (text): tool name, defines from which context/vocabulary
                        the content needs to be translated

    Returns:
        result (dict): translated_metadata (name, version, etc..), or
        None when raw_content is None, when MetadataDict.parse returns
        None for that context, or when decoding/translating raised.

    """
    if raw_content is None:
        return None

    try:
        content_text = raw_content.decode()
        tool_for_complete_translation = MetadataDict()
        return tool_for_complete_translation.parse(context, content_text)
    except Exception:
        # Deliberately best-effort: a content which cannot be decoded or
        # parsed yields no metadata. The failure is logged with its
        # traceback instead of being printed on stdout.
        logger.exception(
            'Unable to translate content with tool %s', context)
        return None


class ContentMetadataIndexer(BaseIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - reading content from objstorage with the content's id sha1
    - computing translated_metadata by given context
    - using the MetadataDict and a tool for each context
    - storing the result in storage

    """
    CONFIG_BASE_FILENAME = 'indexer/metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'hard_mapping_npm',
            'version': '0.0.1',
            'configuration': {
                'type': 'test',
                'debian-package': '',
                'max_content_size': 10240,
            },
        }),
    }

    def prepare(self):
        """Run the parent's preparation, then cache the tool's
        max_content_size read from the configuration.

        """
        super().prepare()
        c = self.config
        self.max_content_size = c['tools']['configuration']['max_content_size']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        Args:
            sha1s: iterable of content identifiers

        Yields:
            whatever self.storage.content_metadata_missing yields for
            those identifiers and this indexer's tool id

        """
        yield from self.storage.content_metadata_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tools['id'],
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, raw_content):
        """Index sha1s' content and store result.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_metadata, with keys:
            - id: the sha1 received
            - indexer_configuration_id: self.tools['id']
            - translated_metadata: result of compute_metadata, None when
              nothing could be translated

        """
        result = {
            'id': sha1,
            'indexer_configuration_id': self.tools['id'],
            'translated_metadata': None
        }
        # TODO a tool for each context
        # The tool name is read from the class-level default tool (second
        # item of the ('dict', default) pair), not from self.config.
        self.tool = self.ADDITIONAL_CONFIG['tools'][1]['name']

        try:
            result['translated_metadata'] = compute_metadata(
                raw_content, self.tool)
        except Exception:
            # Keep indexing best-effort, but leave a trace in the
            # indexer's log rather than printing on stdout.
            self.log.exception('Problem when computing metadata')

        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata, dict with the
            following keys:
              - id (bytes): content's identifier (sha1)
              - translated_metadata (jsonb): detected metadata
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
            respectively update duplicates or ignore them

        """
        self.storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))


# --- swh/indexer/metadata_dictionary.py ---
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json

# package.json term -> translated term
npm_mapping = {
    'repository': 'codeRepository',
    'os': 'operatingSystem',
    'cpu': 'processorRequirements',
    'engines': 'processorRequirements',
    'dependencies': 'softwareRequirements',
    'bundledDependencies': 'softwareRequirements',
    'peerDependencies': 'softwareRequirements',
    'author': 'author',
    'contributor': 'contributor',
    'keywords': 'keywords',
    'license': 'license',
    'version': 'version',
    'description': 'description',
    'name': 'name',
    'devDependencies': 'softwareSuggestions',
    'optionalDependencies': 'softwareSuggestions',
    'bugs': 'issueTracker'
}


class MetadataDict():
    """Dispatches a content to the parser/translator matching a context."""

    def __init__(self):
        pass

    def parse(self, context, content):
        """
        first landing method: a dispatcher that sends content
        to the right function to carry out the real parsing of syntax
        and translation of terms
        Args:
            - context (text) : defines to which function/tool
                               the content is sent
            - content (text): the string form of the raw_content

        Returns:
            - translated_metadata (dict): jsonb form needed for the indexer
              to store in storage, None when the context is unknown

        """
        # checks if decoding is needed?

        # sends content to parser and/or translator
        if context == "hard_mapping_npm":
            return self.translate_npm(content)

        elif context == "pom_xml":
            # TODO: both steps are still stubs, so this returns None
            return self.translate_pom(self.parse_xml(content))

        else:
            return None

    def parse_xml(self, content):
        """
        Parses content from xml to a python dict
        (TODO: not implemented yet, returns None)
        Args:
            - content (text): the string form of the raw_content ( in xml)

        Returns:
            - parsed_xml (dict): a python dict of the content after parsing
        """
        pass

    # The method was first published under this misspelled name while
    # parse() called `parse_xml`; the alias keeps existing callers working.
    pase_xml = parse_xml

    def translate_npm(self, content):
        """
        Translates content by parsing content to a json object
        and translating with the npm mapping (for now hard_coded mapping)
        Args:
            - content (text) : should be a json object

        Returns:
            - translated_metadata (dict): jsonb form needed for the indexer;
              entries whose key is not in the mapping are kept, untouched,
              under the 'other' key

        Raises:
            whatever json.loads raises when content is not valid json
        """
        # TODO: keep mapping not in code (maybe fetch crosswalk from storage?)
        # if fetched from storage should be done once for batch of sha1s
        mapping = npm_mapping
        default = 'other'
        translated_metadata = {default: {}}
        content_dict = json.loads(content)
        for k, v in content_dict.items():
            term = mapping.get(k, default)
            if term != default:
                # NOTE(review): several npm terms share one translated term
                # (e.g. the *Dependencies keys), so a later entry replaces
                # an earlier one here -- confirm this is intended.
                translated_metadata[term] = v
            else:
                # if we want to keep the entries that do not correspond
                # with identified terms => all under other
                translated_metadata[default][k] = v
        return translated_metadata

    def translate_pom(self, content):
        """Translate a parsed pom.xml (TODO: not implemented yet, returns
        None).

        """
        pass


def main():
    """Print the translation of a small hard-coded package.json sample."""
    mtd = MetadataDict()
    result = mtd.translate_npm('{"name": "test_name", "unknown_term": "ut"}')
    print(result)


if __name__ == "__main__":
    main()
# --- swh/indexer/tests/test_metadata.py ---
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest
import logging
from nose.tools import istest

from swh.indexer import metadata
from swh.indexer.metadata import ContentMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage


def ordered(obj):
    """Recursively turn dicts into sorted lists of (key, value) pairs and
    sort lists, so that two structures can be compared regardless of
    their ordering. Any other value is returned as is.

    """
    if isinstance(obj, dict):
        return sorted((key, ordered(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return sorted(map(ordered, obj))
    return obj


class MockStorage():
    """Mock storage to simplify reading indexers' outputs.
    """
    def content_metadata_add(self, metadata, conflict_update=None):
        """Record the arguments received so that tests can inspect them."""
        self.state = metadata
        self.conflict_update = conflict_update

    def indexer_configuration_get(self, tool):
        """Return a fixed tool configuration, whatever the tool asked."""
        return {'id': 30}


class TestMetadataIndexer(ContentMetadataIndexer):
    """Specific Metadata whose configuration is enough to satisfy the
       indexing tests.
    """
    def prepare(self):
        """Set up mocks and an in-memory configuration, without calling
        the parent's prepare.

        """
        tool_configuration = {
            'type': 'local',
            'debian-package': '',
            'max_content_size': 10240,
        }
        self.config = {
            'rescheduling_task': None,
            'tools': {
                'name': 'npm_mock_tool',
                'version': '0.1',
                'configuration': tool_configuration,
            },
        }
        self.storage = MockStorage()
        self.log = logging.getLogger('swh.indexer')
        self.objstorage = MockObjStorage()
        self.task_destination = None
        self.rescheduling_task = self.config['rescheduling_task']
        self.tool_config = self.config['tools']['configuration']
        self.max_content_size = self.tool_config['max_content_size']
        self.tools = self.retrieve_tools_information()


class Metadata(unittest.TestCase):
    """
    Tests metadata_mock_tool tool for Metadata detection
    """
    def setUp(self):
        # always show the complete diff when an assertion fails
        self.maxDiff = None

    @istest
    def test_compute_metadata_none(self):
        """
        testing that a None content yields None
        """
        # should it be empty {} or None if no metadata was found ?
        expected_metadata = None

        # when
        actual_metadata = metadata.compute_metadata(None, "hard_mapping_npm")

        # then
        self.assertEqual(expected_metadata, actual_metadata)

    @istest
    def test_compute_metadata_npm(self):
        """
        testing only computation of metadata with hard_mapping_npm
        """
        # given
        package_json = b"""
            {
                "name": "test_metadata",
                "version": "0.0.1",
                "description": "Simple package.json test for indexer",

                "repository": {
                  "type": "git",
                  "url": "https://github.com/moranegg/metadata_test"
                }
            }
        """
        expected_metadata = {
            'name': 'test_metadata',
            'version': '0.0.1',
            'description': 'Simple package.json test for indexer',
            'codeRepository': {
                'type': 'git',
                'url': 'https://github.com/moranegg/metadata_test'
            },
            'other': {}
        }

        # when
        actual_metadata = metadata.compute_metadata(
            package_json, "hard_mapping_npm")

        # then
        self.assertEqual(expected_metadata, actual_metadata)

    @istest
    def test_index_content_metadata_npm(self):
        """
        testing NPM with package.json
        """
        # given
        first_sha1 = '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
        second_sha1 = 'd4c647f0fc257591cc9ba1722484229780d1c607'
        # this metadata indexer computes only metadata for package.json
        # in npm context with a hard mapping
        indexer = TestMetadataIndexer()

        # when
        indexer.run([first_sha1, second_sha1], policy_update='ignore-dups')
        results = indexer.storage.state

        # then
        first_expected = {
            'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
            'translated_metadata': {
                'name': 'test_metadata',
                'version': '0.0.1',
                'codeRepository': {
                    'url': 'https://github.com/moranegg/metadata_test',
                    'type': 'git'
                },
                'description': 'Simple package.json test for indexer',
                'other': {}
            },
            'indexer_configuration_id': 30
        }
        second_expected = {
            'id': 'd4c647f0fc257591cc9ba1722484229780d1c607',
            'translated_metadata': {
                'name': 'npm',
                'version': '5.0.3',
                'keywords': [
                    'install',
                    'modules',
                    'package manager',
                    'package.json'
                ],
                'softwareSuggestions': {
                    'tacks': '~1.2.6',
                    'tap': '~10.3.2'
                },
                'description': 'a package manager for JavaScript',
                'author': 'Isaac Z. Schlueter (http://blog.izs.me)',
                'issueTracker': {
                    'url': 'https://github.com/npm/npm/issues'
                },
                'license': 'Artistic-2.0',
                'softwareRequirements': {
                    'abbrev': '~1.1.0',
                    'ansistyles': '~0.1.3',
                    'ansicolors': '~0.3.2',
                    'JSONStream': '~1.3.1',
                    'ansi-regex': '~2.1.1'
                },
                'codeRepository': {
                    'url': 'https://github.com/npm/npm',
                    'type': 'git'
                },
                'other': {
                    'bundleDependencies': [
                        'abbrev',
                        'ansi-regex'
                    ],
                    'preferGlobal': True,
                    'homepage': 'https://docs.npmjs.com/',
                    'config': {
                        'publishtest': False
                    }
                }
            },
            'indexer_configuration_id': 30
        }
        expected_results = [first_expected, second_expected]
        self.assertEqual(ordered(expected_results), ordered(results))
Schlueter (http://blog.izs.me)", + "repository": { + "type": "git", + "url": "https://github.com/npm/npm" + }, + "bugs": { + "url": "https://github.com/npm/npm/issues" + }, + "dependencies": { + "JSONStream": "~1.3.1", + "abbrev": "~1.1.0", + "ansi-regex": "~2.1.1", + "ansicolors": "~0.3.2", + "ansistyles": "~0.1.3" + }, + "bundleDependencies": [ + "abbrev", + "ansi-regex" + ], + "devDependencies": { + "tacks": "~1.2.6", + "tap": "~10.3.2" + }, + "license": "Artistic-2.0" + } + """ }