diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index c3ff490..8b07d84 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,84 +1,84 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .indexer import BaseIndexer from swh.indexer.metadata_dictionary import compute_metadata class ContentMetadataIndexer(BaseIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the MetadataDict and a tool for each context - store result in storage """ CONFIG_BASE_FILENAME = 'indexer/metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { - 'name': 'hard_mapping_npm', + 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { - 'type': 'test', - 'debian-package': '' + 'type': 'local', + 'context': 'npm' }, }), } def prepare(self): super().prepare() def filter_contents(self, sha1s): """Filter out known sha1s and return only missing ones. """ yield from self.storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tools['id'], } for sha1 in sha1s )) def index_content(self, sha1, raw_content): """Index sha1s' content and store result. Args: sha1 (bytes): content's identifier raw_content (bytes): raw content in bytes Returns: result (dict): representing a content_metadata if translation wasn't successful the translated_metadata keys will be kept as None """ result = { 'id': sha1, 'indexer_configuration_id': self.tools['id'], 'translated_metadata': None } try: - context = self.tools['name'] + context = self.tools['configuration']['context'] result['translated_metadata'] = compute_metadata( context, raw_content) except: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index 3356cc0..d68b65c 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,204 +1,204 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json def convert(raw_content): """ convert raw_content recursively: - from bytes to string - from string to dict Args: - raw_content (bytes / string / dict) Returns: - Dict of content (if string was json, otherwise returns string) """ if isinstance(raw_content, bytes): return convert(raw_content.decode()) if isinstance(raw_content, str): try: content = json.loads(raw_content) if content: return content else: return raw_content except json.decoder.JSONDecodeError: return raw_content if isinstance(raw_content, dict): return raw_content class BaseMapping(): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - add a local property self.mapping - override translate function """ def translate(self, content_dict): """ Tranlsates content by parsing content to a json object and translating with the npm mapping (for now hard_coded mapping) Args: - context_text (text) : should be json Returns: - translated_metadata (dict): jsonb form needed for the indexer """ translated_metadata = {} default = 'other' translated_metadata['other'] = {} try: for k, v in content_dict.items(): try: term = self.mapping.get(k, default) if term not in translated_metadata: translated_metadata[term] = v continue if isinstance(translated_metadata[term], str): in_value = translated_metadata[term] translated_metadata[term] = [in_value, v] continue if isinstance(translated_metadata[term], list): translated_metadata[term].append(v) continue if isinstance(translated_metadata[term], dict): translated_metadata[term][k] = v continue except KeyError: self.log.exception( "Problem during item mapping") continue except: return None return translated_metadata class NpmMapping(BaseMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = { 'repository': 'codeRepository', 'os': 'operatingSystem', 'cpu': 'processorRequirements', 'engines': 'processorRequirements', 'dependencies': 'softwareRequirements', 'bundleDependencies': 'softwareRequirements', 'peerDependencies': 'softwareRequirements', 'author': 'author', 'contributor': 'contributor', 'keywords': 'keywords', 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'devDependencies': 'softwareSuggestions', 'optionalDependencies': 'softwareSuggestions', 'bugs': 'issueTracker', 'homepage': 'url' } def translate(self, raw_content): content_dict = convert(raw_content) return super().translate(content_dict) class MavenMapping(BaseMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ mapping = { 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'prerequisites': 'softwareRequirements', 'repositories': 'codeRepository', 'groupId': 'identifier', 'ciManagement': 'contIntegration', 'issuesManagement': 'issueTracker', } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) class DoapMapping(BaseMapping): mapping = { } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) def parse_xml(content): """ Parses content from xml to a python dict Args: - content (text): the string form of the raw_content ( in xml) Returns: - parsed_xml (dict): a python dict of the content after parsing """ # check if xml # use xml parser to dict return content mapping_tool_fn = { - "hard_mapping_npm": NpmMapping(), - "pom_xml": MavenMapping(), + "npm": NpmMapping(), + "maven": MavenMapping(), "doap_xml": DoapMapping() } def compute_metadata(context, raw_content): """ first landing method: a dispatcher that sends content to the right function to carry out the real parsing of syntax and translation of terms Args: - context (text) : defines to which function/tool the content is sent - content (text): the string form of the raw_content Returns: - translated_metadata (dict): jsonb form needed for the indexer to store in storage """ if raw_content is None or raw_content is b"": return None # TODO: keep mapping not in code (maybe fetch crosswalk from storage?) # if fetched from storage should be done once for batch of sha1s dictionary = mapping_tool_fn[context] translated_metadata = dictionary.translate(raw_content) # print(translated_metadata) return translated_metadata def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" - result = compute_metadata("hard_mapping_npm", raw_content) - result1 = compute_metadata("pom_xml", raw_content1) + result = compute_metadata("npm", raw_content) + result1 = compute_metadata("maven", raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index fed24e8..2c90bbd 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,195 +1,199 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.metadata_dictionary import compute_metadata from swh.indexer.metadata import ContentMetadataIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): """Mock storage to simplify reading indexers' outputs. """ def content_metadata_add(self, metadata, conflict_update=None): self.state = metadata self.conflict_update = conflict_update def indexer_configuration_get(self, tool): return { 'id': 30, 'name': 'hard_mapping_npm', - 'version': '0.1' + 'version': '0.1', + 'configuration': { + 'type': 'local', + 'context': 'npm' + }, } class TestMetadataIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'rescheduling_task': None, 'tools': { - 'name': 'hard_mapping_npm', - 'version': '0.1', + 'name': 'swh-metadata-translator', + 'version': '0.0.1', 'configuration': { 'type': 'local', - 'debian-package': '' + 'context': 'npm' } } } self.storage = MockStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.retrieve_tools_information() class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None @istest def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" - tool = "hard_mapping_npm" + context = "npm" # None if no metadata was found or an error occurred declared_metadata = None # when - result = compute_metadata(tool, content) + result = compute_metadata(context, content) # then self.assertEqual(declared_metadata, result) @istest def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """ declared_metadata = { 'name': 'test_metadata', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} } # when - result = compute_metadata("hard_mapping_npm", content) + result = compute_metadata("npm", content) # then self.assertEqual(declared_metadata, result) @istest def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5', 'd4c647f0fc257591cc9ba1722484229780d1c607', '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = TestMetadataIndexer() # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = metadata_indexer.storage.state expected_results = [{ 'indexer_configuration_id': 30, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { 'softwareRequirements': { 'JSONStream': '~1.3.1', 'abbrev': '~1.1.0', 'ansi-regex': '~2.1.1', 'ansicolors': '~0.3.2', 'ansistyles': '~0.1.3' }, 'issueTracker': { 'url': 'https://github.com/npm/npm/issues' }, 'author': 'Isaac Z. Schlueter (http://blog.izs.me)', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/npm/npm' }, 'description': 'a package manager for JavaScript', 'softwareSuggestions': { 'tacks': '~1.2.6', 'tap': '~10.3.2' }, 'license': 'Artistic-2.0', 'version': '5.0.3', 'other': { 'preferGlobal': True, 'config': { 'publishtest': False } }, 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { 'indexer_configuration_id': 30, 'translated_metadata': None, 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' }] # The assertion bellow returns False sometimes because of nested lists self.assertEqual(expected_results, results)