diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -6,7 +6,7 @@ import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer -from swh.indexer.metadata_dictionary import compute_metadata +from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY @@ -64,8 +64,9 @@ 'translated_metadata': None } try: - context = self.tool['tool_configuration']['context'] - result['translated_metadata'] = compute_metadata(context, data) + mapping_name = self.tool['tool_configuration']['context'] + result['translated_metadata'] = MAPPINGS[mapping_name] \ + .translate(data) # a twisted way to keep result with indexer object for get_results self.results.append(result) except Exception: @@ -121,7 +122,7 @@ 'version': '0.0.2', 'configuration': { 'type': 'local', - 'context': ['npm', 'codemeta'] + 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -4,10 +4,7 @@ # See top-level LICENSE file for more information -mapping_filenames = { - b"package.json": "npm", - b"codemeta.json": "codemeta" -} +from swh.indexer.metadata_dictionary import MAPPINGS def detect_metadata(files): @@ -21,15 +18,10 @@ - dictionary {mapping_filenames[name]:f['sha1']} """ results = {} - for f in files: - name = f['name'].lower().strip() - # TODO: possibility to detect extensions - if name in mapping_filenames: - tool = mapping_filenames[name] - if tool in results: - results[tool].append(f['sha1']) - else: - results[tool] = [f['sha1']] + for (mapping_name, mapping) in MAPPINGS.items(): + matches = mapping.detect_metadata_files(files) + if matches: + results[mapping_name] = matches 
return results diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -3,9 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import abc import csv import json import os.path +import logging import swh.indexer @@ -45,55 +47,66 @@ CROSSWALK_TABLE = read_crosstable(fd) -def convert(raw_content): - """ - convert raw_content recursively: +MAPPINGS = {} - - from bytes to string - - from string to dict - Args: - raw_content (bytes / string / dict) +def register_mapping(cls): + MAPPINGS[cls.__name__] = cls() + return cls - Returns: - dict: content (if string was json, otherwise returns string) - """ - if isinstance(raw_content, bytes): - return convert(raw_content.decode()) - if isinstance(raw_content, str): - try: - content = json.loads(raw_content) - if content: - return content - else: - return raw_content - except json.decoder.JSONDecodeError: - return raw_content - if isinstance(raw_content, dict): - return raw_content - - -class BaseMapping(): +class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - - add a local property self.mapping - override translate function """ + def __init__(self): + self.log = logging.getLogger('%s.%s' % ( + self.__class__.__module__, + self.__class__.__name__)) + + @abc.abstractmethod + def detect_metadata_files(self, files): + """ + Detects files potentially containing metadata + Args: + - files (list): list of files - def translate(self, content_dict): + Returns: + - empty list if nothing was found + - list of sha1 otherwise """ - Tranlsates content by parsing content to a json object - and translating with the npm mapping (for now hard_coded mapping) + pass + + @abc.abstractmethod + def translate(self, file_content): + pass + + +class DictMapping(BaseMapping):
+ """Base class for mappings that take as input a file that is mostly + a key-value store (eg. a shallow JSON dict).""" + + @property + @abc.abstractmethod + def mapping(self): + """A translation dict to map dict keys into a canonical name.""" + pass + + def translate_dict(self, content_dict): + """ + Translates content by parsing content from a dict object + and translating with the appropriate mapping Args: - context_text (text): should be json + content_dict (dict) Returns: - dict: translated metadata in jsonb form needed for the indexer + dict: translated metadata in json-friendly form needed for + the indexer """ translated_metadata = {} @@ -121,88 +134,68 @@ "Problem during item mapping") continue except Exception: + raise return None return translated_metadata -class NpmMapping(BaseMapping): - """ - dedicated class for NPM (package.json) mapping and translation - """ - mapping = CROSSWALK_TABLE['NodeJS'] - - def translate(self, raw_content): - content_dict = convert(raw_content) - return super().translate(content_dict) +class JsonMapping(DictMapping): + """Base class for all mappings that use a JSON file as input.""" + @property + @abc.abstractmethod + def filename(self): + """The .json file to extract metadata from.""" + pass -class MavenMapping(BaseMapping): - """ - dedicated class for Maven (pom.xml) mapping and translation - """ - mapping = CROSSWALK_TABLE['Java (Maven)'] + def detect_metadata_files(self, file_entries): + for entry in file_entries: + if entry['name'] == self.filename: + return [entry['sha1']] + return [] def translate(self, raw_content): - content = convert(raw_content) - # parse content from xml to dict - return super().translate(content) - + """ + Translates content by parsing content from a bytestring containing + json data and translating with the appropriate mapping -class DoapMapping(BaseMapping): - mapping = { + Args: + raw_content: bytes - } + Returns: + dict: translated metadata in json-friendly form needed for + the indexer - 
def translate(self, raw_content): - content = convert(raw_content) - # parse content from xml to dict - return super().translate(content) + """ + try: + raw_content = raw_content.decode() + except UnicodeDecodeError: + self.log.warning('Error unidecoding %r', raw_content) + return + try: + content_dict = json.loads(raw_content) + except json.JSONDecodeError: + self.log.warning('Error unjsoning %r', raw_content) + return + return self.translate_dict(content_dict) -def parse_xml(content): +@register_mapping +class NpmMapping(JsonMapping): """ - Parses content from xml to a python dict - Args: - - content (text): the string form of the raw_content ( in xml) - - Returns: - - parsed_xml (dict): a python dict of the content after parsing + dedicated class for NPM (package.json) mapping and translation """ - # check if xml - # use xml parser to dict - return content - - -mapping_tool_fn = { - "npm": NpmMapping(), - "maven": MavenMapping(), - "doap_xml": DoapMapping() -} + mapping = CROSSWALK_TABLE['NodeJS'] + filename = b'package.json' -def compute_metadata(context, raw_content): +@register_mapping +class CodemetaMapping(JsonMapping): """ - first landing method: a dispatcher that sends content - to the right function to carry out the real parsing of syntax - and translation of terms - - Args: - context (text): defines to which function/tool the content is sent - content (text): the string form of the raw_content - - Returns: - dict: translated metadata jsonb dictionary needed for the indexer to - store in storage - + dedicated class for CodeMeta (codemeta.json) mapping and translation """ - if raw_content is None or raw_content is b"": - return None - - # TODO: keep mapping not in code (maybe fetch crosswalk from storage?)
- # if fetched from storage should be done once for batch of sha1s - dictionary = mapping_tool_fn[context] - translated_metadata = dictionary.translate(raw_content) - return translated_metadata + mapping = CROSSWALK_TABLE['codemeta-V1'] + filename = b'codemeta.json' def main(): @@ -210,8 +203,8 @@ raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" - result = compute_metadata("npm", raw_content) - result1 = compute_metadata("maven", raw_content1) + result = MAPPINGS["NpmMapping"].translate(raw_content) + result1 = MAPPINGS["CodemetaMapping"].translate(raw_content1) print(result) print(result1) diff --git a/swh/indexer/sql/50-swh-data.sql b/swh/indexer/sql/50-swh-data.sql --- a/swh/indexer/sql/50-swh-data.sql +++ b/swh/indexer/sql/50-swh-data.sql @@ -14,10 +14,10 @@ values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}'); insert into indexer_configuration(tool_name, tool_version, tool_configuration) -values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}'); +values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "NpmMapping"}'); insert into indexer_configuration(tool_name, tool_version, tool_configuration) -values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}'); +values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["NpmMapping", "CodemetaMapping"]}'); insert into indexer_configuration(tool_name, tool_version, tool_configuration) values ('swh-deposit', '0.0.1', '{"sword_version": "2"}'); diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1601,7 +1601,7 @@ tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', - 'tool_configuration': {"type": "local", "context": "npm"}, +
'tool_configuration': {"type": "local", "context": "NpmMapping"}, } actual_tool = self.storage.indexer_configuration_get(tool) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -6,7 +6,7 @@ import unittest import logging -from swh.indexer.metadata_dictionary import compute_metadata, CROSSWALK_TABLE +from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.metadata import ContentMetadataIndexer @@ -54,7 +54,7 @@ 'version': '0.0.2', 'configuration': { 'type': 'local', - 'context': 'npm' + 'context': 'NpmMapping' } } } @@ -83,7 +83,7 @@ 'version': '0.0.2', 'configuration': { 'type': 'local', - 'context': 'npm' + 'context': 'NpmMapping' } } MockIndexerStorage.added_data = [] @@ -120,12 +120,11 @@ """ # given content = b"" - context = "npm" # None if no metadata was found or an error occurred declared_metadata = None # when - result = compute_metadata(context, content) + result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) @@ -157,7 +156,7 @@ } # when - result = compute_metadata("npm", content) + result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) @@ -329,7 +328,7 @@ results = detect_metadata(df) expected_results = { - 'npm': [ + 'NpmMapping': [ b'cde' ] } diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -248,7 +248,7 @@ 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', - 'context': 'npm' + 'context': 'NpmMapping' }, }] elif tool['tool_name'] == 'swh-metadata-detector': @@ -258,7 +258,7 @@ 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', - 'context': 'npm' + 
'context': 'NpmMapping' }, }] elif tool['tool_name'] == 'origin-metadata': @@ -291,7 +291,7 @@ 'tool': { 'configuration': { 'type': 'local', - 'context': 'npm' + 'context': 'NpmMapping' }, 'version': '0.0.1', 'id': 6,