diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index 638a952..b18f954 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,386 +1,385 @@
# Copyright (C) 2016-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import os
import logging
import shutil
import tempfile

from swh.core.config import SWHConfig
from swh.objstorage import get_objstorage
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
from swh.storage import get_storage
from swh.scheduler.utils import get_task


class DiskIndexer:
    """Mixin intended to be used with other *Indexer classes.

    Indexers inheriting from this class are a category of indexers
    which need the disk for their computations.

    Expects:
        self.working_directory variable defined at runtime.

    """
    def __init__(self):
        super().__init__()

    def write_to_temp(self, filename, data):
        """Write the content's data to a temporary file.

        Args:
            filename (str): one of the content's many filenames
            data (bytes): the content's data to write to a temporary
            file

        Returns:
            The path to the temporary file created. That file is
            filled in with the raw content's data.

        """
        os.makedirs(self.working_directory, exist_ok=True)
        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
        content_path = os.path.join(temp_dir, filename)

        with open(content_path, 'wb') as f:
            f.write(data)

        return content_path

    def cleanup(self, content_path):
        """Remove content_path from the working directory.

        Args:
            content_path (str): the file to remove

        """
        temp_dir = os.path.dirname(content_path)
        shutil.rmtree(temp_dir)

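For context, a minimal sketch of how an indexer might use this mixin; the class name, working directory and filename below are hypothetical, not part of this change:

    # Hypothetical sketch, not part of the diff: an indexer that needs
    # the raw content on disk for its computations.
    import os

    class TarballIndexer(DiskIndexer):
        working_directory = '/tmp/swh.indexer.tarball'  # assumed setting

        def index(self, id, data):
            # materialize the raw content on disk, work on it, clean up
            path = self.write_to_temp(filename='content.tar', data=data)
            try:
                return {'id': id, 'size': os.path.getsize(path)}
            finally:
                self.cleanup(path)  # removes the whole temp_dir
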
class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
    """Base class for indexers to inherit from.

    The main entry point is the `run` function, which triggers the
    computations on the batch of ids received.

    Indexers can:
    - filter out ids whose data has already been indexed.
    - retrieve ids' data from storage or objstorage
    - index this data depending on the object and store the result in
      storage.

    To implement a new object type indexer, inherit from the
    BaseIndexer and implement the indexing process:

    - def run(self, object_ids, policy_update): object_ids are
      different depending on the object. For example: sha1 for
      content, sha1_git for revision, directory, release, and id for
      origin.

    To implement a new concrete indexer, inherit from the object-level
    classes: ContentIndexer, RevisionIndexer (later on OriginIndexer
    will also be available).

    Then you need to implement the following functions:

    - def filter(self, ids): filter out data already indexed (in
      storage). This function is used by the orchestrator and not
      directly by the indexer
      (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer).

    - def index_object(self, id, data): compute the index on id with
      data (retrieved from storage or objstorage by the id key) and
      return the resulting index computation.

    - def persist_index_computations(self, results, policy_update):
      persist the results of multiple index computations in storage.

    The new indexer implementation can also override the following
    functions:

    - def prepare(self): configuration preparation for the indexer.
      When overriding, this must call the super().prepare() function.

    - def check(self): configuration check for the indexer.
      When overriding, this must call the super().check() function.

    - def retrieve_tools_information(self): This should return a dict
      of the tool(s) to use when indexing or filtering.

    """
    CONFIG = 'indexer/base'

    DEFAULT_CONFIG = {
        'storage': ('dict', {
            'host': 'uffizi',
            'cls': 'remote',
            'args': {'root': '/tmp/softwareheritage/objects',
                     'slicing': '0:2/2:4/4:6'}
        }),
        # queue to reschedule if problem (none for no rescheduling,
        # the default)
        'rescheduling_task': ('str', None),
        'objstorage': ('dict', {
            'cls': 'multiplexer',
            'args': {
                'objstorages': [{
                    'cls': 'filtered',
                    'args': {
                        'storage_conf': {
                            'cls': 'azure-storage',
                            'args': {
                                'account_name': '0euwestswh',
                                'api_secret_key': 'secret',
                                'container_name': 'contents'
                            }
                        },
                        'filters_conf': [
                            {'type': 'readonly'},
                            {'type': 'prefix', 'prefix': '0'}
                        ]
                    }
                }, {
                    'cls': 'filtered',
                    'args': {
                        'storage_conf': {
                            'cls': 'azure-storage',
                            'args': {
                                'account_name': '1euwestswh',
                                'api_secret_key': 'secret',
                                'container_name': 'contents'
                            }
                        },
                        'filters_conf': [
                            {'type': 'readonly'},
                            {'type': 'prefix', 'prefix': '1'}
                        ]
                    }
                }]
            },
        }),
    }

    ADDITIONAL_CONFIG = {}

    def __init__(self):
        """Prepare and check that the indexer is ready to run.

        """
        super().__init__()
        self.prepare()
        self.check()

    def prepare(self):
        """Prepare the indexer's needed runtime configuration.
        Without this step, the indexer cannot possibly run.

        """
        self.config = self.parse_config_file(
            additional_configs=[self.ADDITIONAL_CONFIG])

        objstorage = self.config['objstorage']
        self.objstorage = get_objstorage(objstorage['cls'],
                                         objstorage['args'])

        storage = self.config['storage']
        self.storage = get_storage(storage['cls'], storage['args'])

        rescheduling_task = self.config['rescheduling_task']
        if rescheduling_task:
            self.rescheduling_task = get_task(rescheduling_task)
        else:
            self.rescheduling_task = None

        l = logging.getLogger('requests.packages.urllib3.connectionpool')
        l.setLevel(logging.WARN)
        self.log = logging.getLogger('swh.indexer')
        self.tools = self.retrieve_tools_information()

    def check(self):
        """Check that the indexer's configuration is ok before
        proceeding. If ok, does nothing. If not, raises an error.

        """
        if not self.tools:
            raise ValueError('Tools %s is unknown, cannot continue' %
                             self.config['tools'])

    def retrieve_tools_information(self):
        """Define how to retrieve tool information based on the
        configuration.

        Provides a sensible default which can be overridden if not
        sufficient. (For now, all indexers use only one tool.)

        """
        tool = {
            'tool_%s' % key: value
            for key, value in self.config['tools'].items()
        }
        return self.storage.indexer_configuration_get(tool)

    @abc.abstractmethod
    def filter(self, ids):
        """Filter missing ids for that particular indexer.

        Args:
            ids ([bytes]): list of ids

        Yields:
            iterator of missing ids

        """
        pass

    @abc.abstractmethod
    def index(self, id, data):
        """Index computation for the id and associated raw data.

        Args:
            id (bytes): identifier (sha1 for content)
            data (bytes): id's data from storage or objstorage
            depending on object type

        Returns:
            a dict that makes sense for the
            persist_index_computations function.

        """
        pass

    @abc.abstractmethod
    def persist_index_computations(self, results, policy_update):
        """Persist the computations resulting from the index.

        Args:
            results ([result]): list of results. One result is the
            result of the index function.
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        Returns:
            None

        """
        pass

    def next_step(self, results):
        """Do something else with the computations' results (e.g. send
        to another queue, ...).

        (This is not an abstractmethod since it is optional.)

        Args:
            results ([result]): list of results (dict) as returned by
            the index function.

        Returns:
            None

        """
        pass

    @abc.abstractmethod
    def run(self, ids, policy_update):
        """Given a list of ids:
        - retrieves the data from storage
        - executes the indexing computations
        - stores the results (according to policy_update)

        Args:
            ids ([bytes]): list of identifiers
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        """
        pass

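To make the contract above concrete, a hedged skeleton of a concrete indexer built on the ContentIndexer defined just below; the class name and the `content_mimetype_missing`/`content_mimetype_add` storage endpoints are assumptions for illustration, modelled on the `content_metadata_*` endpoints used elsewhere in this diff:

    # Hypothetical skeleton, assuming content_mimetype_* storage
    # endpoints analogous to the content_metadata_* ones in this diff.
    class MimetypeIndexer(ContentIndexer):
        ADDITIONAL_CONFIG = {
            'tools': ('dict', {
                'name': 'file',
                'version': '5.22',
                'configuration': {'command_line': 'file --mime <filepath>'},
            }),
        }

        def filter(self, ids):
            # let storage report which ids this tool has not indexed yet
            yield from self.storage.content_mimetype_missing(
                {'id': id, 'indexer_configuration_id': self.tools['id']}
                for id in ids)

        def index(self, id, data):
            # compute the property for one content (placeholder value)
            return {'id': id,
                    'indexer_configuration_id': self.tools['id'],
                    'mimetype': b'text/plain'}

        def persist_index_computations(self, results, policy_update):
            self.storage.content_mimetype_add(
                results, conflict_update=(policy_update == 'update-dups'))
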
class ContentIndexer(BaseIndexer):
    """An object-type indexer that inherits from BaseIndexer and
    implements the indexing process for contents with the `run`
    method.

    Note: the ContentIndexer is not directly instantiable. To use it
    in another context, one should refer to the instructions in the
    BaseIndexer.

    """
    def run(self, sha1s, policy_update):
        """Given a list of sha1s:
        - retrieve the content from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            sha1s ([bytes]): list of content sha1 identifiers
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        """
        results = []
        try:
            for sha1 in sha1s:
                try:
                    raw_content = self.objstorage.get(sha1)
                except ObjNotFoundError:
                    self.log.warn('Content %s not found in objstorage' %
                                  hashutil.hash_to_hex(sha1))
                    continue
                res = self.index(sha1, raw_content)
                if res:  # If no results, skip it
                    results.append(res)

            self.persist_index_computations(results, policy_update)
            self.next_step(results)
        except Exception:
            self.log.exception(
                'Problem when reading contents metadata.')
            if self.rescheduling_task:
                self.log.warn('Rescheduling batch')
                self.rescheduling_task.delay(sha1s, policy_update)


class RevisionIndexer(BaseIndexer):
    """An object-type indexer that inherits from BaseIndexer and
    implements the indexing process for revisions with the `run`
    method.

    Note: the RevisionIndexer is not directly instantiable. To use it
    in another context, one should refer to the instructions in the
    BaseIndexer.

    """
    def run(self, sha1_gits, policy_update):
        """Given a list of sha1_gits:
        - retrieve the revisions from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            sha1_gits ([bytes]): list of revision sha1_git identifiers
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        """
        results = []
        try:
            for sha1_git in sha1_gits:
                try:
                    revs = self.storage.revision_get([sha1_git])
                except ValueError:
                    self.log.warn('Revision %s not found in storage' %
                                  hashutil.hash_to_hex(sha1_git))
                    continue
                for rev in revs:
                    if rev:  # If no revision, skip it
                        res = self.index(rev)
-                        print(res)
                        if res:  # If no results, skip it
                            results.append(res)
            self.persist_index_computations(results, policy_update)
        except Exception:
            self.log.exception(
                'Problem when processing revision')
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 7211e1d..2491c0d 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,259 +1,262 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.indexer.indexer import ContentIndexer, RevisionIndexer
from swh.indexer.metadata_dictionary import compute_metadata
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict

from swh.model import hashutil
-from swh.objstorage.exc import ObjNotFoundError


class ContentMetadataIndexer(ContentIndexer):
    """Indexer at content level in charge of:
    - filtering out content already indexed in content_metadata
    - reading content from objstorage with the content's id sha1
    - computing translated_metadata by the given context
    - using the metadata_dictionary as the 'swh-metadata-translator'
      tool
    - storing the result in the content_metadata table

    """
    CONFIG_BASE_FILENAME = 'indexer/metadata'

-    ADDITIONAL_CONFIG = {
-        'tools': ('dict', {
-            'name': 'swh-metadata-translator',
-            'version': '0.0.1',
-            'configuration': {
-                'type': 'local',
-                'context': 'npm'
-            },
-        }),
-    }
-
-    def __init__(self):
-        self.config = self.parse_config_file(
-            config_filename="~/.config/swh/storage.yml",
-            additional_configs=[self.ADDITIONAL_CONFIG])
+    def __init__(self, tool):
+        self.tool = tool
        super().__init__()

    def prepare(self):
        super().prepare()
        self.results = []

+    def retrieve_tools_information(self):
+        self.config['tools'] = self.tool
+        return super().retrieve_tools_information()
+
    def filter(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_metadata_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tools['id'],
            } for sha1 in sha1s
        ))

    def index(self, sha1, raw_content):
        """Index the sha1's content and store the result.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            result (dict): representing a content_metadata entry. If
            the translation wasn't successful, the
            'translated_metadata' key is kept as None.

        """
        result = {
            'id': sha1,
            'indexer_configuration_id': self.tools['id'],
            'translated_metadata': None
        }
        try:
-            context = self.tools['configuration']['context']
+            context = self.tools['tool_configuration']['context']
            result['translated_metadata'] = compute_metadata(
                context, raw_content)
            # keep the result on the indexer object so that
            # get_results() can expose it after run()
            self.results.append(result)
        except:
            self.log.exception(
                "Problem during tool retrieval of metadata translation")
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata dicts with the
            following keys:
              - id (bytes): content's identifier (sha1)
              - translated_metadata (jsonb): detected metadata
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        """
        self.storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def get_results(self):
        """Return the results computed by this indexer.

        Can only be called after the run method has been called.

        Returns:
            results (list): list of content_metadata entries computed
            by the current indexer

        """
        return self.results

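A minimal usage sketch for the refactored constructor; the tool dict mirrors the one RevisionMetadataIndexer builds further down, and the sha1 list is a placeholder:

    # Hypothetical driver for ContentMetadataIndexer (illustration only;
    # instantiation still reads the indexer configuration file).
    tool = {
        'name': 'swh-metadata-translator',
        'version': '0.0.1',
        'configuration': {'type': 'local', 'context': 'npm'},
    }
    c_indexer = ContentMetadataIndexer(tool)
    c_indexer.run(sha1s, policy_update='ignore-dups')  # sha1s: content ids
    translated = c_indexer.get_results()  # list of content_metadata dicts
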
class RevisionMetadataIndexer(RevisionIndexer):
    """Indexer at revision level in charge of:
    - filtering revisions already indexed in the revision_metadata
      table with the defined computation tool
    - retrieving all entry_files in the root directory
    - using the metadata_detector for file_names containing metadata
    - computing the metadata translation if necessary and possible
      (depends on the tool)
    - sending sha1s to content indexing if possible
    - storing the results for the revision

    """
+    CONFIG_BASE_FILENAME = 'indexer/metadata'
+
    ADDITIONAL_CONFIG = {
-        'destination_queue': ('str', None),
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
-            'version': '0.1',
+            'version': '0.0.1',
            'configuration': {
                'type': 'local',
-                'contexts': ['npm']
+                'context': ['npm', 'codemeta']
            },
        }),
    }

    def prepare(self):
        super().prepare()
-        self.tools = self.retrieve_tools_information()
-        print(self.tools)

    def filter(self, sha1_gits):
        """Filter out known sha1_gits and return only missing ones.

        """
        yield from self.storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tools['id'],
            } for sha1_git in sha1_gits
        ))

    def index(self, rev):
        """Index rev by processing it and organizing the result.

        Uses the metadata_detector to iterate on filenames:
        - if one filename is detected -> sends the file to the content
          indexer
        - if multiple files are detected -> translation is needed at
          revision level

        Args:
            rev (dict): revision artifact from storage

        Returns:
            A dict, representing a revision_metadata, with keys:
              - id (bytes): rev's identifier (sha1_git)
              - indexer_configuration_id: tool used
              - translated_metadata: dict of retrieved metadata

        """
        try:
            result = {
                'id': rev['id'],
                'indexer_configuration_id': self.tools['id'],
                'translated_metadata': None
            }

            root_dir = rev['directory']
-            dir_ls = self.storage.directory_ls(root_dir, recursive=True)
+            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = (entry for entry in dir_ls if entry['type'] == 'file')
            detected_files = detect_metadata(files)
            result['translated_metadata'] = self.translate_revision_metadata(
                detected_files)
        except Exception as e:
            self.log.exception(
-                'Problem when indexing rev: ', e)
+                'Problem when indexing rev')
            print(e)
-
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata dicts with the
            following keys:
              - id (bytes): revision's identifier (sha1_git)
              - translated_metadata: dict of detected metadata
            policy_update (str): either 'update-dups' or 'ignore-dups'
            to respectively update or ignore duplicates

        """
-        self.storage.revision_metadata_add(
-            results, conflict_update=(policy_update == 'update-dups'))
+        # TODO: add functions in storage to keep data in
+        # revision_metadata
+        # self.storage.revision_metadata_add(
+        #     results, conflict_update=(policy_update == 'update-dups'))
+        pass

    def translate_revision_metadata(self, detected_files):
        """Determine the plan of action to translate metadata, given
        one or multiple detected files:

        Args:
            detected_files (dict): dict mapping context names to lists
            of sha1s (e.g. {'npm': [sha1_1, sha1_2],
                            'authors': [sha1_3]})

        Returns:
            translated_metadata (dict): dict with the CodeMeta
            vocabulary

        """
-        print(detected_files)
        translated_metadata = []
-
+        tool = {
+            'name': 'swh-metadata-translator',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'local',
+                'context': None
+            },
+        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        for context in detected_files.keys():
+            tool['configuration']['context'] = context
            for sha1 in detected_files[context]:
-                try:
-                    raw_content = self.objstorage.get(sha1)
-                except ObjNotFoundError:
-                    self.log.warn('Content %s not found in objstorage' %
-                                  hashutil.hash_to_hex(sha1))
-                # send raw_content to the 'swh-metadata-translator'
-                local_metadata = compute_metadata(context, raw_content)
-                # aggregating metadata
-                translated_metadata.append(local_metadata)
-        # for now this method doesn't call the ContentMetadataIndexer
-        # due to a configuration issue that should be resolved with a
-        # better configuration management plan; it should look like:
-        #####################################################################
-        # send sha1s to ContentMetadataIndexer
-        # c_metadata_indexer = ContentMetadataIndexer()
-        # c_metadata_indexer.run(sha1s, policy_update='ignore-dups')
-        # translated_metadata = c_metadata_indexer.get_results()
-        #####################################################################
-        # open questions:
-        # do we keep at revision level the translated_metadata of one file?
-        # we have a key in the swh-metadata-translator named 'other'
-        # to keep undefined categories, should we delete this?
-        extract_minimal_metadata_dict(translated_metadata)
-        return translated_metadata
+                local_metadata = {}
+                # fetch content_metadata from storage
+                metadata_generator = self.storage.content_metadata_get(
+                    [sha1])
+                metadata_generated = False
+                for c in metadata_generator:
+                    metadata_generated = True
+                    # extract the translated_metadata
+                    local_metadata = c['translated_metadata']
+                if not metadata_generated:
+                    # schedule indexation of content
+                    try:
+                        c_metadata_indexer = ContentMetadataIndexer(tool)
+                        c_metadata_indexer.run([sha1],
+                                               policy_update='ignore-dups')
+                        local_metadata = c_metadata_indexer.get_results()
+                    except Exception as e:
+                        self.log.warn('indexing content %s with '
+                                      'ContentMetadataIndexer raised an '
+                                      'exception' %
+                                      hashutil.hash_to_hex(sha1))
+                        print(e)
+                # aggregate the local metadata
+                if local_metadata:
+                    translated_metadata.append(local_metadata)
+        # reduce translated_metadata to a minimal set of terms with
+        # swh-metadata-detector
+        min_metadata = extract_minimal_metadata_dict(translated_metadata)
+        return min_metadata

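To keep the data flow readable, a hedged illustration of the shapes this method consumes and produces (hash values and metadata below are made up):

    # Shapes handled by translate_revision_metadata (placeholder values).
    detected_files = {
        'npm': [b'\x01' * 20],       # sha1s of package.json contents
        'codemeta': [b'\x02' * 20],  # sha1s of codemeta.json contents
    }
    # Each content yields one translated_metadata dict, e.g.:
    #   {'name': 'npm', 'version': '5.0.3', ...}
    # and extract_minimal_metadata_dict reduces the aggregated list to:
    #   {'name': ['npm'], 'version': ['5.0.3'], 'author': None, ...}
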

def main():
    rev_metadata_indexer = RevisionMetadataIndexer()
    sha1_git1 = hashutil.hash_to_bytes(
        '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
    sha1_git2 = hashutil.hash_to_bytes(
-        '8dbb6aeb036e7fd80664eb8bfd1507881af1ba94')
-    sha1_gits = [sha1_git1, sha1_git2]
+        '026040ea79dec1b49b4e3e7beda9132b6b26b51b')
+    sha1_git3 = hashutil.hash_to_bytes(
+        '9699072e21eded4be8d45e3b8d543952533fa190')
+    sha1_gits = [sha1_git1, sha1_git2, sha1_git3]
    rev_metadata_indexer.run(sha1_gits, 'update-dups')


if __name__ == '__main__':
    main()
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
new file mode 100644
index 0000000..ceb4a53
--- /dev/null
+++ b/swh/indexer/metadata_detector.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+mapping_filenames = {
+    b"package.json": "npm",
+    b"codemeta.json": "codemeta"
+}
+
+
+def detect_metadata(files):
+    """Detect files potentially containing metadata.
+
+    Args:
+        files (list): list of file entries
+
+    Returns:
+        An empty dict if nothing was found; otherwise a dict
+        {mapping_filenames[name]: [f['sha1'], ...]} mapping each
+        detected context to the matching sha1s.
+    """
+    results = {}
+    for f in files:
+        name = f['name'].lower().strip()
+        # TODO: possibility to detect extensions
+        if name in mapping_filenames:
+            tool = mapping_filenames[name]
+            if tool in results:
+                results[tool].append(f['sha1'])
+            else:
+                results[tool] = [f['sha1']]
+    return results
+
+
+def extract_minimal_metadata_dict(metadata_list):
+    """Extract a minimal set of terms from a list of translated
+    metadata.
+
+    Every item in metadata_list is a dict of translated_metadata in
+    the CodeMeta vocabulary; we wish to extract a minimal set of terms
+    and keep all values corresponding to each term.
+
+    Args:
+        metadata_list (list): list of dicts of translated_metadata
+
+    Returns:
+        minimal_dict (dict): one dict with selected values of the
+        metadata
+    """
+    minimal_dict = {
+        "developmentStatus": [],
+        "version": [],
+        "operatingSystem": [],
+        "description": [],
+        "keywords": [],
+        "issueTracker": [],
+        "name": [],
+        "author": [],
+        "relatedLink": [],
+        "url": [],
+        "type": [],
+        "license": [],
+        "maintainer": [],
+        "email": [],
+        "softwareRequirements": [],
+        "identifier": []
+    }
+    for term in minimal_dict.keys():
+        for metadata_dict in metadata_list:
+            if term in metadata_dict:
+                minimal_dict[term].append(metadata_dict[term])
+        if not minimal_dict[term]:
+            minimal_dict[term] = None
+    return minimal_dict
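A quick, hedged illustration of these two helpers; the file entries and sha1 values are made up:

    # Usage sketch for detect_metadata / extract_minimal_metadata_dict.
    from swh.indexer.metadata_detector import (
        detect_metadata, extract_minimal_metadata_dict)

    files = [
        {'name': b'PACKAGE.JSON', 'sha1': b'\x01' * 20, 'type': 'file'},
        {'name': b'readme.md', 'sha1': b'\x02' * 20, 'type': 'file'},
    ]
    print(detect_metadata(files))
    # -> {'npm': [b'\x01...']}  (filename matching is case-insensitive)

    print(extract_minimal_metadata_dict([{'name': 'npm',
                                          'version': '5.0.3'}]))
    # -> {'name': ['npm'], 'version': ['5.0.3'], 'author': None, ...}
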
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 630fd68..a8d3bbe 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,278 +1,244 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest
import logging

from nose.tools import istest

from swh.indexer.metadata_dictionary import compute_metadata
from swh.indexer.metadata import ContentMetadataIndexer
+from swh.indexer.metadata import RevisionMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage


class MockStorage():
    """Mock storage to simplify reading indexers' outputs.
    """
+    def content_metadata_get(self, sha1s):
+        yield
+
    def content_metadata_add(self, metadata, conflict_update=None):
        self.state = metadata
        self.conflict_update = conflict_update

    def revision_metadata_add(self, metadata, conflict_update=None):
        self.state = metadata
        self.conflict_update = conflict_update

    def indexer_configuration_get(self, tool):
        if tool['tool_name'] == 'swh-metadata-translator':
            return {
                'id': 30,
-                'name': 'swh-metadata-translator',
-                'version': '0.1',
-                'configuration': {
+                'tool_name': 'swh-metadata-translator',
+                'tool_version': '0.0.1',
+                'tool_configuration': {
                    'type': 'local',
                    'context': 'npm'
                },
            }
        elif tool['tool_name'] == 'swh-metadata-detector':
            return {
                'id': 7,
-                'name': 'swh-metadata-detector',
-                'version': '0.1',
-                'configuration': {
+                'tool_name': 'swh-metadata-detector',
+                'tool_version': '0.0.1',
+                'tool_configuration': {
                    'type': 'local',
                    'context': 'npm'
                },
            }

-    def revision_missing(self, revisions, cur=None):
-        pass
-
-    def revision_get(self, revisions):
-        """Get all revisions from storage.
-
-        Args: an iterable of revision ids
-
-        Returns: an iterable of revisions as dictionaries
-        (or None if the revision doesn't exist)
-
-        """
-        pass
-
-    def directory_get(self,
-                      directories,
-                      cur=None):
-        """Get information on directories.
-
-        Args:
-        - directories: an iterable of directory ids
-
-        Returns:
-        List of directories as dicts with keys and associated values.
-
-        """
-        pass
-
-    def directory_ls(self, directory, recursive=False, cur=None):
-        """Get entries for one directory.
-
-        Args:
-        - directory: the directory to list entries from.
-        - recursive: if flag on, list recursively from this directory.
-
-        Returns:
-        List of entries for such a directory.
-
-        """
-        pass
-

-class TestMetadataIndexer(ContentMetadataIndexer):
+class TestContentMetadataIndexer(ContentMetadataIndexer):
    """Specific Metadata indexer whose configuration is enough to
       satisfy the indexing tests.
    """
    def prepare(self):
        self.config = {
            'rescheduling_task': None,
-            'tools': {
-                'name': 'swh-metadata-translator',
-                'version': '0.0.1',
-                'configuration': {
-                    'type': 'local',
-                    'context': 'npm'
-                }
-            }
        }
        self.storage = MockStorage()
        self.log = logging.getLogger('swh.indexer')
        self.objstorage = MockObjStorage()
        self.task_destination = None
        self.rescheduling_task = self.config['rescheduling_task']
        self.tools = self.retrieve_tools_information()
        self.results = []


-# class TestRevisionMetadataIndexer(RevsionMetadataIndexer):
-#     """Specific indexer whose configuration is enough to satisfy the
-#        indexing tests.
-#     """
-#     def prepare(self):
-#         self.config = {
-#             'rescheduling_task': None,
-#             'tools': {
-#                 'name': 'swh-metadata-detector',
-#                 'version': '0.0.1',
-#                 'configuration': {
-#                     'type': 'local',
-#                     'context': 'npm'
-#                 }
-#             }
-#         }
-#         self.storage = MockStorage()
-#         self.log = logging.getLogger('swh.indexer')
-#         self.objstorage = MockObjStorage()
-#         self.task_destination = None
-#         self.rescheduling_task = self.config['rescheduling_task']
-#         self.tools = self.retrieve_tools_information()
-#         self.results = []
+class TestRevisionMetadataIndexer(RevisionMetadataIndexer):
+    """Specific indexer whose configuration is enough to satisfy the
+       indexing tests.
+    """
+    def prepare(self):
+        self.config = {
+            'rescheduling_task': None,
+        }
+        self.storage = MockStorage()
+        self.log = logging.getLogger('swh.indexer')
+        self.objstorage = MockObjStorage()
+        self.task_destination = None
+        self.rescheduling_task = self.config['rescheduling_task']
+        self.tools = self.retrieve_tools_information()
+        self.results = []


class Metadata(unittest.TestCase):
    """Tests the metadata_mock_tool for metadata detection.
    """
    def setUp(self):
        """Show the entire diff in the results.
        """
        self.maxDiff = None
+        self.content_tool = {
+            'name': 'swh-metadata-translator',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'local',
+                'context': 'npm'
+            }
+        }
+        self.revision_tool = {
+            'name': 'swh-metadata-detector',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'local',
+                'context': 'npm'
+            }
+        }

    @istest
    def test_compute_metadata_none(self):
        """Empty content should translate to no (None) metadata.
        """
        # given
        content = b""
        context = "npm"
        # None if no metadata was found or an error occurred
        declared_metadata = None
        # when
        result = compute_metadata(context, content)
        # then
        self.assertEqual(declared_metadata, result)

    @istest
    def test_compute_metadata_npm(self):
        """Test only the computation of metadata with hard_mapping_npm.
        """
        # given
        content = b"""
            {
                "name": "test_metadata",
                "version": "0.0.1",
                "description": "Simple package.json test for indexer",
                "repository": {
                    "type": "git",
                    "url": "https://github.com/moranegg/metadata_test"
                }
            }
        """
        declared_metadata = {
            'name': 'test_metadata',
            'version': '0.0.1',
            'description': 'Simple package.json test for indexer',
            'codeRepository': {
                'type': 'git',
                'url': 'https://github.com/moranegg/metadata_test'
            },
            'other': {}
        }

        # when
        result = compute_metadata("npm", content)

        # then
        self.assertEqual(declared_metadata, result)

    @istest
    def test_index_content_metadata_npm(self):
        """Test the NPM translation with package.json; one sha1 uses a
        file that can't be translated to metadata and should return
        None in the translated metadata.
        """
        # given
        sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
                 'd4c647f0fc257591cc9ba1722484229780d1c607',
                 '02fb2c89e14f7fab46701478c83779c7beb7b069']
        # this metadata indexer computes only metadata for package.json
        # in npm context with a hard mapping
-        metadata_indexer = TestMetadataIndexer()
+        metadata_indexer = TestContentMetadataIndexer(self.content_tool)

        # when
        metadata_indexer.run(sha1s, policy_update='ignore-dups')
        results = metadata_indexer.storage.state

        expected_results = [{
            'indexer_configuration_id': 30,
            'translated_metadata': {
                'other': {},
                'codeRepository': {
                    'type': 'git',
                    'url': 'https://github.com/moranegg/metadata_test'
                },
                'description': 'Simple package.json test for indexer',
                'name': 'test_metadata',
                'version': '0.0.1'
            },
            'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
        }, {
            'indexer_configuration_id': 30,
            'translated_metadata': {
                'softwareRequirements': {
                    'JSONStream': '~1.3.1',
                    'abbrev': '~1.1.0',
                    'ansi-regex': '~2.1.1',
                    'ansicolors': '~0.3.2',
                    'ansistyles': '~0.1.3'
                },
                'issueTracker': {
                    'url': 'https://github.com/npm/npm/issues'
                },
                'author': 'Isaac Z. Schlueter (http://blog.izs.me)',
                'codeRepository': {
                    'type': 'git',
                    'url': 'https://github.com/npm/npm'
                },
                'description': 'a package manager for JavaScript',
                'softwareSuggestions': {
                    'tacks': '~1.2.6',
                    'tap': '~10.3.2'
                },
                'license': 'Artistic-2.0',
                'version': '5.0.3',
                'other': {
                    'preferGlobal': True,
                    'config': {
                        'publishtest': False
                    }
                },
                'name': 'npm',
                'keywords': [
                    'install',
                    'modules',
                    'package manager',
                    'package.json'
                ],
                'url': 'https://docs.npmjs.com/'
            },
            'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
        }, {
            'indexer_configuration_id': 30,
            'translated_metadata': None,
            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
        }]
        # The assertion below returns False sometimes because of
        # nested lists
        self.assertEqual(expected_results, results)