diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index 33999d3..bc37773 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,383 +1,383 @@
 # Copyright (C) 2016-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import abc
 import os
 import logging
 import shutil
 import tempfile

 from swh.core.config import SWHConfig
 from swh.objstorage import get_objstorage
 from swh.objstorage.exc import ObjNotFoundError
 from swh.model import hashutil
 from swh.storage import get_storage
 from swh.scheduler.utils import get_task


 class DiskIndexer:
     """Mixin intended to be used with other *Indexer classes.

     Indexers inheriting from this class are a category of indexers
     that need the disk for their computations.

     Expects:
         self.working_directory variable defined at runtime.

     """
     def __init__(self):
         super().__init__()

     def write_to_temp(self, filename, data):
         """Write the sha1's content in a temporary file.

         Args:
             filename (str): one of sha1's many filenames
             data (bytes): the sha1's content to write in a temporary file

         Returns:
             The path to the temporary file created. That file is
             filled in with the raw content's data.

         """
         os.makedirs(self.working_directory, exist_ok=True)
         temp_dir = tempfile.mkdtemp(dir=self.working_directory)
         content_path = os.path.join(temp_dir, filename)

         with open(content_path, 'wb') as f:
             f.write(data)

         return content_path

     def cleanup(self, content_path):
         """Remove content_path from the working directory.

         Args:
             content_path (str): the file to remove

         """
         temp_dir = os.path.dirname(content_path)
         shutil.rmtree(temp_dir)


 class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
     """Base class for indexers to inherit from.

     The main entry point is the `run` function, which is in charge of
     triggering the computations on the batch of ids received.

     Indexers can:

     - filter out ids whose data has already been indexed.
     - retrieve ids' data from storage or objstorage
     - index this data depending on the object and store the result in
       storage.

     To implement a new object type indexer, inherit from the
     BaseIndexer and implement the indexing process:

     - def run(self, object_ids, policy_update): object_ids are
       different depending on the object. For example: sha1 for
       content, sha1_git for revision, directory, release, and id for
       origin

     To implement a new concrete indexer, inherit from the object
     level classes: ContentIndexer, RevisionIndexer (later on,
     OriginIndexer will also be available).

     Then you need to implement the following functions:

     - def filter(self, ids): filter out data already indexed (in
       storage). This function is used by the orchestrator and not
       directly by the indexer
       (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer).

     - def index_object(self, id, data): compute the index on id with
       data (retrieved from storage or objstorage by the id key) and
       return the resulting index computation.

     - def persist_index_computations(self, results, policy_update):
       persist the results of multiple index computations in storage.

     The new indexer implementation can also override the following
     functions:

     - def prepare(self): configuration preparation for the indexer.
       When overriding, this must call the super().prepare() function.

     - def check(self): configuration check for the indexer. When
       overriding, this must call the super().check() function.

     - def retrieve_tools_information(self): this should return a dict
       of the tool(s) to use when indexing or filtering.

     """
     CONFIG = 'indexer/base'

     DEFAULT_CONFIG = {
         'storage': ('dict', {
             'host': 'uffizi',
             'cls': 'remote',
             'args': {'root': '/tmp/softwareheritage/objects',
                      'slicing': '0:2/2:4/4:6'}
         }),
         # queue to reschedule if problem (none for no rescheduling,
         # the default)
         'rescheduling_task': ('str', None),
         'objstorage': ('dict', {
             'cls': 'multiplexer',
             'args': {
                 'objstorages': [{
                     'cls': 'filtered',
                     'args': {
                         'storage_conf': {
                             'cls': 'azure-storage',
                             'args': {
                                 'account_name': '0euwestswh',
                                 'api_secret_key': 'secret',
                                 'container_name': 'contents'
                             }
                         },
                         'filters_conf': [
                             {'type': 'readonly'},
                             {'type': 'prefix', 'prefix': '0'}
                         ]
                     }
                 }, {
                     'cls': 'filtered',
                     'args': {
                         'storage_conf': {
                             'cls': 'azure-storage',
                             'args': {
                                 'account_name': '1euwestswh',
                                 'api_secret_key': 'secret',
                                 'container_name': 'contents'
                             }
                         },
                         'filters_conf': [
                             {'type': 'readonly'},
                             {'type': 'prefix', 'prefix': '1'}
                         ]
                     }
                 }]
             },
         }),
     }

     ADDITIONAL_CONFIG = {}

     def __init__(self):
         """Prepare and check that the indexer is ready to run.

         """
         super().__init__()
         self.prepare()
         self.check()

     def prepare(self):
         """Prepare the indexer's needed runtime configuration.
            Without this step, the indexer cannot possibly run.

         """
         self.config = self.parse_config_file(
             additional_configs=[self.ADDITIONAL_CONFIG])
         objstorage = self.config['objstorage']
         self.objstorage = get_objstorage(objstorage['cls'],
                                          objstorage['args'])
         storage = self.config['storage']
         self.storage = get_storage(storage['cls'], storage['args'])
         rescheduling_task = self.config['rescheduling_task']
         if rescheduling_task:
             self.rescheduling_task = get_task(rescheduling_task)
         else:
             self.rescheduling_task = None
         l = logging.getLogger('requests.packages.urllib3.connectionpool')
         l.setLevel(logging.WARN)
         self.log = logging.getLogger('swh.indexer')
         self.tools = self.retrieve_tools_information()

     def check(self):
         """Check that the indexer's configuration is ok before
            proceeding. If ok, does nothing. If not, raises an error.

         """
         if not self.tools:
             raise ValueError('Tools %s is unknown, cannot continue' %
                              self.config['tools'])

     def retrieve_tools_information(self):
         """Permit to define how to retrieve tool information based on
            configuration.

            Adds a sensible default which can be overridden if not
            sufficient. (For now, all indexers use only one tool.)

         """
         tool = {
             'tool_%s' % key: value
             for key, value in self.config['tools'].items()
         }
         return self.storage.indexer_configuration_get(tool)

     @abc.abstractmethod
     def filter(self, ids):
         """Filter missing ids for that particular indexer.

         Args:
             ids ([bytes]): list of ids

         Yields:
             iterator of missing ids

         """
         pass

     @abc.abstractmethod
     def index(self, id, data):
         """Index computation for the id and associated raw data.

         Args:
             id (bytes): identifier
             data (bytes): id's data from storage or objstorage
                 depending on object type

         Returns:
             a dict that makes sense for the
             persist_index_computations function.

         """
         pass

     @abc.abstractmethod
     def persist_index_computations(self, results, policy_update):
         """Persist the computation resulting from the index.

         Args:
             results ([result]): list of results. One result is the
                 result of the index function.
             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         Returns:
             None

         """
         pass

     def next_step(self, results):
         """Do something else with the computation results (e.g. send
            to another queue, ...).

            (This is not an abstractmethod since it is optional.)

         Args:
             results ([result]): list of results (dict) as returned by
                 the index function.

         Returns:
             None

         """
         pass

     @abc.abstractmethod
     def run(self, ids, policy_update):
         """Given a list of ids:

         - retrieve the data from storage
         - execute the indexing computations
         - store the results (according to policy_update)

         Args:
             ids ([bytes]): id's identifier list
             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         """
         pass


 class ContentIndexer(BaseIndexer):
     """An object type indexer; inherits from BaseIndexer and
     implements the indexing process for contents through its run
     method.

     Note: ContentIndexer is not instantiable as-is. To use it in
     another context, one should inherit from this class and override
     the methods mentioned in the BaseIndexer class.

     """
     def run(self, sha1s, policy_update):
         """Given a list of sha1s:

         - retrieve the content from storage
         - execute the indexing computations
         - store the results (according to policy_update)

         Args:
             sha1s ([bytes]): sha1's identifier list
             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         """
         results = []
         try:
             for sha1 in sha1s:
                 try:
                     raw_content = self.objstorage.get(sha1)
                 except ObjNotFoundError:
                     self.log.warn('Content %s not found in objstorage' %
                                   hashutil.hash_to_hex(sha1))
                     continue
                 res = self.index(sha1, raw_content)
                 if res:  # if no results, skip it
                     results.append(res)

             self.persist_index_computations(results, policy_update)
             self.next_step(results)
         except Exception:
             self.log.exception(
                 'Problem when reading contents metadata.')
             if self.rescheduling_task:
                 self.log.warn('Rescheduling batch')
                 self.rescheduling_task.delay(sha1s, policy_update)


 class RevisionIndexer(BaseIndexer):
     """An object type indexer; inherits from BaseIndexer and
     implements the indexing process for revisions through its run
     method.

     Note: RevisionIndexer is not instantiable as-is. To use it in
     another context, one should inherit from this class and override
     the methods mentioned in the BaseIndexer class.

     """
     def run(self, sha1_gits, policy_update):
         """Given a list of sha1_gits:

         - retrieve the revisions from storage
         - execute the indexing computations
         - store the results (according to policy_update)

         Args:
             sha1_gits ([bytes]): sha1_git's identifier list
             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         """
         results = []
         revs = self.storage.revision_get(sha1_gits)

         for rev in revs:
             if not rev:
-                self.log.warn('Revision %s not found in storage' %
-                              hashutil.hash_to_hex(sha1_gits))
+                self.log.warn('Revisions %s not found in storage' %
+                              list(map(hashutil.hash_to_hex, sha1_gits)))
                 continue
             try:
                 res = self.index(rev)
                 if res:  # if no results, skip it
                     results.append(res)
             except Exception:
                 self.log.exception(
                     'Problem when processing revision')
         self.persist_index_computations(results, policy_update)
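For illustration (not part of this patch): the contract that `BaseIndexer` describes above boils down to three methods on a concrete subclass, with `run()` inherited from the object-level class. A minimal sketch of a `ContentIndexer` subclass; the class name and the stubbed persistence step are made up for this example:

```python
from swh.indexer.indexer import ContentIndexer


class ContentLengthIndexer(ContentIndexer):
    """Toy indexer recording each content's length (hypothetical)."""

    def filter(self, ids):
        # A real indexer asks storage which ids are not yet indexed;
        # this sketch assumes everything is missing.
        yield from ids

    def index(self, id, data):
        # The returned dict must have the shape that
        # persist_index_computations expects.
        return {'id': id, 'length': len(data)}

    def persist_index_computations(self, results, policy_update):
        # A real implementation calls the matching storage endpoint,
        # e.g. some content_*_add(results, conflict_update=...) method.
        pass
```

Note that instantiating such a class still requires a valid indexer configuration, since `BaseIndexer.__init__` runs `prepare()` and `check()`.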
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index bf59e57..28a445c 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,275 +1,280 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import click

 from swh.indexer.indexer import ContentIndexer, RevisionIndexer
 from swh.indexer.metadata_dictionary import compute_metadata
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_detector import extract_minimal_metadata_dict
 from swh.model import hashutil


 class ContentMetadataIndexer(ContentIndexer):
     """Indexer at content level in charge of:

     - filtering out content already indexed in content_metadata
     - reading content from objstorage with the content's id sha1
     - computing translated_metadata for a given context
     - using metadata_dictionary as the 'swh-metadata-translator' tool
     - storing the result in the content_metadata table

     """
     CONFIG_BASE_FILENAME = 'indexer/metadata'

     def __init__(self, tool, config):
         self.tool = tool
         # roundabout way to reuse the exact same config as the
         # RevisionMetadataIndexer object that internally uses
         # ContentMetadataIndexer
         self.new_config = config
         super().__init__()

     def prepare(self):
         super().prepare()
         self.results = []
         if self.new_config['storage']:
             self.storage = self.new_config['storage']
         if self.new_config['objstorage']:
             self.objstorage = self.new_config['objstorage']

     def retrieve_tools_information(self):
         self.config['tools'] = self.tool
         return super().retrieve_tools_information()

     def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.

         """
         yield from self.storage.content_metadata_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': self.tools['id'],
             } for sha1 in sha1s
         ))

     def index(self, sha1, raw_content):
         """Index sha1's content and store the result.

         Args:
             sha1 (bytes): content's identifier
             raw_content (bytes): raw content in bytes

         Returns:
             result (dict): representing a content_metadata. If the
             translation wasn't successful, the translated_metadata
             key is kept as None.

         """
         result = {
             'id': sha1,
             'indexer_configuration_id': self.tools['id'],
             'translated_metadata': None
         }
         try:
             context = self.tools['tool_configuration']['context']
             result['translated_metadata'] = compute_metadata(
                 context, raw_content)
             # a roundabout way to keep the result with the indexer
             # object for get_results
             self.results.append(result)
         except Exception:
             self.log.exception(
                 "Problem during tool retrieval of metadata translation")
         return result

     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.

         Args:
             results ([dict]): list of content_metadata, dicts with
                 the following keys:

                 - id (bytes): content's identifier (sha1)
                 - translated_metadata (jsonb): detected metadata

             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         """
         self.storage.content_metadata_add(
             results, conflict_update=(policy_update == 'update-dups'))

     def get_results(self):
         """Can be called only if the run method was called before.

         Returns:
             results (list): list of content_metadata entries
             calculated by the current indexer

         """
         return self.results


 class RevisionMetadataIndexer(RevisionIndexer):
     """Indexer at revision level in charge of:

     - filtering revisions already indexed in the revision_metadata
       table with the defined computation tool
     - retrieving all entry_files in the root directory
     - using metadata_detector for file_names containing metadata
     - computing the metadata translation if necessary and possible
       (depends on tool)
     - sending sha1s to content indexing if possible
     - storing the results for the revision

     """
     CONFIG_BASE_FILENAME = 'indexer/metadata'

     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'swh-metadata-detector',
             'version': '0.0.1',
             'configuration': {
                 'type': 'local',
                 'context': ['npm', 'codemeta']
             },
         }),
     }

     def prepare(self):
         super().prepare()

     def filter(self, sha1_gits):
         """Filter out known sha1s and return only missing ones.

         """
         yield from self.storage.revision_metadata_missing((
             {
                 'id': sha1_git,
                 'indexer_configuration_id': self.tools['id'],
             } for sha1_git in sha1_gits
         ))

     def index(self, rev):
         """Index rev by processing it and organizing the result.

         Use metadata_detector to iterate on filenames:

         - if one filename is detected -> send the file to the content
           indexer
         - if multiple files are detected -> translation is needed at
           revision level

         Args:
             rev (bytes): revision artifact from storage

         Returns:
             A dict, representing a revision_metadata, with keys:

             - id (bytes): rev's identifier (sha1_git)
             - indexer_configuration_id (bytes): tool used
             - translated_metadata (bytes): dict of retrieved metadata

         """
         try:
             result = {
                 'id': rev['id'],
                 'indexer_configuration_id': self.tools['id'],
                 'translated_metadata': None
             }

             root_dir = rev['directory']
             dir_ls = self.storage.directory_ls(root_dir, recursive=False)
             files = (entry for entry in dir_ls if entry['type'] == 'file')
             detected_files = detect_metadata(files)
             result['translated_metadata'] = self.translate_revision_metadata(
                 detected_files)
         except Exception:
             self.log.exception(
                 'Problem when indexing rev')
         return result

     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.

         Args:
             results ([dict]): list of revision_metadata, dicts with
                 the following keys:

                 - id (bytes): revision's identifier (sha1_git)
                 - translated_metadata (jsonb): detected metadata

             policy_update ([str]): either 'update-dups' or
                 'ignore-dups' to respectively update duplicates or
                 ignore them

         """
         # TODO: add functions in storage to keep data in
         # revision_metadata
         self.storage.revision_metadata_add(
             results, conflict_update=(policy_update == 'update-dups'))

     def translate_revision_metadata(self, detected_files):
         """Determine the plan of action for translating metadata,
         given one or multiple detected files:

         Args:
             detected_files (dict): dict mapping context names to
                 lists of sha1s
                 (e.g. {'npm': [sha1_1, sha1_2], 'authors': [sha1_3]})

         Returns:
             translated_metadata (dict): dict with the CodeMeta
             vocabulary

         """
         translated_metadata = []
         tool = {
             'name': 'swh-metadata-translator',
             'version': '0.0.1',
             'configuration': {
                 'type': 'local',
                 'context': None
             },
         }
         # TODO: iterate on each context, on each file
         # -> get raw_contents
         # -> translate each content
         config = {
             'storage': self.storage,
             'objstorage': self.objstorage
         }
         for context in detected_files.keys():
             tool['configuration']['context'] = context
             c_metadata_indexer = ContentMetadataIndexer(tool, config)
-            # sha1s that aren't in the content_metadata table
-            sha1s_filtered = list(c_metadata_indexer.filter(
-                detected_files[context]))
+            # sha1s that are in the content_metadata table
+            sha1s_in_storage = []
+            metadata_generator = self.storage.content_metadata_get(
+                detected_files[context])
+            for c in metadata_generator:
+                # extracting translated_metadata
+                sha1 = c['id']
+                sha1s_in_storage.append(sha1)
+                local_metadata = c['translated_metadata']
+                # local metadata is aggregated
+                if local_metadata:
+                    translated_metadata.append(local_metadata)
+
+            sha1s_filtered = [item for item in detected_files[context]
+                              if item not in sha1s_in_storage]
+
             if sha1s_filtered:
-                print(sha1s_filtered)
                 # schedule indexation of content
                 try:
                     c_metadata_indexer.run(sha1s_filtered,
                                            policy_update='ignore-dups')
                     # on the fly possibility:
-                    local_metadata = c_metadata_indexer.get_results()
+                    results = c_metadata_indexer.get_results()
+
+                    for result in results:
+                        local_metadata = result['translated_metadata']
+                        translated_metadata.append(local_metadata)
+
                 except Exception as e:
                     self.log.warn('Exception while indexing content: %s', e)
-            sha1s_in_storage = [item for item in detected_files[context]
-                                if item not in sha1s_filtered]
-            # fetch from storage results that were skipped with filter
-            for sha1 in sha1s_in_storage:
-                local_metadata = {}
-                # fetch content_metadata from storage
-                metadata_generator = self.storage.content_metadata_get([sha1])
-                for c in metadata_generator:
-                    # extracting translated_metadata
-                    local_metadata = c['translated_metadata']
-                # local metadata is aggregated
-                if local_metadata:
-                    translated_metadata.append(local_metadata)
+
         # transform translated_metadata into a minimal set with
         # swh-metadata-detector
         min_metadata = extract_minimal_metadata_dict(translated_metadata)
         return min_metadata


 @click.command()
-@click.option('--revs_ids',
+@click.option('--revs', '-i',
               default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
                        '026040ea79dec1b49b4e3e7beda9132b6b26b51b',
                        '9699072e21eded4be8d45e3b8d543952533fa190'],
-              help='Default sha1_git to lookup')
-def main(revs_ids):
-    _git_sha1s = list(map(hashutil.hash_to_bytes, revs_ids))
+              help='Default sha1_git to lookup', multiple=True)
+def main(revs):
+    _git_sha1s = list(map(hashutil.hash_to_bytes, revs))
     rev_metadata_indexer = RevisionMetadataIndexer()
     rev_metadata_indexer.run(_git_sha1s, 'update-dups')


 if __name__ == '__main__':
     import logging
     logging.basicConfig(level=logging.INFO)
     main()
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
index ceb4a53..78599e0 100644
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -1,72 +1,73 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information


 mapping_filenames = {
     b"package.json": "npm",
     b"codemeta.json": "codemeta"
 }


 def detect_metadata(files):
     """Detect files potentially containing metadata.

     Args:
         files (list): list of file entries

     Returns:
         - empty dict if nothing was found
         - dictionary {mapping_filenames[name]: [f['sha1']]}

     """
     results = {}
     for f in files:
         name = f['name'].lower().strip()
         # TODO: possibility to detect extensions
         if name in mapping_filenames:
             tool = mapping_filenames[name]
             if tool in results:
                 results[tool].append(f['sha1'])
             else:
                 results[tool] = [f['sha1']]
     return results


 def extract_minimal_metadata_dict(metadata_list):
     """Every item in the metadata_list is a dict of translated_metadata
     in the CodeMeta vocabulary. We wish to extract a minimal set of
     terms and keep all values corresponding to each term.

     Args:
         metadata_list (list): list of dicts of translated_metadata

     Returns:
         minimal_dict (dict): one dict with selected values of metadata

     """
     minimal_dict = {
         "developmentStatus": [],
         "version": [],
         "operatingSystem": [],
         "description": [],
         "keywords": [],
         "issueTracker": [],
         "name": [],
         "author": [],
         "relatedLink": [],
         "url": [],
         "type": [],
         "license": [],
         "maintainer": [],
         "email": [],
         "softwareRequirements": [],
-        "identifier": []
+        "identifier": [],
+        "codeRepository": []
     }
     for term in minimal_dict.keys():
         for metadata_dict in metadata_list:
             if term in metadata_dict:
                 minimal_dict[term].append(metadata_dict[term])
         if not minimal_dict[term]:
             minimal_dict[term] = None
     return minimal_dict
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 44bdfeb..de17b61 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,291 +1,295 @@
 # Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest
 import logging

 from nose.tools import istest

 from swh.indexer.metadata_dictionary import compute_metadata
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata import ContentMetadataIndexer
 from swh.indexer.metadata import RevisionMetadataIndexer
 from swh.indexer.tests.test_utils import MockObjStorage
 from swh.indexer.tests.test_utils import MockStorage


 class TestContentMetadataIndexer(ContentMetadataIndexer):
     """Specific metadata indexer whose configuration is enough to
     satisfy the indexing tests.

     """
     def prepare(self):
         self.config = {
             'rescheduling_task': None,
         }
         self.storage = MockStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.task_destination = None
         self.rescheduling_task = self.config['rescheduling_task']
         self.tools = self.retrieve_tools_information()
         self.results = []


 class TestRevisionMetadataIndexer(RevisionMetadataIndexer):
     """Specific indexer whose configuration is enough to satisfy the
     indexing tests.

     """
     def prepare(self):
         self.config = {
             'rescheduling_task': None,
             'tools': {
                 'name': 'swh-metadata-detector',
                 'version': '0.0.1',
                 'configuration': {
                     'type': 'local',
                     'context': 'npm'
                 }
             }
         }
         self.storage = MockStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.task_destination = None
         self.rescheduling_task = self.config['rescheduling_task']
         self.tools = self.retrieve_tools_information()
         self.results = []


 class Metadata(unittest.TestCase):
     """Tests the metadata_mock_tool tool for metadata detection.

     """
     def setUp(self):
         """Show the entire diff in the test results.

         """
         self.maxDiff = None
         self.content_tool = {
             'name': 'swh-metadata-translator',
             'version': '0.0.1',
             'configuration': {
                 'type': 'local',
                 'context': 'npm'
             }
         }

     @istest
     def test_compute_metadata_none(self):
         """Testing that empty content returns None.

         """
         # given
         content = b""
         context = "npm"

         # None if no metadata was found or an error occurred
         declared_metadata = None

         # when
         result = compute_metadata(context, content)

         # then
         self.assertEqual(declared_metadata, result)

     @istest
     def test_compute_metadata_npm(self):
         """Testing only the computation of metadata with
         hard_mapping_npm.

         """
         # given
         content = b"""
             {
                 "name": "test_metadata",
                 "version": "0.0.1",
                 "description": "Simple package.json test for indexer",
                 "repository": {
                     "type": "git",
                     "url": "https://github.com/moranegg/metadata_test"
                 }
             }
         """
         declared_metadata = {
             'name': 'test_metadata',
             'version': '0.0.1',
             'description': 'Simple package.json test for indexer',
             'codeRepository': {
                 'type': 'git',
                 'url': 'https://github.com/moranegg/metadata_test'
             },
             'other': {}
         }

         # when
         result = compute_metadata("npm", content)

         # then
         self.assertEqual(declared_metadata, result)

     @istest
     def test_index_content_metadata_npm(self):
         """Testing NPM with package.json.

         - one sha1 uses a file that can't be translated to metadata
           and should return None in the translated metadata

         """
         # given
         sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
                  'd4c647f0fc257591cc9ba1722484229780d1c607',
                  '02fb2c89e14f7fab46701478c83779c7beb7b069']

         # this metadata indexer computes only metadata for package.json
         # in npm context with a hard mapping
         metadata_indexer = TestContentMetadataIndexer(self.content_tool, None)

         # when
         metadata_indexer.run(sha1s, policy_update='ignore-dups')
         results = metadata_indexer.storage.state

         expected_results = [{
             'indexer_configuration_id': 30,
             'translated_metadata': {
                 'other': {},
                 'codeRepository': {
                     'type': 'git',
                     'url': 'https://github.com/moranegg/metadata_test'
                 },
                 'description': 'Simple package.json test for indexer',
                 'name': 'test_metadata',
                 'version': '0.0.1'
             },
             'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
         }, {
             'indexer_configuration_id': 30,
             'translated_metadata': {
                 'softwareRequirements': {
                     'JSONStream': '~1.3.1',
                     'abbrev': '~1.1.0',
                     'ansi-regex': '~2.1.1',
                     'ansicolors': '~0.3.2',
                     'ansistyles': '~0.1.3'
                 },
                 'issueTracker': {
                     'url': 'https://github.com/npm/npm/issues'
                 },
                 'author': 'Isaac Z. Schlueter (http://blog.izs.me)',
                 'codeRepository': {
                     'type': 'git',
                     'url': 'https://github.com/npm/npm'
                 },
                 'description': 'a package manager for JavaScript',
                 'softwareSuggestions': {
                     'tacks': '~1.2.6',
                     'tap': '~10.3.2'
                 },
                 'license': 'Artistic-2.0',
                 'version': '5.0.3',
                 'other': {
                     'preferGlobal': True,
                     'config': {
                         'publishtest': False
                     }
                 },
                 'name': 'npm',
                 'keywords': [
                     'install',
                     'modules',
                     'package manager',
                     'package.json'
                 ],
                 'url': 'https://docs.npmjs.com/'
             },
             'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
         }, {
             'indexer_configuration_id': 30,
             'translated_metadata': None,
             'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
         }]

         # The assertion below sometimes fails because of nested lists
         self.assertEqual(expected_results, results)

     @istest
     def test_detect_metadata_package_json(self):
         # given
         df = [{
             'sha1_git': b'abc',
             'name': b'index.js',
             'target': b'abc',
             'length': 897,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'dir_a',
             'sha1': b'bcd'
         }, {
             'sha1_git': b'aab',
             'name': b'package.json',
             'target': b'aab',
             'length': 712,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'dir_a',
             'sha1': b'cde'
         }]

         # when
         results = detect_metadata(df)

         expected_results = {
             'npm': [
                 b'cde'
             ]
         }
         # then
         self.assertEqual(expected_results, results)

     @istest
     def test_revision_metadata_indexer(self):
         metadata_indexer = TestRevisionMetadataIndexer()

         sha1_gits = [
             b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
         ]
         metadata_indexer.run(sha1_gits, 'update-dups')

         results = metadata_indexer.storage.state

         expected_results = [{
             'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
             'translated_metadata': {
                 'identifier': None,
                 'maintainer': None,
                 'url': [
                     'https://github.com/librariesio/yarn-parser#readme'
                 ],
+                'codeRepository': [{
+                    'type': 'git',
+                    'url': 'git+https://github.com/librariesio/yarn-parser.git'
+                }],
                 'author': ['Andrew Nesbitt'],
                 'license': ['AGPL-3.0'],
                 'version': ['1.0.0'],
                 'description': [
                     'Tiny web service for parsing yarn.lock files'
                 ],
                 'relatedLink': None,
                 'developmentStatus': None,
                 'operatingSystem': None,
                 'issueTracker': [{
                     'url': 'https://github.com/librariesio/yarn-parser/issues'
                 }],
                 'softwareRequirements': [{
                     'express': '^4.14.0',
                     'yarn': '^0.21.0',
                     'body-parser': '^1.15.2'
                 }],
                 'name': ['yarn-parser'],
                 'keywords': [['yarn', 'parse', 'lock', 'dependencies']],
                 'type': None,
                 'email': None
             },
             'indexer_configuration_id': 7
         }]
         # then
         self.assertEqual(expected_results, results)
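Since `--revs` is now declared with `multiple=True`, the CLI collects repeated `-i` flags into a tuple rather than taking a single list value. A sketch of the invocation shape using click's test runner (not part of the patch; it assumes a working indexer configuration is in place, since `main` instantiates `RevisionMetadataIndexer`):

```python
from click.testing import CliRunner

from swh.indexer.metadata import main

runner = CliRunner()
# Each -i adds one sha1_git hex string to the `revs` tuple.
result = runner.invoke(main, [
    '-i', '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
    '-i', '026040ea79dec1b49b4e3e7beda9132b6b26b51b',
])
```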