diff --git a/README b/README
index b8c009c..5c16501 100644
--- a/README
+++ b/README
@@ -1,79 +1,82 @@
swh-indexer
===========

Tools to compute multiple indexes on SWH's raw contents:
-- mimetype
-- ctags
-- language
-- fossology-license
-- metadata
+- content:
+  - mimetype
+  - ctags
+  - language
+  - fossology-license
+  - metadata
+- revision:
+  - metadata

# Context

SWH currently stores around 3B contents. The table `content` holds
their checksums.

Those contents are physically stored in an object storage (using disks)
and replicated in another. Those object storages are not yet meant to
be read from.

We are in the process of copying those contents over to azure's blob
storages. We will use that opportunity to trigger the computations on
those contents once they have been copied over.

# Workers

There are 2 kinds:
- orchestrators (orchestrator, orchestrator-text)
- indexers (mimetype, language, ctags, fossology-license)

## Orchestrator

Orchestrators:
- receive batches of sha1s
- split those batches
- broadcast those to indexers

There are 2 sorts:
- orchestrator (swh_indexer_orchestrator_content_all): receives and
  broadcasts sha1 ids (of contents) to indexers (currently only the
  mimetype indexer)
- orchestrator-text (swh_indexer_orchestrator_content_text): receives
  batches of sha1 ids (of textual contents) and broadcasts those to
  indexers (currently the language, ctags, and fossology-license
  indexers)

## Indexers

Indexers:
- receive batches of ids
- retrieve the associated data depending on the object type
- compute an index for that object
- store the result in swh's storage
- (and possibly do some broadcasting themselves)

Current content indexers:

- mimetype (queue swh_indexer_content_mimetype): compute the mimetype,
  filter out the textual contents and broadcast the list to the
  orchestrator-text

- language (queue swh_indexer_content_language): detect the
  programming language

- ctags (queue swh_indexer_content_ctags): try to compute tags
  information

- fossology-license (queue swh_indexer_fossology_license): try to
  compute the license

- metadata: translate a file into a translated_metadata dict

Current revision indexers:

- metadata: detect files containing metadata, then retrieve the
  translated_metadata from the content_metadata table in storage, or
  run the content indexer to translate the files
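
The orchestrator behaviour described above (receive a batch of sha1s,
split it, broadcast the pieces) can be summarized with a small sketch;
the batch size and the send_task callable are hypothetical stand-ins
for the real queue machinery, not part of swh-indexer:

    def split_batch(sha1s, batch_size=10):
        """Split a list of sha1s into smaller batches."""
        for i in range(0, len(sha1s), batch_size):
            yield sha1s[i:i + batch_size]

    def broadcast(sha1s, indexer_task_names, send_task):
        """Send each sub-batch to every registered indexer task."""
        for batch in split_batch(sha1s):
            for task_name in indexer_task_names:
                send_task(task_name, batch)
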
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index b18f954..78e58e0 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,385 +1,384 @@
# Copyright (C) 2016-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import os
import logging
import shutil
import tempfile

from swh.core.config import SWHConfig
from swh.objstorage import get_objstorage
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
from swh.storage import get_storage
from swh.scheduler.utils import get_task


class DiskIndexer:
    """Mixin intended to be used with other *Indexer classes.

    Indexers inheriting from this class are a category of indexers
    which need the disk for their computations.

    Expects:
        self.working_directory variable defined at runtime.

    """
    def __init__(self):
        super().__init__()

    def write_to_temp(self, filename, data):
        """Write the sha1's content in a temporary file.

        Args:
            filename (str): one of sha1's many filenames
            data (bytes): the sha1's content to write in a temporary
            file

        Returns:
            The path to the temporary file created. That file is
            filled in with the raw content's data.

        """
        os.makedirs(self.working_directory, exist_ok=True)
        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
        content_path = os.path.join(temp_dir, filename)

        with open(content_path, 'wb') as f:
            f.write(data)

        return content_path

    def cleanup(self, content_path):
        """Remove content_path from the working directory.

        Args:
            content_path (str): the file to remove

        """
        temp_dir = os.path.dirname(content_path)
        shutil.rmtree(temp_dir)


class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
    """Base class for indexers to inherit from.

    The main entry point is the `run` function, which is in charge of
    triggering the computations on the batch of ids received.

    Indexers can:
    - filter out ids whose data has already been indexed.
    - retrieve ids' data from storage or objstorage
    - index this data depending on the object and store the result in
      storage.

    To implement a new object type indexer, inherit from the
    BaseIndexer and implement the process of indexation:

    - def run(self, object_ids, policy_update): object_ids are
      different depending on the object. For example: sha1 for
      content, sha1_git for
-      revision, directorie, release, and id for origin
+      revision, directory, release, and id for origin

    To implement a new concrete indexer, inherit from the object level
    classes: ContentIndexer, RevisionIndexer (later on OriginIndexer
    will also be available).

    Then you need to implement the following functions:

    - def filter(self, ids): filter out data already indexed
      (in storage). This function is used by the orchestrator and not
      directly by the indexer
      (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer).

    - def index_object(self, id, data): compute index on id with data
      (retrieved from the storage or the objstorage by the id key) and
      return the resulting index computation.

    - def persist_index_computations(self, results, policy_update):
      persist the results of multiple index computations in the
      storage.

    The new indexer implementation can also override the following
    functions:

    - def prepare(self): Configuration preparation for the indexer.
      When overriding, this must call the super().prepare() function.

    - def check(self): Configuration check for the indexer. When
      overriding, this must call the super().check() function.

    - def retrieve_tools_information(self): This should return a dict
      of the tool(s) to use when indexing or filtering.

    """
    CONFIG = 'indexer/base'

    DEFAULT_CONFIG = {
        'storage': ('dict', {
            'host': 'uffizi',
            'cls': 'remote',
            'args': {'root': '/tmp/softwareheritage/objects',
                     'slicing': '0:2/2:4/4:6'}
        }),
        # queue to reschedule if problem (none for no rescheduling,
        # the default)
        'rescheduling_task': ('str', None),
        'objstorage': ('dict', {
            'cls': 'multiplexer',
            'args': {
                'objstorages': [{
                    'cls': 'filtered',
                    'args': {
                        'storage_conf': {
                            'cls': 'azure-storage',
                            'args': {
                                'account_name': '0euwestswh',
                                'api_secret_key': 'secret',
                                'container_name': 'contents'
                            }
                        },
                        'filters_conf': [
                            {'type': 'readonly'},
                            {'type': 'prefix', 'prefix': '0'}
                        ]
                    }
                }, {
                    'cls': 'filtered',
                    'args': {
                        'storage_conf': {
                            'cls': 'azure-storage',
                            'args': {
                                'account_name': '1euwestswh',
                                'api_secret_key': 'secret',
                                'container_name': 'contents'
                            }
                        },
                        'filters_conf': [
                            {'type': 'readonly'},
                            {'type': 'prefix', 'prefix': '1'}
                        ]
                    }
                }]
            },
        }),
    }

    ADDITIONAL_CONFIG = {}

    def __init__(self):
        """Prepare and check that the indexer is ready to run.

        """
        super().__init__()
        self.prepare()
        self.check()

    def prepare(self):
        """Prepare the indexer's needed runtime configuration.
        Without this step, the indexer cannot possibly run.

        """
        self.config = self.parse_config_file(
            additional_configs=[self.ADDITIONAL_CONFIG])

        objstorage = self.config['objstorage']
        self.objstorage = get_objstorage(objstorage['cls'],
                                         objstorage['args'])

        storage = self.config['storage']
        self.storage = get_storage(storage['cls'], storage['args'])

        rescheduling_task = self.config['rescheduling_task']
        if rescheduling_task:
            self.rescheduling_task = get_task(rescheduling_task)
        else:
            self.rescheduling_task = None

        l = logging.getLogger('requests.packages.urllib3.connectionpool')
        l.setLevel(logging.WARN)

        self.log = logging.getLogger('swh.indexer')
        self.tools = self.retrieve_tools_information()

    def check(self):
        """Check that the indexer's configuration is ok before
        proceeding. If ok, does nothing. If not, raises an error.

        """
        if not self.tools:
            raise ValueError('Tools %s is unknown, cannot continue' %
                             self.config['tools'])

    def retrieve_tools_information(self):
        """Define how to retrieve tool information based on the
        configuration.

        Provides a sensible default which can be overridden if not
        sufficient. (For now, all indexers use only one tool)

        """
        tool = {
            'tool_%s' % key: value
            for key, value in self.config['tools'].items()
        }
        return self.storage.indexer_configuration_get(tool)

    @abc.abstractmethod
    def filter(self, ids):
        """Filter missing ids for that particular indexer.

        Args:
            ids ([bytes]): list of ids

        Yields:
            iterator of missing ids

        """
        pass

    @abc.abstractmethod
    def index(self, id, data):
        """Index computation for the id and associated raw data.

        Args:
            id (bytes): sha1 identifier
            data (bytes): id's data from storage or objstorage
            depending on object type

        Returns:
            a dict that makes sense for the
            persist_index_computations function.

        """
        pass

    @abc.abstractmethod
    def persist_index_computations(self, results, policy_update):
        """Persist the computation resulting from the index.

        Args:
            results ([result]): List of results. One result is the
            result of the index function.
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        Returns:
            None

        """
        pass

    def next_step(self, results):
        """Do something else with the computations' results (e.g. send
        to another queue, ...).
        (This is not an abstractmethod since it is optional).

        Args:
            results ([result]): List of results (dict) as returned by
            the index function.

        Returns:
            None

        """
        pass

    @abc.abstractmethod
    def run(self, ids, policy_update):
        """Given a list of ids:
        - retrieves the data from the storage
        - executes the indexing computations
        - stores the results (according to policy_update)

        Args:
            ids ([bytes]): id's identifier list
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        """
        pass
class ContentIndexer(BaseIndexer):
    """
-    An object type indexer, inherit from the BaseIndexer and
-    implement the process of indexation for Contents with the run method
+    An object type indexer, inherits from the BaseIndexer and
+    implements the process of indexation for Contents using the run
+    method

    Note: the ContentIndexer is not an instantiable object;
-    to use it in another context one should refer to the instructions in the
-    BaseIndexer
+    to use it in another context one should inherit from this class and
+    override the methods mentioned in the BaseIndexer class

    """
    def run(self, sha1s, policy_update):
        """Given a list of sha1s:
        - retrieve the content from the storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            sha1s ([bytes]): sha1's identifier list
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        """
        results = []
        try:
            for sha1 in sha1s:
                try:
                    raw_content = self.objstorage.get(sha1)
                except ObjNotFoundError:
                    self.log.warn('Content %s not found in objstorage' %
                                  hashutil.hash_to_hex(sha1))
                    continue
                res = self.index(sha1, raw_content)
                if res:  # If no results, skip it
                    results.append(res)

            self.persist_index_computations(results, policy_update)
            self.next_step(results)
        except Exception:
            self.log.exception(
                'Problem when reading contents metadata.')
            if self.rescheduling_task:
                self.log.warn('Rescheduling batch')
                self.rescheduling_task.delay(sha1s, policy_update)


class RevisionIndexer(BaseIndexer):
    """
-    An object type indexer, inherit from the BaseIndexer and
-    implement the process of indexation for Revisions with the run method
+    An object type indexer, inherits from the BaseIndexer and
+    implements the process of indexation for Revisions using the run
+    method

    Note: the RevisionIndexer is not an instantiable object;
-    to use it in another context one should refer to the instructions in the
-    BaseIndexer
+    to use it in another context one should inherit from this class and
+    override the methods mentioned in the BaseIndexer class

    """
    def run(self, sha1_gits, policy_update):
        """Given a list of sha1_gits:
        - retrieve revisions from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            sha1_gits ([bytes]): sha1_git's identifier list
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        """
        results = []
-        try:
-            for sha1_git in sha1_gits:
-                try:
-                    revs = self.storage.revision_get([sha1_git])
-                except ValueError:
-                    self.log.warn('Revision %s not found in storage' %
-                                  hashutil.hash_to_hex(sha1_git))
-                    continue
-                for rev in revs:
-                    if rev:  # If no revision, skip it
-                        res = self.index(rev)
-                        if res:  # If no results, skip it
-                            results.append(res)
-            self.persist_index_computations(results, policy_update)
-        except Exception:
-            self.log.exception(
-                'Problem when processing revision')
+        revs = self.storage.revision_get(sha1_gits)
+
+        for sha1_git, rev in zip(sha1_gits, revs):
+            if not rev:
+                self.log.warn('Revision %s not found in storage' %
+                              hashutil.hash_to_hex(sha1_git))
+                continue
+            try:
+                res = self.index(rev)
+                if res:  # If no results, skip it
+                    results.append(res)
+            except Exception:
+                self.log.exception(
+                    'Problem when processing revision')
+        self.persist_index_computations(results, policy_update)
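
To make the extension points documented in BaseIndexer concrete, here
is a minimal sketch of a ContentIndexer subclass; the
content_size_missing / content_size_add storage endpoints and the
length computation are hypothetical, chosen only to mirror the shape
of the real indexers:

    from swh.indexer.indexer import ContentIndexer

    class ContentSizeIndexer(ContentIndexer):
        """Toy indexer recording each content's length."""

        def filter(self, sha1s):
            # hypothetical endpoint; real indexers use e.g.
            # content_mimetype_missing
            yield from self.storage.content_size_missing((
                {'id': sha1,
                 'indexer_configuration_id': self.tools['id']}
                for sha1 in sha1s
            ))

        def index(self, sha1, raw_content):
            return {
                'id': sha1,
                'indexer_configuration_id': self.tools['id'],
                'length': len(raw_content),
            }

        def persist_index_computations(self, results, policy_update):
            # hypothetical endpoint
            self.storage.content_size_add(
                results,
                conflict_update=(policy_update == 'update-dups'))
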
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 2491c0d..d5038cd 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,262 +1,260 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.indexer.indexer import ContentIndexer, RevisionIndexer
from swh.indexer.metadata_dictionary import compute_metadata
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.model import hashutil


class ContentMetadataIndexer(ContentIndexer):
    """Indexer at content level in charge of:
    - filtering out content already indexed in content_metadata
    - reading content from objstorage with the content's id sha1
    - computing translated_metadata by given context
    - using the metadata_dictionary as the 'swh-metadata-translator'
      tool
    - storing the result in the content_metadata table

    """
    CONFIG_BASE_FILENAME = 'indexer/metadata'

    def __init__(self, tool):
        self.tool = tool
        super().__init__()

    def prepare(self):
        super().prepare()
        self.results = []

    def retrieve_tools_information(self):
        self.config['tools'] = self.tool
        return super().retrieve_tools_information()

    def filter(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_metadata_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tools['id'],
            } for sha1 in sha1s
        ))

    def index(self, sha1, raw_content):
        """Index sha1's content and store the result.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            result (dict): representing a content_metadata; if the
            translation wasn't successful the translated_metadata key
            is kept as None

        """
        result = {
            'id': sha1,
            'indexer_configuration_id': self.tools['id'],
            'translated_metadata': None
        }
        try:
            context = self.tools['tool_configuration']['context']
            result['translated_metadata'] = compute_metadata(
                context, raw_content)
            # keep the result on the indexer object so get_results can
            # return it
            self.results.append(result)
        except Exception:
            self.log.exception(
                "Problem during tool retrieval of metadata translation")
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - translated_metadata (jsonb): detected metadata
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        """
        self.storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def get_results(self):
        """Can only be called after the run method has been called.

        Returns:
            results (list): list of content_metadata entries computed
            by the current indexer

        """
        return self.results
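
# Illustration (not part of this patch): translate_revision_metadata
# below drives this content-level indexer directly when a content has
# no stored metadata yet, roughly as follows:
#
#     c_metadata_indexer = ContentMetadataIndexer(tool)
#     c_metadata_indexer.run([sha1], policy_update='ignore-dups')
#     local_metadata = c_metadata_indexer.get_results()
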
class RevisionMetadataIndexer(RevisionIndexer):
    """Indexer at Revision level in charge of:
    - filtering revisions already indexed in the revision_metadata
      table with the defined computation tool
    - retrieving all entry_files in the root directory
    - using metadata_detector for file_names containing metadata
    - computing the metadata translation if necessary and possible
      (depends on tool)
    - sending sha1s to content indexing if possible
    - storing the results for the revision

    """
    CONFIG_BASE_FILENAME = 'indexer/metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
            'version': '0.0.1',
            'configuration': {
                'type': 'local',
                'context': ['npm', 'codemeta']
            },
        }),
    }

    def prepare(self):
        super().prepare()

    def filter(self, sha1_gits):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tools['id'],
            } for sha1_git in sha1_gits
        ))

    def index(self, rev):
        """Index rev by processing it and organizing the result.

        Use metadata_detector to iterate on filenames:
        - if one filename is detected -> send the file to the content
          indexer
        - if multiple files are detected -> translation is needed at
          revision level

        Args:
            rev (dict): revision artifact from storage

        Returns:
            A dict, representing a revision_metadata, with keys:
            - id (bytes): rev's identifier (sha1_git)
            - indexer_configuration_id (bytes): tool used
            - translated_metadata (bytes): dict of retrieved metadata

        """
        try:
            result = {
                'id': rev['id'],
                'indexer_configuration_id': self.tools['id'],
                'translated_metadata': None
            }

            root_dir = rev['directory']
            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = (entry for entry in dir_ls if entry['type'] == 'file')
            detected_files = detect_metadata(files)
            result['translated_metadata'] = self.translate_revision_metadata(
                detected_files)
        except Exception:
            self.log.exception(
                'Problem when indexing rev')
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata, dict with the
            following keys:
            - id (bytes): revision's identifier (sha1_git)
            - translated_metadata (jsonb): detected metadata
            policy_update ([str]): either 'update-dups' or
            'ignore-dups' to respectively update duplicates or ignore
            them

        """
        # TODO: add functions in storage to keep data in
        # revision_metadata
-        # self.storage.reivision_metadata_add(
-        #     results, conflict_update=(policy_update == 'update-dups'))
-        pass
+        self.storage.revision_metadata_add(
+            results, conflict_update=(policy_update == 'update-dups'))

    def translate_revision_metadata(self, detected_files):
        """Determine the plan of action for translating metadata, given
        one or multiple detected files:

        Args:
            - detected_files: dict with context name and list of sha1s
              (e.g: {'npm': [sha1_1, sha1_2], 'authors': sha1_3})

        Returns:
            - translated_metadata: dict with the CodeMeta vocabulary

        """
        translated_metadata = []
        tool = {
            'name': 'swh-metadata-translator',
            'version': '0.0.1',
            'configuration': {
                'type': 'local',
                'context': None
            },
        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        for context in detected_files.keys():
            tool['configuration']['context'] = context
            for sha1 in detected_files[context]:
                local_metadata = {}
                # fetch content_metadata from storage
                metadata_generator = self.storage.content_metadata_get(
                    [sha1])
                metadata_generated = False
                for c in metadata_generator:
-                    # print(c)
                    metadata_generated = True
                    # extracting translated_metadata
                    local_metadata = c['translated_metadata']

                if not metadata_generated:
                    # schedule indexation of content
                    try:
                        c_metadata_indexer = ContentMetadataIndexer(tool)
                        c_metadata_indexer.run([sha1],
                                               policy_update='ignore-dups')
                        local_metadata = c_metadata_indexer.get_results()
                    except Exception:
                        self.log.exception(
                            'indexing content %s with '
                            'ContentMetadataIndexer raised an exception' %
                            hashutil.hash_to_hex(sha1))

                # local metadata is aggregated
                if local_metadata:
                    translated_metadata.append(local_metadata)

        # transform translated_metadata into a minimal set with
        # swh-metadata-detector
        min_metadata = extract_minimal_metadata_dict(translated_metadata)
        return min_metadata


def main():
    rev_metadata_indexer = RevisionMetadataIndexer()
    sha1_git1 = hashutil.hash_to_bytes(
        '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
    sha1_git2 = hashutil.hash_to_bytes(
        '026040ea79dec1b49b4e3e7beda9132b6b26b51b')
    sha1_git3 = hashutil.hash_to_bytes(
        '9699072e21eded4be8d45e3b8d543952533fa190')

    sha1_gits = [sha1_git1, sha1_git2, sha1_git3]
    rev_metadata_indexer.run(sha1_gits, 'update-dups')


if __name__ == '__main__':
    main()
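
persist_index_computations above now calls
self.storage.revision_metadata_add, which the TODO notes is not yet
implemented on the storage side. As a reference for what that endpoint
is expected to do, here is an in-memory sketch mirroring the
MockStorage used by the tests; the names and semantics are assumptions
until the real storage functions land:

    class InMemoryRevisionMetadataStore:
        """Sketch of the revision_metadata endpoints' expected
        behaviour (assumed, not the real backend)."""
        def __init__(self):
            self._rows = {}  # id -> revision_metadata row

        def revision_metadata_add(self, metadata, conflict_update=False):
            for row in metadata:
                if conflict_update or row['id'] not in self._rows:
                    self._rows[row['id']] = row

        def revision_metadata_missing(self, rows):
            for row in rows:
                if row['id'] not in self._rows:
                    yield row['id']
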
- """ - def content_metadata_get(self, sha1s): - yield - - def content_metadata_add(self, metadata, conflict_update=None): - self.state = metadata - self.conflict_update = conflict_update - - def revision_metadata_add(self, metadata, conflict_update=None): - self.state = metadata - self.conflict_update = conflict_update - - def indexer_configuration_get(self, tool): - if tool['tool_name'] == 'swh-metadata-translator': - return { - 'id': 30, - 'tool_name': 'swh-metadata-translator', - 'tool_version': '0.0.1', - 'tool_configuration': { - 'type': 'local', - 'context': 'npm' - }, - } - elif tool['tool_name'] == 'swh-metadata-detector': - return { - 'id': 7, - 'tool_name': 'swh-metadata-detector', - 'tool_version': '0.0.1', - 'tool_configuration': { - 'type': 'local', - 'context': 'npm' - }, - } +from swh.indexer.tests.test_utils import MockStorage class TestContentMetadataIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'rescheduling_task': None, } self.storage = MockStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.retrieve_tools_information() self.results = [] class TestRevisionMetadataIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'rescheduling_task': None, + 'tools': { + 'name': 'swh-metadata-detector', + 'version': '0.0.1', + 'configuration': { + 'type': 'local', + 'context': 'npm' + } + } } self.storage = MockStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.retrieve_tools_information() self.results = [] class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.content_tool = { 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': 'npm' } } - self.revision_tool = { - 'name': 'swh-metadata-detector', - 'version': '0.0.1', - 'configuration': { - 'type': 'local', - 'context': 'npm' - } - } @istest def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" context = "npm" # None if no metadata was found or an error occurred declared_metadata = None # when result = compute_metadata(context, content) # then self.assertEqual(declared_metadata, result) @istest def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """ declared_metadata = { 'name': 'test_metadata', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} } # when result = compute_metadata("npm", content) # then self.assertEqual(declared_metadata, result) @istest def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the 
translated metadata """ # given sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5', 'd4c647f0fc257591cc9ba1722484229780d1c607', '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = TestContentMetadataIndexer(self.content_tool) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = metadata_indexer.storage.state expected_results = [{ 'indexer_configuration_id': 30, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { 'softwareRequirements': { 'JSONStream': '~1.3.1', 'abbrev': '~1.1.0', 'ansi-regex': '~2.1.1', 'ansicolors': '~0.3.2', 'ansistyles': '~0.1.3' }, 'issueTracker': { 'url': 'https://github.com/npm/npm/issues' }, 'author': 'Isaac Z. Schlueter (http://blog.izs.me)', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/npm/npm' }, 'description': 'a package manager for JavaScript', 'softwareSuggestions': { 'tacks': '~1.2.6', 'tap': '~10.3.2' }, 'license': 'Artistic-2.0', 'version': '5.0.3', 'other': { 'preferGlobal': True, 'config': { 'publishtest': False } }, 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { 'indexer_configuration_id': 30, 'translated_metadata': None, 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' }] # The assertion bellow returns False sometimes because of nested lists self.assertEqual(expected_results, results) + + @istest + def test_detect_metadata_package_json(self): + # given + df = [{ + 'sha1_git': b'abc', + 'name': b'index.js', + 'target': b'abc', + 'length': 897, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'dir_a', + 'sha1': b'bcd' + }, + { + 'sha1_git': b'aab', + 'name': b'package.json', + 'target': b'aab', + 'length': 712, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'dir_a', + 'sha1': b'cde' + }] + # when + results = detect_metadata(df) + + expected_results = { + 'npm': [ + b'cde' + ] + } + # then + self.assertEqual(expected_results, results) + + @istest + def test_revision_metadata_indexer(self): + metadata_indexer = TestRevisionMetadataIndexer() + + sha1_gits = [ + b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', + ] + metadata_indexer.run(sha1_gits, 'update-dups') + + results = metadata_indexer.storage.state + + expected_results = [{ + 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', + 'translated_metadata': { + 'identifier': None, + 'maintainer': None, + 'url': [ + 'https://github.com/librariesio/yarn-parser#readme' + ], + 'author': ['Andrew Nesbitt'], + 'license': ['AGPL-3.0'], + 'version': ['1.0.0'], + 'description': [ + 'Tiny web service for parsing yarn.lock files' + ], + 'relatedLink': None, + 'developmentStatus': None, + 'operatingSystem': None, + 'issueTracker': [{ + 'url': 'https://github.com/librariesio/yarn-parser/issues' + }], + 'softwareRequirements': [{ + 'express': '^4.14.0', + 'yarn': '^0.21.0', + 'body-parser': '^1.15.2' + }], + 'name': ['yarn-parser'], + 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], + 'type': None, + 'email': None + }, + 'indexer_configuration_id': 7 + }] + # then + self.assertEqual(expected_results, results) 
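
The comment in test_index_content_metadata_npm notes that the equality
assertion sometimes fails because of ordering in nested lists. One
possible way to make the comparison deterministic, sketched here
rather than applied to the tests above, is to normalize both sides
before asserting:

    def normalize(results):
        # sort result rows by id so list ordering cannot break the
        # comparison
        return sorted(results, key=lambda r: r['id'])

    # in the test:
    #     self.assertEqual(normalize(expected_results),
    #                      normalize(results))
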
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index aea186a..b6b340f 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,113 +1,246 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.objstorage.exc import ObjNotFoundError


class MockStorageWrongConfiguration():
    def indexer_configuration_get(self, tool):
        return None


class MockObjStorage():
    """Mock objstorage with predefined contents.

    """
    def __init__(self):
        self.data = {
            '01c9379dfc33803963d07c1ccc748d3fe4c96bb50':
                b'this is some text',
            '688a5ef812c53907562fe379d4b3851e69c7cb15':
                b'another text',
            '8986af901dd2043044ce8f0d8fc039153641cf17':
                b'yet another text',
            '02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
                import unittest
                import logging
                from nose.tools import istest
                from swh.indexer.mimetype import ContentMimetypeIndexer
                from swh.indexer.tests.test_utils import MockObjStorage

                class MockStorage():
                    def content_mimetype_add(self, mimetypes):
                        self.state = mimetypes
                        self.conflict_update = conflict_update

                    def indexer_configuration_get(self, tool):
                        return {
                            'id': 10,
                        }
                """,
            '103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
                #ifndef __AVL__
                #define __AVL__

                typedef struct _avl_tree avl_tree;

                typedef struct _data_t {
                  int content;
                } data_t;
                """,
            '93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
                (should 'pygments (recognize 'lisp 'easily))
                """,
            '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
                {
                    "name": "test_metadata",
                    "version": "0.0.1",
                    "description": "Simple package.json test for indexer",
                    "repository": {
                        "type": "git",
                        "url": "https://github.com/moranegg/metadata_test"
                    }
                }
                """,
            'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
                {
                    "version": "5.0.3",
                    "name": "npm",
                    "description": "a package manager for JavaScript",
                    "keywords": [
                        "install",
                        "modules",
                        "package manager",
                        "package.json"
                    ],
                    "preferGlobal": true,
                    "config": {
                        "publishtest": false
                    },
                    "homepage": "https://docs.npmjs.com/",
                    "author": "Isaac Z. Schlueter (http://blog.izs.me)",
                    "repository": {
                        "type": "git",
                        "url": "https://github.com/npm/npm"
                    },
                    "bugs": {
                        "url": "https://github.com/npm/npm/issues"
                    },
                    "dependencies": {
                        "JSONStream": "~1.3.1",
                        "abbrev": "~1.1.0",
                        "ansi-regex": "~2.1.1",
                        "ansicolors": "~0.3.2",
                        "ansistyles": "~0.1.3"
                    },
                    "devDependencies": {
                        "tacks": "~1.2.6",
                        "tap": "~10.3.2"
                    },
                    "license": "Artistic-2.0"
                }
                """,
            'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
                """
        }

    def get(self, sha1):
        raw_content = self.data.get(sha1)
        if not raw_content:
            raise ObjNotFoundError()
        return raw_content
+
+
+class MockStorage():
+    """Mock storage to simplify reading indexers' outputs.
+    """
+    def content_metadata_add(self, metadata, conflict_update=None):
+        self.state = metadata
+        self.conflict_update = conflict_update
+
+    def revision_metadata_add(self, metadata, conflict_update=None):
+        self.state = metadata
+        self.conflict_update = conflict_update
+
+    def indexer_configuration_get(self, tool):
+        if tool['tool_name'] == 'swh-metadata-translator':
+            return {
+                'id': 30,
+                'tool_name': 'swh-metadata-translator',
+                'tool_version': '0.0.1',
+                'tool_configuration': {
+                    'type': 'local',
+                    'context': 'npm'
+                },
+            }
+        elif tool['tool_name'] == 'swh-metadata-detector':
+            return {
+                'id': 7,
+                'tool_name': 'swh-metadata-detector',
+                'tool_version': '0.0.1',
+                'tool_configuration': {
+                    'type': 'local',
+                    'context': 'npm'
+                },
+            }
+
+    def revision_get(self, revisions):
+        return [{
+            'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+            'committer': {
+                'id': 26,
+                'name': b'Andrew Nesbitt',
+                'fullname': b'Andrew Nesbitt ',
+                'email': b'andrewnez@gmail.com'
+            },
+            'synthetic': False,
+            'date': {
+                'negative_utc': False,
+                'timestamp': {
+                    'seconds': 1487596456,
+                    'microseconds': 0
+                },
+                'offset': 0
+            },
+            'directory': b'10'
+        }]
+
+    def directory_ls(self, directory, recursive=False, cur=None):
+        # with directory: b'\x9d',
+        return [{
+            'sha1_git': b'abc',
+            'name': b'index.js',
+            'target': b'abc',
+            'length': 897,
+            'status': 'visible',
+            'type': 'file',
+            'perms': 33188,
+            'dir_id': b'10',
+            'sha1': b'bcd'
+        },
+            {
+            'sha1_git': b'aab',
+            'name': b'package.json',
+            'target': b'aab',
+            'length': 712,
+            'status': 'visible',
+            'type': 'file',
+            'perms': 33188,
+            'dir_id': b'10',
+            'sha1': b'cde'
+        },
+            {
+            'dir_id': b'10',
+            'target': b'11',
+            'type': 'dir',
+            'length': None,
+            'name': b'.github',
+            'sha1': None,
+            'perms': 16384,
+            'sha1_git': None,
+            'status': None,
+            'sha256': None
+        }]
+
+    def content_metadata_get(self, sha1s):
+        return [{
+            'tool': {
+                'configuration': {
+                    'type': 'local',
+                    'context': 'npm'
+                },
+                'version': '0.0.1',
+                'id': 6,
+                'name': 'swh-metadata-translator'
+            },
+            'id': b'cde',
+            'translated_metadata': {
+                'issueTracker': {
+                    'url': 'https://github.com/librariesio/yarn-parser/issues'
+                },
+                'version': '1.0.0',
+                'name': 'yarn-parser',
+                'author': 'Andrew Nesbitt',
+                'url': 'https://github.com/librariesio/yarn-parser#readme',
+                'processorRequirements': {'node': '7.5'},
+                'other': {
+                    'scripts': {
+                        'start': 'node index.js'
+                    },
+                    'main': 'index.js'
+                },
+                'license': 'AGPL-3.0',
+                'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'git+https://github.com/librariesio/yarn-parser.git'
+                },
+                'description': 'Tiny web service for parsing yarn.lock files',
+                'softwareRequirements': {
+                    'yarn': '^0.21.0',
+                    'express': '^4.14.0',
+                    'body-parser': '^1.15.2'}
+            }
+        }]