diff --git a/README b/README
--- a/README
+++ b/README
@@ -2,11 +2,14 @@
 ===========
 
 Tools to compute multiple indexes on SWH's raw contents:
-- mimetype
-- ctags
-- language
-- fossology-license
-
+- content:
+  - mimetype
+  - ctags
+  - language
+  - fossology-license
+  - metadata
+- revision:
+  - metadata
 
 # Context
@@ -50,13 +53,13 @@
 ## Indexers
 
 Indexers:
-- receive batch of sha1
-- retrieve the associated content from the blob storage
-- compute for that content some index
+- receive a batch of ids
+- retrieve the associated data depending on the object type
+- compute some index for that object
 - store the result to swh's storage
 - (and possibly do some broadcast itself)
 
-Current indexers:
+Current content indexers:
 
 - mimetype (queue swh_indexer_content_mimetype): compute the mimetype,
   filter out the textual contents and broadcast the list to the
@@ -69,3 +72,11 @@
 
 - fossology-license (queue swh_indexer_fossology_license): try and
   compute the license
+
+- metadata: translate a file into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detect files containing metadata and retrieve their
+  translated_metadata from the content_metadata table in storage, or run
+  the content indexer to translate the files first.
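Usage sketch (an editor's aside, not part of the patch): the pipeline the README describes is what each content indexer's run method implements end to end. Driving one of the existing indexers by hand could look like the following, assuming a working indexer configuration so the indexer can reach its storage and objstorage; the sha1 is just an example value taken from the tests.

    from swh.model import hashutil
    from swh.indexer.mimetype import ContentMimetypeIndexer

    # one content identifier (sha1, as bytes)
    sha1s = [hashutil.hash_to_bytes(
        '02fb2c89e14f7fab46701478c83779c7beb7b069')]

    indexer = ContentMimetypeIndexer()
    # compute the mimetype index for the batch and store it,
    # updating duplicates
    indexer.run(sha1s, policy_update='update-dups')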
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -10,7 +10,7 @@
 from swh.model import hashutil
 
 from .language import compute_language
-from .indexer import BaseIndexer, DiskIndexer
+from .indexer import ContentIndexer, DiskIndexer
 
 
 # Options used to compute tags
@@ -54,7 +54,7 @@
 }
 
 
-class CtagsIndexer(BaseIndexer, DiskIndexer):
+class CtagsIndexer(ContentIndexer, DiskIndexer):
     CONFIG_BASE_FILENAME = 'indexer/ctags'
 
     ADDITIONAL_CONFIG = {
@@ -80,7 +80,7 @@
         self.working_directory = self.config['workdir']
         self.language_map = self.config['languages']
 
-    def filter_contents(self, sha1s):
+    def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
@@ -91,7 +91,7 @@
             } for sha1 in sha1s
         ))
 
-    def index_content(self, sha1, raw_content):
+    def index(self, sha1, raw_content):
         """Index sha1s' content and store result.
 
         Args:
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -8,7 +8,7 @@
 
 from swh.model import hashutil
 
-from .indexer import BaseIndexer, DiskIndexer
+from .indexer import ContentIndexer, DiskIndexer
 
 
 def compute_license(path, log=None):
@@ -46,7 +46,7 @@
 }
 
 
-class ContentFossologyLicenseIndexer(BaseIndexer, DiskIndexer):
+class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
@@ -71,7 +71,7 @@
         super().prepare()
         self.working_directory = self.config['workdir']
 
-    def filter_contents(self, sha1s):
+    def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
@@ -82,7 +82,7 @@
             } for sha1 in sha1s
         ))
 
-    def index_content(self, sha1, raw_content):
+    def index(self, sha1, raw_content):
         """Index sha1s' content and store result.
 
         Args:
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -69,24 +69,34 @@
     """Base class for indexers to inherit from.
 
     The main entry point is the `run` functions which is in charge to
-    trigger the computations on the sha1s batch received.
+    trigger the computations on the ids batch received.
 
     Indexers can:
-    - filter out sha1 whose data has already been indexed.
-    - retrieve sha1's content from objstorage, index this content then
-      store the result in storage.
+    - filter out ids whose data has already been indexed.
+    - retrieve ids' data from storage or objstorage
+    - index this data depending on the object and store the result in storage.
 
-    To implement a new index, inherit from this class and implement
-    the following functions:
+    To implement a new object type indexer, inherit from BaseIndexer and
+    implement the indexing process:
 
-    - def filter_contents(self, sha1s): filter out data already
+    - def run(self, object_ids, policy_update): object_ids are different
+      depending on the object type. For example: sha1 for content, sha1_git
+      for revision, directory and release, and id for origin
+
+    To implement a new concrete indexer, inherit from the object level
+    classes: ContentIndexer, RevisionIndexer
+    (later on OriginIndexer will also be available)
+
+    Then you need to implement the following functions:
+
+    - def filter(self, ids): filter out data already
       indexed (in storage). This function is used by the
       orchestrator and not directly by the indexer
       (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer).
 
-    - def index_content(self, sha1, raw_content): compute index on
-      sha1 with data raw_content (retrieved in the objstorage by the
-      sha1 key) and return the resulting index computation.
+    - def index(self, id, data): compute the index on
+      id with data (retrieved from the storage or the objstorage by the
+      id key) and return the resulting index computation.
 
     - def persist_index_computations(self, results, policy_update):
       persist the results of multiple index computations in the
@@ -212,25 +222,26 @@
         return self.storage.indexer_configuration_get(tool)
 
     @abc.abstractmethod
-    def filter_contents(self, sha1s):
-        """Filter missing sha1 for that particular indexer.
+    def filter(self, ids):
+        """Filter missing ids for that particular indexer.
 
         Args:
-            sha1s ([bytes]): list of contents' sha1
+            ids ([bytes]): list of ids
 
         Yields:
-            iterator of missing sha1
+            iterator of missing ids
 
         """
         pass
 
     @abc.abstractmethod
-    def index_content(self, sha1, content):
-        """Index computation for the sha1 and associated raw content.
+    def index(self, id, data):
+        """Index computation for the id and associated raw data.
 
         Args:
-            sha1 (bytes): sha1 identifier
-            content (bytes): sha1's raw content
+            id (bytes): identifier
+            data (bytes): id's data from storage or objstorage depending on
+              the object type
 
         Returns:
             a dict that makes sense for the persist_index_computations
@@ -245,7 +256,7 @@
 
         Args:
             results ([result]): List of results. One result is the
-                result of the index_content function.
+                result of the index function.
 
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them
 
@@ -263,7 +274,7 @@
 
         Args:
             results ([result]): List of results (dict) as returned
-                by index_content function.
+                by the index function.
 
         Returns:
             None
 
@@ -271,6 +282,32 @@
         """
         pass
 
+    @abc.abstractmethod
+    def run(self, ids, policy_update):
+        """Given a list of ids:
+        - retrieves the data from the storage
+        - executes the indexing computations
+        - stores the results (according to policy_update)
+
+        Args:
+            ids ([bytes]): identifier list
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+              respectively update duplicates or ignore them
+
+        """
+        pass
+
+
+class ContentIndexer(BaseIndexer):
+    """An object type indexer; inherits from BaseIndexer and implements
+    the indexing process for contents through its run method.
+
+    Note: ContentIndexer is not meant to be instantiated directly. To use
+    it in another context, inherit from this class and override the
+    methods mentioned in the BaseIndexer class.
+    """
+
     def run(self, sha1s, policy_update):
         """Given a list of sha1s:
         - retrieve the content from the storage
         - execute the indexing computations
         - store the results (according to policy_update)
@@ -292,7 +329,7 @@
                 self.log.warn('Content %s not found in objstorage' %
                               hashutil.hash_to_hex(sha1))
                 continue
-            res = self.index_content(sha1, raw_content)
+            res = self.index(sha1, raw_content)
             if res:  # If no results, skip it
                 results.append(res)
 
@@ -304,3 +341,43 @@
         if self.rescheduling_task:
             self.log.warn('Rescheduling batch')
             self.rescheduling_task.delay(sha1s, policy_update)
+
+
+class RevisionIndexer(BaseIndexer):
+    """An object type indexer; inherits from BaseIndexer and implements
+    the indexing process for revisions through its run method.
+
+    Note: RevisionIndexer is not meant to be instantiated directly. To use
+    it in another context, inherit from this class and override the
+    methods mentioned in the BaseIndexer class.
+    """
+
+    def run(self, sha1_gits, policy_update):
+        """Given a list of sha1_gits:
+        - retrieve revisions from storage
+        - execute the indexing computations
+        - store the results (according to policy_update)
+
+        Args:
+            sha1_gits ([bytes]): list of sha1_git identifiers
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+              respectively update duplicates or ignore them
+
+        """
+        results = []
+        revs = self.storage.revision_get(sha1_gits)
+
+        for rev in revs:
+            if not rev:
+                self.log.warn('Revisions %s not found in storage' %
+                              list(map(hashutil.hash_to_hex, sha1_gits)))
+                continue
+            try:
+                res = self.index(rev)
+                if res:  # If no results, skip it
+                    results.append(res)
+            except Exception:
+                self.log.exception(
+                    'Problem when processing revision')
+        self.persist_index_computations(results, policy_update)
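Illustrative sketch (not part of the patch): a hypothetical minimal content indexer written against the interface above. The class name, the computed "index" and the configuration file name are made up for the example, and persist_index_computations only logs instead of writing to a real storage endpoint; a real indexer would also declare its ADDITIONAL_CONFIG and tools as the other indexers in this diff do.

    from swh.indexer.indexer import ContentIndexer

    class ContentSizeIndexer(ContentIndexer):
        """Toy indexer: records the length of each content."""
        CONFIG_BASE_FILENAME = 'indexer/size'   # hypothetical config file

        def filter(self, ids):
            # Normally: query storage for the ids not yet indexed.
            yield from ids

        def index(self, id, data):
            # Compute this object's "index": here, just its size in bytes.
            return {'id': id, 'length': len(data)}

        def persist_index_computations(self, results, policy_update):
            # A real indexer would write to a storage endpoint here.
            self.log.info('would persist %s results (%s)',
                          len(results), policy_update)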
diff --git a/swh/indexer/language.py b/swh/indexer/language.py
--- a/swh/indexer/language.py
+++ b/swh/indexer/language.py
@@ -10,7 +10,7 @@
 from pygments.util import ClassNotFound
 from chardet.universaldetector import UniversalDetector
 
-from .indexer import BaseIndexer
+from .indexer import ContentIndexer
 
 
 def _cleanup_classname(classname):
@@ -107,7 +107,7 @@
 }
 
 
-class ContentLanguageIndexer(BaseIndexer):
+class ContentLanguageIndexer(ContentIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
@@ -134,7 +134,7 @@
         c = self.config
         self.max_content_size = c['tools']['configuration']['max_content_size']
 
-    def filter_contents(self, sha1s):
+    def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
@@ -145,7 +145,7 @@
             } for sha1 in sha1s
         ))
 
-    def index_content(self, sha1, raw_content):
+    def index(self, sha1, raw_content):
         """Index sha1s' content and store result.
 
         Args:
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -2,36 +2,46 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import click
 
-from .indexer import BaseIndexer
+from swh.indexer.indexer import ContentIndexer, RevisionIndexer
 from swh.indexer.metadata_dictionary import compute_metadata
+from swh.indexer.metadata_detector import detect_metadata
+from swh.indexer.metadata_detector import extract_minimal_metadata_dict
+from swh.model import hashutil
 
 
-class ContentMetadataIndexer(BaseIndexer):
-    """Indexer in charge of:
-    - filtering out content already indexed
+class ContentMetadataIndexer(ContentIndexer):
+    """Indexer at content level in charge of:
+    - filtering out content already indexed in content_metadata
     - reading content from objstorage with the content's id sha1
     - computing translated_metadata by given context
-    - using the MetadataDict and a tool for each context
-    - store result in storage
+    - using the metadata_dictionary as the 'swh-metadata-translator' tool
+    - storing the result in the content_metadata table
 
     """
     CONFIG_BASE_FILENAME = 'indexer/metadata'
 
-    ADDITIONAL_CONFIG = {
-        'tools': ('dict', {
-            'name': 'swh-metadata-translator',
-            'version': '0.0.1',
-            'configuration': {
-                'type': 'local',
-                'context': 'npm'
-            },
-        }),
-    }
+    def __init__(self, tool, config):
+        self.tool = tool
+        # reuse the exact same configuration as the RevisionMetadataIndexer
+        # that internally drives this ContentMetadataIndexer
+        self.new_config = config
+        super().__init__()
 
     def prepare(self):
         super().prepare()
+        self.results = []
+        if self.new_config['storage']:
+            self.storage = self.new_config['storage']
+        if self.new_config['objstorage']:
+            self.objstorage = self.new_config['objstorage']
+
+    def retrieve_tools_information(self):
+        self.config['tools'] = self.tool
+        return super().retrieve_tools_information()
 
-    def filter_contents(self, sha1s):
+    def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
         yield from self.storage.content_metadata_missing((
@@ -41,7 +51,7 @@
             } for sha1 in sha1s
         ))
 
-    def index_content(self, sha1, raw_content):
+    def index(self, sha1, raw_content):
         """Index sha1s' content and store result.
 
         Args:
@@ -60,9 +70,11 @@
             'translated_metadata': None
         }
         try:
-            context = self.tools['configuration']['context']
+            context = self.tools['tool_configuration']['context']
             result['translated_metadata'] = compute_metadata(
                 context, raw_content)
+            # keep the result on the indexer object for get_results()
+            self.results.append(result)
         except:
             self.log.exception(
                 "Problem during tool retrieval of metadata translation")
@@ -82,3 +94,188 @@
         """
         self.storage.content_metadata_add(
             results, conflict_update=(policy_update == 'update-dups'))
+
+    def get_results(self):
+        """Return the results computed by this indexer.
+
+        Can only be called after the run method has been called.
+
+        Returns:
+            results (list): list of content_metadata entries computed
+            by the current indexer
+        """
+        return self.results
+
+
+class RevisionMetadataIndexer(RevisionIndexer):
+    """Indexer at revision level in charge of:
+    - filtering out revisions already indexed in the revision_metadata
+      table with the defined computation tool
+    - retrieving all entry_files in the root directory
+    - using metadata_detector for file_names containing metadata
+    - computing metadata translation if necessary and possible
+      (depending on the tool)
+    - sending sha1s to content indexing if possible
+    - storing the results for the revision
+
+    """
+    CONFIG_BASE_FILENAME = 'indexer/metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('dict', {
+            'name': 'swh-metadata-detector',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'local',
+                'context': ['npm', 'codemeta']
+            },
+        }),
+    }
+
+    def prepare(self):
+        super().prepare()
+
+    def filter(self, sha1_gits):
+        """Filter out known sha1_gits and return only missing ones.
+
+        """
+        yield from self.storage.revision_metadata_missing((
+            {
+                'id': sha1_git,
+                'indexer_configuration_id': self.tools['id'],
+            } for sha1_git in sha1_gits
+        ))
+
+    def index(self, rev):
+        """Index rev by processing it and organizing the result.
+
+        Uses metadata_detector to iterate on filenames:
+        - if one filename is detected -> sends the file to the content
+          indexer
+        - if multiple filenames are detected -> translation is needed at
+          the revision level
+
+        Args:
+            rev (dict): revision artifact from storage
+
+        Returns:
+            A dict, representing a revision_metadata, with keys:
+            - id (bytes): rev's identifier (sha1_git)
+            - indexer_configuration_id (int): tool used
+            - translated_metadata (dict): dict of retrieved metadata
+
+        """
+        try:
+            result = {
+                'id': rev['id'],
+                'indexer_configuration_id': self.tools['id'],
+                'translated_metadata': None
+            }
+
+            root_dir = rev['directory']
+            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
+            files = (entry for entry in dir_ls if entry['type'] == 'file')
+            detected_files = detect_metadata(files)
+            result['translated_metadata'] = self.translate_revision_metadata(
+                detected_files)
+        except Exception:
+            self.log.exception(
+                'Problem when indexing rev')
+        return result
+
+    def persist_index_computations(self, results, policy_update):
+        """Persist the results in storage.
+
+        Args:
+            results ([dict]): list of revision_metadata dicts with the
+            following keys:
+            - id (bytes): revision's identifier (sha1_git)
+            - indexer_configuration_id (int): tool used
+            - translated_metadata (dict): metadata translated into the
+              CodeMeta vocabulary
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+              respectively update duplicates or ignore them
+
+        """
+        # TODO: add functions in storage to keep data in revision_metadata
+        self.storage.revision_metadata_add(
+            results, conflict_update=(policy_update == 'update-dups'))
+
+    def translate_revision_metadata(self, detected_files):
+        """Determine the plan of action to translate metadata when one or
+        multiple detected files are found.
+
+        Args:
+            detected_files (dict): dict mapping context names to lists of
+            sha1s (e.g. {'npm': [sha1_1, sha1_2],
+                         'authors': [sha1_3]})
+
+        Returns:
+            translated_metadata (dict): dict using the CodeMeta vocabulary
+
+        """
+        translated_metadata = []
+        tool = {
+            'name': 'swh-metadata-translator',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'local',
+                'context': None
+            },
+        }
+        # TODO: iterate on each context, on each file
+        # -> get raw_contents
+        # -> translate each content
+        config = {
+            'storage': self.storage,
+            'objstorage': self.objstorage
+        }
+        for context in detected_files.keys():
+            tool['configuration']['context'] = context
+            c_metadata_indexer = ContentMetadataIndexer(tool, config)
+            # sha1s that are already in the content_metadata table
+            sha1s_in_storage = []
+            metadata_generator = self.storage.content_metadata_get(
+                detected_files[context])
+            for c in metadata_generator:
+                # extracting translated_metadata
+                sha1 = c['id']
+                sha1s_in_storage.append(sha1)
+                local_metadata = c['translated_metadata']
+                # local metadata is aggregated
+                if local_metadata:
+                    translated_metadata.append(local_metadata)
+
+            sha1s_filtered = [item for item in detected_files[context]
+                              if item not in sha1s_in_storage]
+
+            if sha1s_filtered:
+                self.log.debug('Contents to index: %s' % sha1s_filtered)
+                # schedule indexing of the missing contents
+                try:
+                    c_metadata_indexer.run(sha1s_filtered,
+                                           policy_update='ignore-dups')
+                    # on-the-fly possibility:
+                    results = c_metadata_indexer.get_results()
+
+                    for result in results:
+                        local_metadata = result['translated_metadata']
+                        translated_metadata.append(local_metadata)
+
+                except Exception as e:
+                    self.log.warn('Exception while indexing content: %s', e)
+
+        # reduce translated_metadata to a minimal set with
+        # swh-metadata-detector
+        min_metadata = extract_minimal_metadata_dict(translated_metadata)
+        return min_metadata
+
+
+@click.command()
+@click.option('--revs_ids',
+              default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
+                       '026040ea79dec1b49b4e3e7beda9132b6b26b51b',
+                       '9699072e21eded4be8d45e3b8d543952533fa190'],
+              help='Default sha1_gits to lookup')
+def main(revs_ids):
+    _git_sha1s = list(map(hashutil.hash_to_bytes, revs_ids))
+    rev_metadata_indexer = RevisionMetadataIndexer()
+    rev_metadata_indexer.run(_git_sha1s, 'update-dups')
+
+
+if __name__ == '__main__':
+    import logging
+    logging.basicConfig(level=logging.INFO)
+    main()
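The two helpers used above, detect_metadata and extract_minimal_metadata_dict, are defined in the new swh/indexer/metadata_detector.py (next file in this diff). A small usage sketch of how they compose; the entries and sha1 values are made up, borrowed from the test fixtures:

    from swh.indexer.metadata_detector import (
        detect_metadata, extract_minimal_metadata_dict)

    # directory entries, as storage.directory_ls() would return them
    entries = [
        {'type': 'file', 'name': b'package.json', 'sha1': b'cde'},
        {'type': 'file', 'name': b'index.js', 'sha1': b'bcd'},
    ]
    detect_metadata(entries)
    # -> {'npm': [b'cde']}

    # merge two per-file translations into one revision-level dict
    extract_minimal_metadata_dict([
        {'name': 'yarn-parser', 'version': '1.0.0'},
        {'author': 'Andrew Nesbitt'},
    ])
    # -> {'name': ['yarn-parser'], 'version': ['1.0.0'],
    #     'author': ['Andrew Nesbitt'], all remaining terms: None}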
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_detector.py
@@ -0,0 +1,73 @@
+# Copyright (C) 2017  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+mapping_filenames = {
+    b"package.json": "npm",
+    b"codemeta.json": "codemeta"
+}
+
+
+def detect_metadata(files):
+    """Detect the files potentially containing metadata.
+
+    Args:
+        files (list): list of file entries, as returned by directory_ls
+
+    Returns:
+        - an empty dict if nothing was found
+        - a dict {mapping_filenames[name]: [f['sha1'], ...]} otherwise
+
+    """
+    results = {}
+    for f in files:
+        name = f['name'].lower().strip()
+        # TODO: possibility to detect extensions
+        if name in mapping_filenames:
+            tool = mapping_filenames[name]
+            if tool in results:
+                results[tool].append(f['sha1'])
+            else:
+                results[tool] = [f['sha1']]
+    return results
+
+
+def extract_minimal_metadata_dict(metadata_list):
+    """Every item in metadata_list is a dict of translated_metadata in the
+    CodeMeta vocabulary. Extract a minimal set of terms and keep all the
+    values corresponding to each term.
+
+    Args:
+        metadata_list (list): list of dicts of translated_metadata
+
+    Returns:
+        minimal_dict (dict): one dict with the selected values of metadata
+
+    """
+    minimal_dict = {
+        "developmentStatus": [],
+        "version": [],
+        "operatingSystem": [],
+        "description": [],
+        "keywords": [],
+        "issueTracker": [],
+        "name": [],
+        "author": [],
+        "relatedLink": [],
+        "url": [],
+        "type": [],
+        "license": [],
+        "maintainer": [],
+        "email": [],
+        "softwareRequirements": [],
+        "identifier": [],
+        "codeRepository": []
+    }
+    for term in minimal_dict.keys():
+        for metadata_dict in metadata_list:
+            if term in metadata_dict:
+                minimal_dict[term].append(metadata_dict[term])
+        if not minimal_dict[term]:
+            minimal_dict[term] = None
+    return minimal_dict
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -8,7 +8,7 @@
 from subprocess import Popen, PIPE
 
 from swh.scheduler import utils
-from .indexer import BaseIndexer
+from .indexer import ContentIndexer
 
 
 def compute_mimetype_encoding(raw_content):
@@ -35,7 +35,7 @@
 }
 
 
-class ContentMimetypeIndexer(BaseIndexer):
+class ContentMimetypeIndexer(ContentIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
@@ -67,7 +67,7 @@
         self.task_destination = None
         self.tools = self.retrieve_tools_information()
 
-    def filter_contents(self, sha1s):
+    def filter(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
@@ -78,7 +78,7 @@
             } for sha1 in sha1s
         ))
 
-    def index_content(self, sha1, raw_content):
+    def index(self, sha1, raw_content):
         """Index sha1s' content and store result.
 
         Args:
diff --git a/swh/indexer/orchestrator.py b/swh/indexer/orchestrator.py
--- a/swh/indexer/orchestrator.py
+++ b/swh/indexer/orchestrator.py
@@ -93,22 +93,22 @@
         self.indexers = indexers
         self.tasks = tasks
 
-    def run(self, sha1s):
+    def run(self, ids):
         for name, (idx_class, filtering, batch_size) in self.indexers.items():
             if filtering:
                 policy_update = 'ignore-dups'
                 indexer_class = get_class(idx_class)
-                sha1s_filtered = list(indexer_class().filter_contents(sha1s))
-                if not sha1s_filtered:
+                ids_filtered = list(indexer_class().filter(ids))
+                if not ids_filtered:
                     continue
             else:
                 policy_update = 'update-dups'
-                sha1s_filtered = sha1s
+                ids_filtered = ids
 
             celery_tasks = []
-            for sha1s_to_send in grouper(sha1s_filtered, batch_size):
+            for ids_to_send in grouper(ids_filtered, batch_size):
                 celery_task = self.tasks[name].s(
-                    sha1s=list(sha1s_to_send),
+                    ids=list(ids_to_send),
                     policy_update=policy_update)
                 celery_tasks.append(celery_task)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -8,38 +8,39 @@
 from nose.tools import istest
 
 from swh.indexer.metadata_dictionary import compute_metadata
+from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata import ContentMetadataIndexer
+from swh.indexer.metadata import RevisionMetadataIndexer
 from swh.indexer.tests.test_utils import MockObjStorage
+from swh.indexer.tests.test_utils import MockStorage
 
 
-class MockStorage():
-    """Mock storage to simplify reading indexers' outputs.
+class TestContentMetadataIndexer(ContentMetadataIndexer):
+    """Specific metadata indexer whose configuration is enough to satisfy
+    the indexing tests.
     """
-    def content_metadata_add(self, metadata, conflict_update=None):
-        self.state = metadata
-        self.conflict_update = conflict_update
-
-    def indexer_configuration_get(self, tool):
-        return {
-            'id': 30,
-            'name': 'hard_mapping_npm',
-            'version': '0.1',
-            'configuration': {
-                'type': 'local',
-                'context': 'npm'
-            },
+    def prepare(self):
+        self.config = {
+            'rescheduling_task': None,
         }
+        self.storage = MockStorage()
+        self.log = logging.getLogger('swh.indexer')
+        self.objstorage = MockObjStorage()
+        self.task_destination = None
+        self.rescheduling_task = self.config['rescheduling_task']
+        self.tools = self.retrieve_tools_information()
+        self.results = []
 
 
-class TestMetadataIndexer(ContentMetadataIndexer):
-    """Specific Metadata whose configuration is enough to satisfy the
+class TestRevisionMetadataIndexer(RevisionMetadataIndexer):
+    """Specific indexer whose configuration is enough to satisfy the
     indexing tests.
""" def prepare(self): self.config = { 'rescheduling_task': None, - 'tools': { - 'name': 'swh-metadata-translator', + 'tools': { + 'name': 'swh-metadata-detector', 'version': '0.0.1', 'configuration': { 'type': 'local', @@ -53,6 +54,7 @@ self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.retrieve_tools_information() + self.results = [] class Metadata(unittest.TestCase): @@ -64,6 +66,14 @@ shows the entire diff in the results """ self.maxDiff = None + self.content_tool = { + 'name': 'swh-metadata-translator', + 'version': '0.0.1', + 'configuration': { + 'type': 'local', + 'context': 'npm' + } + } @istest def test_compute_metadata_none(self): @@ -128,7 +138,7 @@ '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping - metadata_indexer = TestMetadataIndexer() + metadata_indexer = TestContentMetadataIndexer(self.content_tool, None) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') @@ -197,3 +207,89 @@ # The assertion bellow returns False sometimes because of nested lists self.assertEqual(expected_results, results) + + @istest + def test_detect_metadata_package_json(self): + # given + df = [{ + 'sha1_git': b'abc', + 'name': b'index.js', + 'target': b'abc', + 'length': 897, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'dir_a', + 'sha1': b'bcd' + }, + { + 'sha1_git': b'aab', + 'name': b'package.json', + 'target': b'aab', + 'length': 712, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'dir_a', + 'sha1': b'cde' + }] + # when + results = detect_metadata(df) + + expected_results = { + 'npm': [ + b'cde' + ] + } + # then + self.assertEqual(expected_results, results) + + @istest + def test_revision_metadata_indexer(self): + metadata_indexer = TestRevisionMetadataIndexer() + + sha1_gits = [ + b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', + ] + metadata_indexer.run(sha1_gits, 'update-dups') + + results = metadata_indexer.storage.state + + expected_results = [{ + 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', + 'translated_metadata': { + 'identifier': None, + 'maintainer': None, + 'url': [ + 'https://github.com/librariesio/yarn-parser#readme' + ], + 'codeRepository': [{ + 'type': 'git', + 'url': 'git+https://github.com/librariesio/yarn-parser.git' + }], + 'author': ['Andrew Nesbitt'], + 'license': ['AGPL-3.0'], + 'version': ['1.0.0'], + 'description': [ + 'Tiny web service for parsing yarn.lock files' + ], + 'relatedLink': None, + 'developmentStatus': None, + 'operatingSystem': None, + 'issueTracker': [{ + 'url': 'https://github.com/librariesio/yarn-parser/issues' + }], + 'softwareRequirements': [{ + 'express': '^4.14.0', + 'yarn': '^0.21.0', + 'body-parser': '^1.15.2' + }], + 'name': ['yarn-parser'], + 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], + 'type': None, + 'email': None + }, + 'indexer_configuration_id': 7 + }] + # then + self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -101,8 +101,9 @@ "license": "Artistic-2.0" } + """, + 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """ - } def get(self, sha1): @@ -110,3 +111,139 @@ if not raw_content: raise ObjNotFoundError() return raw_content + + +class MockStorage(): + """Mock storage to simplify reading indexers' outputs. 
+ """ + def content_metadata_missing(self, sha1s): + yield from [] + + def content_metadata_add(self, metadata, conflict_update=None): + self.state = metadata + self.conflict_update = conflict_update + + def revision_metadata_add(self, metadata, conflict_update=None): + self.state = metadata + self.conflict_update = conflict_update + + def indexer_configuration_get(self, tool): + if tool['tool_name'] == 'swh-metadata-translator': + return { + 'id': 30, + 'tool_name': 'swh-metadata-translator', + 'tool_version': '0.0.1', + 'tool_configuration': { + 'type': 'local', + 'context': 'npm' + }, + } + elif tool['tool_name'] == 'swh-metadata-detector': + return { + 'id': 7, + 'tool_name': 'swh-metadata-detector', + 'tool_version': '0.0.1', + 'tool_configuration': { + 'type': 'local', + 'context': 'npm' + }, + } + + def revision_get(self, revisions): + return [{ + 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', + 'committer': { + 'id': 26, + 'name': b'Andrew Nesbitt', + 'fullname': b'Andrew Nesbitt ', + 'email': b'andrewnez@gmail.com' + }, + 'synthetic': False, + 'date': { + 'negative_utc': False, + 'timestamp': { + 'seconds': 1487596456, + 'microseconds': 0 + }, + 'offset': 0 + }, + 'directory': b'10' + }] + + def directory_ls(self, directory, recursive=False, cur=None): + # with directory: b'\x9d', + return [{ + 'sha1_git': b'abc', + 'name': b'index.js', + 'target': b'abc', + 'length': 897, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'10', + 'sha1': b'bcd' + }, + { + 'sha1_git': b'aab', + 'name': b'package.json', + 'target': b'aab', + 'length': 712, + 'status': 'visible', + 'type': 'file', + 'perms': 33188, + 'dir_id': b'10', + 'sha1': b'cde' + }, + { + 'dir_id': b'10', + 'target': b'11', + 'type': 'dir', + 'length': None, + 'name': b'.github', + 'sha1': None, + 'perms': 16384, + 'sha1_git': None, + 'status': None, + 'sha256': None + }] + + def content_metadata_get(self, sha1s): + return [{ + 'tool': { + 'configuration': { + 'type': 'local', + 'context': 'npm' + }, + 'version': '0.0.1', + 'id': 6, + 'name': 'swh-metadata-translator' + }, + 'id': b'cde', + 'translated_metadata': { + 'issueTracker': { + 'url': 'https://github.com/librariesio/yarn-parser/issues' + }, + 'version': '1.0.0', + 'name': 'yarn-parser', + 'author': 'Andrew Nesbitt', + 'url': 'https://github.com/librariesio/yarn-parser#readme', + 'processorRequirements': {'node': '7.5'}, + 'other': { + 'scripts': { + 'start': 'node index.js' + }, + 'main': 'index.js' + }, + 'license': 'AGPL-3.0', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'codeRepository': { + 'type': 'git', + 'url': 'git+https://github.com/librariesio/yarn-parser.git' + }, + 'description': 'Tiny web service for parsing yarn.lock files', + 'softwareRequirements': { + 'yarn': '^0.21.0', + 'express': '^4.14.0', + 'body-parser': '^1.15.2'} + } + }]