diff --git a/PKG-INFO b/PKG-INFO index da1d5fb..838ac17 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.58 +Version: 0.0.59 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. 
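As an aside on the indexation procedure listed above, a minimal sketch of the per-batch loop an indexer follows is shown below; objstorage, compute_index and idx_storage are illustrative stand-ins for this sketch, not the actual swh.indexer API:

def index_batch(ids, objstorage, compute_index, idx_storage):
    # Sketch of the generic procedure: receive a batch of ids, retrieve
    # the associated data, compute an index per object, store the results.
    results = []
    for object_id in ids:
        data = objstorage.get(object_id)         # retrieve associated data
        result = compute_index(object_id, data)  # compute the index
        if result:
            results.append(result)
    idx_storage.store(results)                   # persist in swh's storage
    return results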
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index da1d5fb..838ac17 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.58 +Version: 0.0.59 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index 0679692..492e7c0 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,156 +1,155 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click import subprocess import json from swh.model import hashutil from .language import compute_language from .indexer import ContentIndexer, DiskIndexer # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) 
'--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. Args: path: path to the file lang: language for that path (optional) Returns: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer, DiskIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def compute_ctags(self, path, lang): """Compute ctags on file at path with language lang. """ return run_ctags(path, lang=lang) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) self.cleanup(content_path) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 37522b9..2e984e4 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,172 +1,186 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click import subprocess from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer +def compute_license(path, log=None): + """Determine license from file at path. 
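As a usage sketch for the run_ctags helper in ctags.py above: it shells out to universal-ctags with JSON output and yields one dict per symbol. The file path and forced language below are illustrative, and universal-ctags must be installed:

from swh.indexer.ctags import run_ctags

for symbol in run_ctags('/tmp/example.py', lang='Python'):
    # each yielded symbol carries the 'name', 'kind', 'line' and 'lang' keys
    print(symbol['name'], symbol['kind'], symbol['line'], symbol['lang'])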
+ + Args: + path: filepath to determine the license + + Returns: + A dict with the following keys: + - licenses ([str]): associated detected licenses to path + - path (bytes): content filepath + + """ + try: + properties = subprocess.check_output(['nomossa', path], + universal_newlines=True) + if properties: + res = properties.rstrip().split(' contains license(s) ') + licenses = res[1].split(',') + else: + licenses = [] + + return { + 'licenses': licenses, + 'path': path, + } + except subprocess.CalledProcessError: + if log: + from os import path as __path + log.exception('Problem during license detection for sha1 %s' % + __path.basename(path)) + return { + 'licenses': [], + 'path': path, + } + + class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. See :class:`ContentFossologyLicenseIndexer` and :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.tool = self.tools[0] def compute_license(self, path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: A dict with the following keys: - licenses ([str]): associated detected licenses to path - path (bytes): content filepath - - tool (str): tool used to compute the output """ - try: - properties = subprocess.check_output(['nomossa', path], - universal_newlines=True) - if properties: - res = properties.rstrip().split(' contains license(s) ') - licenses = res[1].split(',') - - return { - 'licenses': licenses, - 'path': path, - } - except subprocess.CalledProcessError: - if log: - from os import path as __path - log.exception('Problem during license detection for sha1 %s' % - __path.basename(path)) - return { - 'licenses': [], - 'path': path, - } + return compute_license(path, log=log) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier - raw_content (bytes): raw content in bytes + raw_content (bytes): associated raw content to content id Returns: A dict, representing a content_license, with keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path + - indexer_configuration_id (int): tool used to compute the output """ - if isinstance(id, str): - id = hashutil.hash_to_hex(id) content_path = self.write_to_temp( - filename=id, + filename=hashutil.hash_to_hex(id), # use the id as pathname data=data) try: properties = self.compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) finally: self.cleanup(content_path) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
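A usage sketch for the module-level compute_license helper factored out above; it assumes the nomossa binary is installed and relies on the ' contains license(s) ' output convention that the helper parses (the path below is hypothetical):

from swh.indexer.fossology_license import compute_license

# nomossa prints e.g. "/tmp/foo.c contains license(s) GPL,AGPL", which the
# helper splits into a list of license names.
result = compute_license('/tmp/foo.c')
print(result['licenses'])  # e.g. ['GPL', 'AGPL'], or [] when nothing is found
print(result['path'])      # '/tmp/foo.c'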
Args: results ([dict]): list of content_license, dict with the following keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) class ContentFossologyLicenseIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {license, encoding} from that content - store result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class FossologyLicenseRangeIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): """FossologyLicense Range Indexer working on range of content identifiers. It: - filters out the non textual content - (optionally) filters out content already indexed (cf :callable:`range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. Args **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier Yields: Content identifier (bytes) present in the range [start, end] """ while start: result = self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) contents = result['ids'] for _id in contents: yield _id start = result['next'] diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 1136d8d..5d8dd8d 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,587 +1,586 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: - sha1 (str): the sha1 name filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. 
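The cursor-based pagination used by indexed_contents_in_range above can be summed up as follows; get_range is a stand-in for any endpoint returning the same {'ids': [...], 'next': ...} shape as content_fossology_license_get_range:

def iter_range(get_range, start, end):
    # Follow the 'next' cursor page by page until it is None,
    # mirroring indexed_contents_in_range above.
    while start:
        page = get_range(start, end)  # {'ids': [...], 'next': cursor or None}
        yield from page['ids']
        start = page['next']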
""" os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :func:`run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :func:`filter`: filter out data already indexed (in storage). :func:`index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :func:`persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :func:`prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :func:`check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :func:`register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. """ super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) if self.config['storage']: self.storage = get_storage(**self.config['storage']) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. 
""" return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: List of dict with additional id key. Raises: ValueError if not a list nor a dict. """ tools = self.config['tools'] if isinstance(tools, list): tools = map(self._prepare_tool, tools) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') return self.idx_storage.indexer_configuration_add(tools) @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: a dict that makes sense for the persist_index_computations function. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name') task['next_run'] = datetime.datetime.now() task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. 
Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. Args **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier Yields: Content identifier (bytes) present in the range [start, end] """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier **indexed** (Set[bytes]): Set of content already indexed. Yields: Identifier (bytes) of contents to index. """ while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = c['sha1'] if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier **indexed** (Set[bytes]): Set of content already indexed. Yields: Data indexed (dict) to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: yield res def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). Args: **start** (Union[bytes, str]): Starting range identifier **end** (Union[bytes, str]): Ending range identifier **skip_existing** (bool): Skip existing indexed data (default) or not **kwargs: passed to the `index` method Returns: a boolean. True if data was indexed, False otherwise. 
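For instance, a concrete range indexer such as FossologyLicenseRangeIndexer defined earlier could be driven as follows, assuming its configuration file points at reachable storage, objstorage and indexer-storage services; hex bounds are accepted and converted to bytes by run():

from swh.indexer.fossology_license import FossologyLicenseRangeIndexer

indexer = FossologyLicenseRangeIndexer()
indexed_something = indexer.run(
    start='0000000000000000000000000000000000000000',
    end='ffffffffffffffffffffffffffffffffffffffff',
    skip_existing=True)   # skip contents already indexed by this tool
print(indexed_something)  # True if at least one batch of results was persisted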
""" with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: indexed = set(self.indexed_contents_in_range(start, end)) else: indexed = set() index_computations = self._index_contents(start, end, indexed) for results in utils.grouper(index_computations, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True return with_indexed_data except Exception: self.log.exception( 'Problem when computing metadata.') class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, parse_ids=False, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them parse_ids (bool: If `True`, will try to convert `ids` from a human input to the valid type. next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ if parse_ids: ids = [ o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origins %s not found in storage' % list(ids)) continue try: res = self.index(origin, **kwargs) if origin: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % id_) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. 
""" def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [id_.encode() if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 7dd43af..a17a09c 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,156 +1,155 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click import magic from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: A dict with mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type.encode('utf-8'), 'encoding': r.encoding.encode('utf-8'), } class MixinMimetypeIndexer: """Mixin mimetype indexer. See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() self.tool = self.tools[0] def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): """Mimetype Indexer working on list of content identifiers. It: - (optionally) filters out content already indexed (cf. 
:callable:`filter`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage FIXME: - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer - 2. Do we keep it afterwards? ~> i think this can be used with the journal """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): """Mimetype Range Indexer working on range of content identifiers. It: - (optionally) filters out content already indexed (cf :callable:`range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. Args **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier Yields: Content identifier (bytes) present in the range [start, end] """ while start: result = self.idx_storage.content_mimetype_get_range( start, end, self.tool['id']) contents = result['ids'] for _id in contents: yield _id start = result['next'] diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index 10ad15f..fdcf515 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,172 +1,200 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging +from unittest.mock import patch + from swh.indexer.fossology_license import ( - ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer + ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer, + compute_license ) from swh.indexer.tests.test_utils import ( MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer ) +class BasicTest(unittest.TestCase): + @patch('swh.indexer.fossology_license.subprocess') + def test_compute_license(self, mock_subprocess): + """Computing licenses from a raw content should return results + + """ + for path, intermediary_result, output in [ + (b'some/path', None, + []), + (b'some/path/2', [], + []), + (b'other/path', ' contains license(s) GPL,AGPL', + ['GPL', 'AGPL'])]: + mock_subprocess.check_output.return_value = intermediary_result + + actual_result = compute_license(path, log=None) + + self.assertEqual(actual_result, { + 'licenses': output, + 'path': path, + }) + + class InjectLicenseIndexer: """Override license computations. """ def compute_license(self, path, log=None): """path is the content identifier """ + if isinstance(id, bytes): + path = path.decode('utf-8') return { 'licenses': SHA1_TO_LICENSES.get(path) } class FossologyLicenseTestIndexer( NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer): """Specific fossology license whose configuration is enough to satisfy the indexing checks. 
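Back in mimetype.py above, the compute_mimetype_encoding helper can be called directly; the exact mimetype and encoding returned depend on the installed libmagic, so the values in the comment are only indicative:

from swh.indexer.mimetype import compute_mimetype_encoding

# magic.detect_from_content inspects the raw bytes; the result dict
# carries the mimetype and encoding as bytes values.
print(compute_mimetype_encoding(b'def foo():\n    return 42\n'))
# e.g. {'mimetype': b'text/x-python', 'encoding': b'us-ascii'}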
""" def prepare(self): self.config = { 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, } self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): """Language indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ def setUp(self): self.indexer = FossologyLicenseTestIndexer() self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content tool_id = self.indexer.tool['id'] # then self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id0], }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id1], }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id2], } } class FossologyLicenseRangeIndexerTest( NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): """Testing the range indexer on fossology license. """ def prepare(self): self.config = { 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, 'write_batch_size': 100, } self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') # this hardcodes some contents, will use this to setup the storage self.objstorage = MockObjStorage() # sync objstorage and storage contents = [{'sha1': c_id} for c_id in self.objstorage] self.storage = BasicMockStorage(contents) self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestFossologyLicenseRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Fossology License Indexer tests. 
- new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): self.indexer = FossologyLicenseRangeIndexerTest() # will play along with the objstorage's mocked contents for now self.contents = sorted(self.indexer.objstorage) # FIXME: leverage swh.objstorage.in_memory_storage's # InMemoryObjStorage, swh.storage.tests's gen_contents, and # hypothesis to generate data to actually run indexer on those self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id0] }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id1] }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id2] } } class FossologyLicenseIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseTestIndexer): """Fossology license indexer with wrong configuration""" class FossologyLicenseRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseRangeIndexerTest): """Fossology license range indexer with wrong configuration""" class TestFossologyLicenseIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = FossologyLicenseIndexerUnknownToolTestStorage RangeIndexer = FossologyLicenseRangeIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index 70d2e1d..c3aee8e 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,163 +1,188 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging +from unittest.mock import patch + from swh.indexer.mimetype import ( - ContentMimetypeIndexer, MimetypeRangeIndexer + ContentMimetypeIndexer, MimetypeRangeIndexer, compute_mimetype_encoding ) from swh.indexer.tests.test_utils import ( MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, CommonContentIndexerTest, CommonContentIndexerRangeTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool ) +class FakeMagicResult: + def __init__(self, mimetype, encoding): + self.mime_type = mimetype + self.encoding = encoding + + +class BasicTest(unittest.TestCase): + @patch('swh.indexer.mimetype.magic') + def test_compute_mimetype_encoding(self, mock_magic): + """Compute mimetype encoding should return results""" + for _input, _mimetype, _encoding in [ + (b'some-content', 'text/plain', 'utf-8'), + (b'raw-content', 'application/json', 'ascii')]: + mock_magic.detect_from_content.return_value = FakeMagicResult( + _mimetype, _encoding) + + actual_result = compute_mimetype_encoding(_input) + self.assertEqual(actual_result, { + 'mimetype': _mimetype.encode('utf-8'), + 'encoding': _encoding.encode('utf-8'), + }) + + class MimetypeTestIndexer(ContentMimetypeIndexer): """Specific mimetype indexer instance whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.config = { 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): """Mimetype indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ def setUp(self): self.indexer = MimetypeTestIndexer() self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'mimetype': b'application/x-empty', 'encoding': b'binary', } } class MimetypeRangeIndexerTest(MimetypeRangeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, 'write_batch_size': 100, } self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') # this hardcodes some contents, will use this to setup the storage self.objstorage = MockObjStorage() # sync objstorage and storage contents = [{'sha1': c_id} for c_id in self.objstorage] self.storage = BasicMockStorage(contents) self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestMimetypeRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Mimetype Indexer tests. 
- new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): self.indexer = MimetypeRangeIndexerTest() # will play along with the objstorage's mocked contents for now self.contents = sorted(self.indexer.objstorage) # FIXME: leverage swh.objstorage.in_memory_storage's # InMemoryObjStorage, swh.storage.tests's gen_contents, and # hypothesis to generate data to actually run indexer on those self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'encoding': b'us-ascii', 'id': self.id0, 'indexer_configuration_id': tool_id, 'mimetype': b'text/plain'}, self.id1: { 'encoding': b'us-ascii', 'id': self.id1, 'indexer_configuration_id': tool_id, 'mimetype': b'text/x-python'}, self.id2: { 'encoding': b'us-ascii', 'id': self.id2, 'indexer_configuration_id': tool_id, 'mimetype': b'text/plain'} } class MimetypeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeTestIndexer): """Fossology license indexer with wrong configuration""" class MimetypeRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeRangeIndexerTest): """Fossology license range indexer with wrong configuration""" class TestMimetypeIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = MimetypeIndexerUnknownToolTestStorage RangeIndexer = MimetypeRangeIndexerUnknownToolTestStorage diff --git a/version.txt b/version.txt index 1e222fc..d1c0402 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.58-0-ga8546bc \ No newline at end of file +v0.0.59-0-g45c8f94 \ No newline at end of file