diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index e8fa96d..72d8b78 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,584 +1,585 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. 
The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :func:`run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :func:`filter`: filter out data already indexed (in storage). :func:`index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :func:`persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :func:`prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :func:`check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :func:`register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. 
""" super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) if self.config['storage']: self.storage = get_storage(**self.config['storage']) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) - def check(self): + def check(self, *, check_tools=True): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ - if not self.tools: + if check_tools and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: List of dict with additional id key. Raises: ValueError if not a list nor a dict. 
""" tools = self.config['tools'] if isinstance(tools, list): tools = map(self._prepare_tool, tools) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') - return self.idx_storage.indexer_configuration_add(tools) + if tools: + return self.idx_storage.indexer_configuration_add(tools) @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: a dict that makes sense for the persist_index_computations function. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. 
Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name') task['next_run'] = datetime.datetime.now() task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. 
**kwargs: passed to the `index` method """ results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. Args **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier Yields: Content identifier (bytes) present in the range [start, end] """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier **indexed** (Set[bytes]): Set of content already indexed. Yields: Identifier (bytes) of contents to index. 
""" while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = c['sha1'] if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: **start** (bytes): Starting bound from range identifier **end** (bytes): End range identifier **indexed** (Set[bytes]): Set of content already indexed. Yields: Data indexed (dict) to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: yield res def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). Args: **start** (Union[bytes, str]): Starting range identifier **end** (Union[bytes, str]): Ending range identifier **skip_existing** (bool): Skip existing indexed data (default) or not **kwargs: passed to the `index` method Returns: a boolean. True if data was indexed, False otherwise. 
""" with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: indexed = set(self.indexed_contents_in_range(start, end)) else: indexed = set() index_computations = self._index_contents(start, end, indexed) for results in utils.grouper(index_computations, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True return with_indexed_data except Exception: self.log.exception( 'Problem when computing metadata.') class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update='update-dups', parse_ids=True, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. 
parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ if parse_ids: ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origins %s not found in storage' % list(ids)) continue try: res = self.index(origin, **kwargs) if origin: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % id_) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. 
""" def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [id_.encode() if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 799849e..4406c11 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,335 +1,343 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 
'swh-metadata-translator' tool - store result in content_metadata table """ CONFIG_BASE_FILENAME = 'indexer/content_metadata' def __init__(self, tool, config): # twisted way to use the exact same config of RevisionMetadataIndexer # object that uses internally ContentMetadataIndexer self.config = config self.config['tools'] = tool super().__init__() def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] result['translated_metadata'] = MAPPINGS[mapping_name] \ .translate(data) # a twisted way to keep result with indexer object for get_results self.results.append(result) except Exception: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def get_results(self): """can be called only if run method was called before Returns: list: list of content_metadata entries calculated by current indexer """ return self.results class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/revision_metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } ContentMetadataIndexer = ContentMetadataIndexer def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. 
use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - translated_metadata: dict of retrieved metadata """ - try: - result = { - 'id': rev['id'].decode(), - 'indexer_configuration_id': self.tool['id'], - 'translated_metadata': None - } + result = { + 'id': rev['id'].decode(), + 'indexer_configuration_id': self.tool['id'], + 'translated_metadata': None + } + try: root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { INDEXER_CFG_KEY: self.idx_storage, 'objstorage': self.objstorage } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = self.ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # schedule indexation of content try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: results = c_metadata_indexer.get_results() for result in results: 
local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception as e: self.log.warning("""Exception while indexing content""", e) # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata class OriginMetadataIndexer(OriginIndexer): + ADDITIONAL_CONFIG = { + 'tools': ('list', []) + } + + def check(self, **kwargs): + kwargs['check_tools'] = False + super().check(**kwargs) + def filter(self, ids): return ids def run(self, revisions_metadata, policy_update, *, origin_head): """Expected to be called with the result of RevisionMetadataIndexer as first argument; ie. not a list of ids as other indexers would. Args: * `revisions_metadata` (List[dict]): contains metadata from revisions, along with the respective revision ids. It is passed by RevisionMetadataIndexer via a Celery chain triggered by OriginIndexer.next_step. * `policy_update`: `'ignore-dups'` or `'update-dups'` * `origin_head` (dict): {str(origin_id): rev_id.encode()} keys `origin_id` and `revision_id`, which is the result of OriginHeadIndexer. """ origin_head_map = {int(origin_id): rev_id for (origin_id, rev_id) in origin_head.items()} # Fix up the argument order. revisions_metadata has to be the # first argument because of celery.chain; the next line calls # run() with the usual order, ie. origin ids first. return super().run(ids=list(origin_head_map), policy_update=policy_update, parse_ids=False, revisions_metadata=revisions_metadata, origin_head_map=origin_head_map) def index(self, origin, *, revisions_metadata, origin_head_map): # Get the last revision of the origin. 
revision_id = origin_head_map[origin['id']] # Get the metadata of that revision, and return it for revision_metadata in revisions_metadata: if revision_metadata['id'] == revision_id: return { 'origin_id': origin['id'], 'metadata': revision_metadata['translated_metadata'], 'from_revision': revision_id, 'indexer_configuration_id': revision_metadata['indexer_configuration_id'], } raise KeyError('%r not in %r' % (revision_id, [r['id'] for r in revisions_metadata])) def persist_index_computations(self, results, policy_update): self.idx_storage.origin_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--revs', '-i', help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 0a34143..e6393a6 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,743 +1,745 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import psycopg2 from collections import defaultdict from swh.core.api import remote_api_endpoint from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError from .db import Db from . import converters INDEXER_CFG_KEY = 'indexer_storage' def get_indexer_storage(cls, args): """Get an indexer storage object of class `storage_class` with arguments `storage_args`. 
Args: cls (str): storage's class, either 'local' or 'remote' args (dict): dictionary of arguments passed to the storage class constructor Returns: an instance of swh.indexer's storage (either local or remote) Raises: ValueError if passed an unknown storage class. """ if cls == 'remote': from .api.client import RemoteStorage as IndexerStorage elif cls == 'local': from . import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) @remote_api_endpoint('check_config') def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables with self.get_db().transaction() as cur: if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] return True @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """Generate mimetypes missing from storage. 
Args: mimetypes (iterable): iterable of dict with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: tuple (id, indexer_configuration_id): missing id """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): """Retrieve ids of type content_type within range [start, end] bound by limit. Args: **content_type** (str): content's type (mimetype, language, etc...) **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: ValueError for; - limit to None - wrong content_type provided Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: err = 'Development error: Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) raise ValueError(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve mimetypes within range [start, end] bound by limit. 
Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **indexer_configuration_id** (int): tool's id used to compute the results - **conflict_update** (bool): Flag to determine if we want to overwrite (``True``) or skip duplicates (``False``, the default) """ db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): """Retrieve full content mimetype per ids. 
Args: ids (iterable): sha1 identifier Yields: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **tool** (dict): Tool used to compute the language """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): """Retrieve full content language per ids. Args: ids (iterable): sha1 identifier Yields: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **lang** (bytes): raw content's language - **tool** (dict): Tool used to compute the language """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. 
@remote_api_endpoint('content/ctags/missing')
@db_transaction_generator()
def content_ctags_missing(self, ctags, db=None, cur=None):
    """Yield the ids whose ctags are not yet in storage.

    Args:
        ctags (iterable): dicts with keys:

            - **id** (bytes): sha1 identifier
            - **indexer_configuration_id** (int): tool used to compute
              the results
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        the first column of every row the database reports as missing
        for the tuple (id, indexer_configuration_id)

    """
    missing_rows = db.content_ctags_missing_from_list(ctags, cur)
    yield from (row[0] for row in missing_rows)
@remote_api_endpoint('content/ctags/search')
@db_transaction_generator()
def content_ctags_search(self, expression, limit=10, last_sha1=None,
                         db=None, cur=None):
    """Search the raw ctags symbols of contents.

    Args:
        expression (str): Expression to search for
        limit (int): Number of rows to return (default to 10).
        last_sha1 (str): Offset from which to retrieve data
            (default to ``None``).
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        rows of ctags including id, name, lang, kind, line, etc...

    """
    matches = db.content_ctags_search(expression, last_sha1, limit, cur=cur)
    for match in matches:
        record = dict(zip(db.content_ctags_cols, match))
        yield converters.db_to_ctags(record)
@remote_api_endpoint('content/fossology_license/add')
@db_transaction()
def content_fossology_license_add(self, licenses, conflict_update=False,
                                  db=None, cur=None):
    """Bulk-load content licenses through a temporary table.

    Args:
        licenses (iterable): dictionaries with keys:

            - **id**: sha1
            - **licenses** ([bytes]): List of licenses associated to sha1
            - **indexer_configuration_id** (int): tool used to compute
              the licenses
        conflict_update: Flag to determine if we want to overwrite (true)
            or skip duplicates (false, the default)
        db: database accessor (presumably injected by ``db_transaction``)
        cur: cursor handed to every database call

    Returns:
        None

    """
    def _one_row_per_license():
        # Flatten lazily: each (content, license) pair becomes one row.
        for entry in licenses:
            for license_name in entry['licenses']:
                yield {
                    'id': entry['id'],
                    'indexer_configuration_id':
                        entry['indexer_configuration_id'],
                    'license': license_name,
                }

    db.mktemp_content_fossology_license(cur)
    db.copy_to(
        _one_row_per_license(),
        tblname='tmp_content_fossology_license',
        columns=['id', 'license', 'indexer_configuration_id'],
        cur=cur)
    db.content_fossology_license_add_from_temp(conflict_update, cur)
@remote_api_endpoint('content_metadata')
@db_transaction_generator()
def content_metadata_get(self, ids, db=None, cur=None):
    """Yield the stored metadata of the given contents.

    Args:
        ids (iterable): sha1 checksums
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        one converted record per database row; per the storage
        contract each record holds:

        - **id** (bytes)
        - **translated_metadata** (str): associated metadata
        - **tool** (dict): tool used to compute metadata

    """
    for row in db.content_metadata_get_from_list(ids, cur):
        record = dict(zip(db.content_metadata_cols, row))
        yield converters.db_to_metadata(record)
@remote_api_endpoint('revision_metadata/missing')
@db_transaction_generator()
def revision_metadata_missing(self, metadata, db=None, cur=None):
    """Yield the revision ids whose metadata is not yet in storage.

    Args:
        metadata (iterable): dictionaries with keys:

            - **id** (bytes): sha1_git revision identifier
            - **indexer_configuration_id** (int): tool used to compute
              the results
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        the first column of every row the database reports as missing

    """
    missing_rows = db.revision_metadata_missing_from_list(metadata, cur)
    yield from (row[0] for row in missing_rows)
@remote_api_endpoint('origin_intrinsic_metadata')
@db_transaction_generator()
def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
    """Yield the stored intrinsic metadata of the given origins.

    Args:
        ids (iterable): origin identifiers
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        one converted record per database row; per the storage
        contract each record holds:

        - **id** (int)
        - **translated_metadata** (str): associated metadata
        - **tool** (dict): tool used to compute metadata

    """
    for row in db.origin_intrinsic_metadata_get_from_list(ids, cur):
        record = dict(zip(db.origin_intrinsic_metadata_cols, row))
        yield converters.db_to_metadata(record)
@remote_api_endpoint('origin_intrinsic_metadata/search/fulltext')
@db_transaction_generator()
def origin_intrinsic_metadata_search_fulltext(
        self, conjunction, limit=100, db=None, cur=None):
    """Yield the origins whose metadata contain all the given terms.

    Args:
        conjunction (List[str]): List of terms to be searched for.
        limit (int): The maximum number of results to return
        db: database accessor (presumably injected by the decorator)
        cur: cursor handed to the database call

    Yields:
        one converted record per database row; per the storage
        contract each record holds:

        - **id** (int)
        - **metadata** (str): associated metadata
        - **tool** (dict): tool used to compute metadata

    """
    hits = db.origin_intrinsic_metadata_search_fulltext(
        conjunction, limit=limit, cur=cur)
    for hit in hits:
        record = dict(zip(db.origin_intrinsic_metadata_cols, hit))
        yield converters.db_to_metadata(record)
""" db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The identifier of the tool if it exists, None otherwise. """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 08a2346..510ae1a 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,127 +1,122 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time import logging import unittest from celery import task from swh.indexer.metadata import OriginMetadataIndexer from swh.indexer.tests.test_utils import MockObjStorage, MockStorage from swh.indexer.tests.test_utils import MockIndexerStorage from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer from swh.indexer.tests.test_metadata import RevisionMetadataTestIndexer from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture class 
class OriginHeadTestIndexer(OriginHeadTestIndexer):
    """Origin-head test indexer wired to this module's test tasks.

    Subclasses the imported ``OriginHeadTestIndexer`` (and rebinds the
    name) so that the follow-up tasks it schedules are the test tasks
    defined in this module.
    """

    def prepare(self):
        super().prepare()
        # Map each pipeline step to the name of its test task.
        test_tasks = {}
        test_tasks['revision_metadata'] = 'revision_metadata_test_task'
        test_tasks['origin_intrinsic_metadata'] = \
            'origin_intrinsic_metadata_test_task'
        self.config['tasks'] = test_tasks
def test_pipeline(self):
    """Run the origin-head indexer and check both follow-up tasks.

    The first scheduled task is expected to schedule the second one, so
    the ready tasks are run twice with a short pause in between.
    """
    revision_id = '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'
    tool_id = 7
    metadata = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'url':
            'https://github.com/librariesio/yarn-parser#readme',
        'schema:codeRepository':
            'git+https://github.com/librariesio/yarn-parser.git',
        'schema:author': 'Andrew Nesbitt',
        'license': 'AGPL-3.0',
        'version': '1.0.0',
        'description':
            'Tiny web service for parsing yarn.lock files',
        'codemeta:issueTracker':
            'https://github.com/librariesio/yarn-parser/issues',
        'name': 'yarn-parser',
        'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
    }
    expected_revision_row = {
        'id': revision_id,
        'translated_metadata': metadata,
        'indexer_configuration_id': tool_id,
    }
    expected_origin_row = {
        'origin_id': 54974445,
        'from_revision': revision_id,
        'metadata': metadata,
        'indexer_configuration_id': tool_id,
    }
    expected_results = [
        ('origin_intrinsic_metadata', True, [expected_origin_row]),
        ('revision_metadata', True, [expected_revision_row]),
    ]

    head_indexer = OriginHeadTestIndexer()
    head_indexer.scheduler = self.scheduler
    head_indexer.run(["git+https://github.com/librariesio/yarn-parser"])

    self.run_ready_tasks()  # Run the first task
    time.sleep(0.1)  # Give it time to complete and schedule the 2nd one
    self.run_ready_tasks()  # Run the second task

    recorded = list(head_indexer.idx_storage.added_data)
    self.assertCountEqual(expected_results, recorded)
# Latest snapshot of each test origin, keyed by origin id.  Every branch
# maps to a dict with the str keys 'target' and 'target_type'.
SNAPSHOTS = {
    52189575: {
        'branches': {
            b'refs/heads/add-revision-origin-cache': {
                'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
                          b's\xe7/\xe9l\x1e',
                'target_type': 'revision'},
            b'HEAD': {
                'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
                          b'\xac\xefrm',
                'target_type': 'revision'},
            b'refs/tags/v0.0.103': {
                'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
                          b'\x0f\xdd',
                'target_type': 'release'},
        }},
    4423668: {
        'branches': {
            b'3DLDF-1.1.4.tar.gz': {
                'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
                          b'"G\x99\x11',
                'target_type': 'revision'},
            b'3DLDF-2.0.2.tar.gz': {
                'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
                          b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
                'target_type': 'revision'},
            b'3DLDF-2.0.3-examples.tar.gz': {
                'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
                          b'\xfe\xadZ\x80\x80\xc1\x83\xff',
                'target_type': 'revision'},
            b'3DLDF-2.0.3.tar.gz': {
                'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                          b'\xcc\x1a\xb4`\x8c\x8by',
                'target_type': 'revision'},
            b'3DLDF-2.0.tar.gz': {
                'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
                          b'\xd3\xd1m',
                # str key like every other branch (was a bytes key).
                'target_type': 'revision'}
        }},
    77775770: {
        'branches': {
            b'master': {
                'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                          b'\xa6\xe9\x99\xb1\x9e]q\xeb',
                'target_type': 'revision'}
        },
        'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
              b"\x1d\r "},
    85072327: {
        'branches': {
            b'HEAD': {
                'target': b'releases/2018.09.09',
                'target_type': 'alias'},
            b'releases/2018.09.01': {
                'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
                          b'\xbb\xdfF\xfdw\xcf',
                'target_type': 'revision'},
            b'releases/2018.09.09': {
                'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                          b'A\x10\x9d\xc5\xfa2\xf8t',
                'target_type': 'revision'}},
        'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
              b'\x12\x9e\xd6\xb3'},
    49908349: {
        'branches': {
            b'master': {
                'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                          b'\xc9\xad#.\x1bw=\x18',
                'target_type': 'revision'}},
        'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
              b'\x05\xea\xb8\x1f\xc4H\xf4s'},
    54974445: {
        'branches': {
            b'HEAD': {
                'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
                'target_type': 'revision'}}}
}
""" data = {} def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. 
class MockIndexerStorage():
    """Mock an swh-indexer storage.

    Every ``*_add`` call is recorded in ``added_data`` as a
    ``(endpoint_name, conflict_update, metadata)`` tuple.
    """
    # Class-level on purpose: the list is shared by all instances, so
    # tests reset it with ``MockIndexerStorage.added_data = []``.
    added_data = []

    def indexer_configuration_add(self, tools):
        """Return the registered form of each tool, in input order.

        Args:
            tools (iterable): dicts with at least a ``tool_name`` key

        Raises:
            AssertionError: when a tool name is not one of the known tools
        """
        return [self._indexer_configuration_add_one(tool) for tool in tools]

    def _indexer_configuration_add_one(self, tool):
        """Build the stored representation of one known tool."""
        # Rebuilt on every call so callers never share mutable dicts.
        known_tools = {
            'swh-metadata-translator': {
                'id': 30,
                'tool_configuration': {
                    'type': 'local', 'context': 'NpmMapping'
                },
            },
            'swh-metadata-detector': {
                'id': 7,
                'tool_configuration': {
                    'type': 'local', 'context': 'NpmMapping'
                },
            },
            'origin-metadata': {
                'id': 8,
                'tool_configuration': {},
            },
        }
        name = tool['tool_name']
        if name not in known_tools:
            # Raised explicitly: a bare ``assert False`` is removed under
            # ``python -O`` and the method would silently return None.
            raise AssertionError('Unknown tool {tool_name}'.format(**tool))
        known = known_tools[name]
        return {
            'id': known['id'],
            'tool_name': name,
            'tool_version': '0.0.1',
            'tool_configuration': known['tool_configuration'],
        }

    def content_metadata_missing(self, sha1s):
        """Report no content as missing (yields nothing)."""
        yield from []

    def content_metadata_add(self, metadata, conflict_update=None):
        """Record a content metadata insertion."""
        self.added_data.append(
            ('content_metadata', conflict_update, metadata))

    def revision_metadata_add(self, metadata, conflict_update=None):
        """Record a revision metadata insertion."""
        self.added_data.append(
            ('revision_metadata', conflict_update, metadata))

    def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
        """Record an origin intrinsic metadata insertion."""
        self.added_data.append(
            ('origin_intrinsic_metadata', conflict_update, metadata))

    def content_metadata_get(self, sha1s):
        """Return one canned metadata record, whatever ``sha1s`` is."""
        return [{
            'tool': {
                'configuration': {
                    'type': 'local',
                    'context': 'NpmMapping'
                },
                'version': '0.0.1',
                'id': 6,
                'name': 'swh-metadata-translator'
            },
            'id': b'cde',
            'translated_metadata': {
                '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                'type': 'SoftwareSourceCode',
                'codemeta:issueTracker':
                    'https://github.com/librariesio/yarn-parser/issues',
                'version': '1.0.0',
                'name': 'yarn-parser',
                'schema:author': 'Andrew Nesbitt',
                'url':
                    'https://github.com/librariesio/yarn-parser#readme',
                'processorRequirements': {'node': '7.5'},
                'license': 'AGPL-3.0',
                'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
                'schema:codeRepository':
                    'git+https://github.com/librariesio/yarn-parser.git',
                'description':
                    'Tiny web service for parsing yarn.lock files',
            }
        }]
class BasicMockStorage():
    """In memory implementation to fake the content_get_range api.

    FIXME: To remove when the actual in-memory lands.

    """
    contents = []

    def __init__(self, contents):
        self.contents = contents

    def content_get_range(self, start, end, limit=1000):
        """Return at most ``limit`` contents whose sha1 is in [start, end].

        Contents are scanned in the order they were given; ``next`` is
        the sha1 of the first in-range content left out because of
        ``limit`` (None when nothing was left out).

        Args:
            start (str or bytes): lower bound; bytes are converted to hex
            end (str or bytes): upper bound; bytes are converted to hex
            limit (int): maximum number of contents returned

        Raises:
            ValueError: when ``limit`` is None

        Returns:
            dict with keys ``contents`` (list) and ``next``
        """
        if limit is None:
            raise ValueError('Development error: limit should not be None')
        # to make input test data consilient with actual runtime the
        # other way of doing properly things would be to rewrite all
        # tests (that's another task entirely so not right now)
        if isinstance(start, bytes):
            start = hashutil.hash_to_hex(start)
        if isinstance(end, bytes):
            end = hashutil.hash_to_hex(end)
        results = []
        next_id = None
        for content in self.contents:
            sha1 = content['sha1']
            if not (start <= sha1 <= end):
                continue
            if len(results) >= limit:
                # First match beyond the limit: the next page starts here.
                next_id = sha1
                break
            results.append(content)
        return {
            'contents': results,
            'next': next_id
        }


class BasicMockIndexerStorage():
    """Mock Indexer storage to simplify reading indexers' outputs.

    """
    state = []

    def _internal_add(self, data, conflict_update=None):
        """All content indexer have the same structure. So reuse `data` as
           the same data.  It's either mimetype, language,
           fossology_license, etc...

        """
        self.state = data
        self.conflict_update = conflict_update

    def content_mimetype_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_fossology_license_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_language_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_ctags_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def _internal_get_range(self, start, end,
                            indexer_configuration_id, limit=1000):
        """Same logic as _internal_add, we retrieve indexed data given an
           identifier. So the code here does not change even though
           the underlying data does.

        Returns at most ``limit`` ids that are within [start, end] and
        were indexed by ``indexer_configuration_id``; ``next`` is the
        first matching id left out because of ``limit``, else None.

        Raises:
            ValueError: when ``limit`` is None

        """
        if limit is None:
            raise ValueError('Development error: limit should not be None')
        # to make input test data consilient with actual runtime the
        # other way of doing properly things would be to rewrite all
        # tests (that's another task entirely so not right now)
        if isinstance(start, bytes):
            start = hashutil.hash_to_hex(start)
        if isinstance(end, bytes):
            end = hashutil.hash_to_hex(end)
        results = []
        next_id = None
        for row in self.state:
            _id = row['id']
            _tool_id = row['indexer_configuration_id']
            if not (start <= _id <= end
                    and _tool_id == indexer_configuration_id):
                continue
            if len(results) >= limit:
                # First match beyond the limit: the next page starts here.
                next_id = _id
                break
            results.append(_id)
        return {
            'ids': results,
            'next': next_id
        }

    def content_mimetype_get_range(
            self, start, end, indexer_configuration_id, limit=1000):
        return self._internal_get_range(
            start, end, indexer_configuration_id, limit=limit)

    def content_fossology_license_get_range(
            self, start, end, indexer_configuration_id, limit=1000):
        return self._internal_get_range(
            start, end, indexer_configuration_id, limit=limit)

    def indexer_configuration_add(self, tools):
        return [{
            'id': 10,
        }]


class CommonIndexerNoTool:
    """Mixin to wrongly initialize content indexer"""
    def prepare(self):
        super().prepare()
        self.tools = None
""" Indexer = None RangeIndexer = None def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool fails check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): print('indexer: %s' % self.Indexer) self.Indexer() def test_wrong_unknown_configuration_tool_range(self): """Range Indexer with unknown configuration tool fails check""" if self.RangeIndexer is not None: with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): self.RangeIndexer() class CommonContentIndexerTest: def assert_results_ok(self, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results for indexed_data in actual_results: _id = indexed_data['id'] self.assertEqual(indexed_data, expected_results[_id]) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') actual_results = self.indexer.idx_storage.state self.assertTrue(self.indexer.idx_storage.conflict_update) self.assert_results_ok(actual_results) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') self.assertFalse(self.indexer.idx_storage.conflict_update) self.assert_results_ok(actual_results) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') actual_results = self.indexer.idx_storage.state # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(actual_results, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results for indexed_data in actual_results: _id = indexed_data['id'] self.assertEqual(indexed_data, expected_results[_id]) self.assertTrue(start <= _id and _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(data_indexed)) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ start, end = [self.contents[0], self.contents[2]] # output hex ids # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. 
""" _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no data so same result # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" start, end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results) class NoDiskIndexer: """Mixin to override the DiskIndexer behavior avoiding side-effects in tests. """ def write_to_temp(self, filename, data): # noop return filename def cleanup(self, content_path): # noop return None