diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index 492e7c0..1ad6022 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,155 +1,156 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil from .language import compute_language from .indexer import ContentIndexer, DiskIndexer # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. Args: path: path to the file lang: language for that path (optional) - Returns: - ctags' output + Yields: + dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer, DiskIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def compute_ctags(self, path, lang): """Compute ctags on file at path with language lang. """ return run_ctags(path, lang=lang) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - ctags ([dict]): ctags list of symbols + dict: a dict representing a content_mimetype with keys: + + - **id** (bytes): content's identifier (sha1) + - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) self.cleanup(content_path) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
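# Illustrative sketch, not part of this patch: consuming run_ctags as defined
# above. The file name is made up and the exact symbols depend on the
# universal-ctags build installed.
#
#   for symbol in run_ctags('/tmp/example.c', lang='C'):
#       # e.g. {'name': 'main', 'kind': 'function', 'line': 3, 'lang': 'C'}
#       print(symbol['name'], symbol['kind'], symbol['line'])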
Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 58e341f..60b5523 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,185 +1,191 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer def compute_license(path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: - A dict with the following keys: + dict: A dict with the following keys: + - licenses ([str]): associated detected licenses to path - path (bytes): content filepath """ try: properties = subprocess.check_output(['nomossa', path], universal_newlines=True) if properties: res = properties.rstrip().split(' contains license(s) ') licenses = res[1].split(',') else: licenses = [] return { 'licenses': licenses, 'path': path, } except subprocess.CalledProcessError: if log: from os import path as __path log.exception('Problem during license detection for sha1 %s' % __path.basename(path)) return { 'licenses': [], 'path': path, } class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. See :class:`ContentFossologyLicenseIndexer` and :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.tool = self.tools[0] def compute_license(self, path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: - A dict with the following keys: + dict: A dict with the following keys: + - licenses ([str]): associated detected licenses to path - path (bytes): content filepath """ return compute_license(path, log=log) def index(self, id, data): """Index sha1s' content and store result. 
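# Illustrative sketch only, not part of this patch: the shape of
# compute_license's result, given that it splits nomossa's output on
# ' contains license(s) ' and then on ','. Path and license names are
# made up for the example.
#
#   compute_license('/tmp/foo.c')
#   # nomossa printed: "/tmp/foo.c contains license(s) GPL-2.0,MIT"
#   # -> {'licenses': ['GPL-2.0', 'MIT'], 'path': '/tmp/foo.c'}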
Args: id (bytes): content's identifier raw_content (bytes): associated raw content to content id Returns: - A dict, representing a content_license, with keys: - - id (bytes): content's identifier (sha1) - - license (bytes): license in bytes - - path (bytes): path - - indexer_configuration_id (int): tool used to compute the output + dict: A dict, representing a content_license, with keys: + + - id (bytes): content's identifier (sha1) + - license (bytes): license in bytes + - path (bytes): path + - indexer_configuration_id (int): tool used to compute the output """ content_path = self.write_to_temp( filename=hashutil.hash_to_hex(id), # use the id as pathname data=data) try: properties = self.compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) finally: self.cleanup(content_path) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_license, dict with the - following keys: + following keys: + - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path + policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) class ContentFossologyLicenseIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): """Indexer in charge of: + - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {license, encoding} from that content - store result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class FossologyLicenseRangeIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): """FossologyLicense Range Indexer working on range of content identifiers. - It: - filters out the non textual content - (optionally) filters out content already indexed (cf - :func:`indexed_contents_in_range`) + :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. - Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Returns: - a dict with keys: + dict: a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any + this sha1 if any """ return self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 65946b5..8f7df07 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,610 +1,607 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary - file + file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: - :func:`run`: + :meth:`.run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: - :func:`filter`: + :meth:`.filter`: filter out data already indexed (in storage). - :func:`index_object`: + :meth:`.index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. - :func:`persist_index_computations`: + :meth:`.persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: - :func:`prepare`: + :meth:`.prepare`: Configuration preparation for the indexer. 
When overriding, this must call the `super().prepare()` instruction. - :func:`check`: + :meth:`.check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. - :func:`register_tools`: + :meth:`.register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. """ super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ # HACK to deal with edge case (e.g revision metadata indexer) if not hasattr(self, 'config'): self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) def check(self, *, check_tools=True): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if check_tools and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: - List of dict with additional id key. + list: List of dicts with additional id key. Raises: - ValueError if not a list nor a dict. + ValueError: if not a list nor a dict. """ if isinstance(tools, list): tools = list(map(self._prepare_tool, tools)) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') if tools: return self.idx_storage.indexer_configuration_add(tools) else: return [] @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on - object type + object type Returns: - a dict that makes sense for the persist_index_computations - function. + dict: a dict that makes sense for the + :meth:`.persist_index_computations` method. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the - result of the index function. + result of the index function. 
policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore - them + respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned - by index function. + by index function. task (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus an optional `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` + without `next_run`, plus an optional `result_name` key. Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name', None) task['next_run'] = datetime.datetime.now() if result_name: task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them next_step (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus a `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` + without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. **kwargs: passed to the `index` method """ results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. 
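# Illustrative sketch, not part of this patch: a minimal concrete content
# indexer wiring together the filter/index/persist_index_computations
# contract described above. The content_foo_missing and content_foo_add
# storage endpoints are hypothetical and only show the expected shape.
class FooContentIndexer(ContentIndexer):
    CONFIG_BASE_FILENAME = 'indexer/foo'

    def prepare(self):
        super().prepare()
        self.tool = self.tools[0]

    def filter(self, ids):
        yield from self.idx_storage.content_foo_missing(
            {'id': sha1, 'indexer_configuration_id': self.tool['id']}
            for sha1 in ids)

    def index(self, id, data):
        # a trivial computation standing in for a real indexer
        return {'id': id, 'length': len(data),
                'indexer_configuration_id': self.tool['id']}

    def persist_index_computations(self, results, policy_update):
        self.idx_storage.content_foo_add(
            results, conflict_update=(policy_update == 'update-dups'))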
To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. - Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Yields: - Content identifier (bytes) present in the range [start, end] + bytes: Content identifier present in the range ``[start, end]`` """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier - **indexed** (Set[bytes]): Set of content already indexed. + start (bytes): Starting bound from range identifier + end (bytes): End range identifier + indexed (Set[bytes]): Set of content already indexed. Yields: - Identifier (bytes) of contents to index. + bytes: Identifier of contents to index. """ while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = c['sha1'] if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier - **indexed** (Set[bytes]): Set of content already indexed. + start (bytes): Starting bound from range identifier + end (bytes): End range identifier + indexed (Set[bytes]): Set of content already indexed. Yields: - Data indexed (dict) to persist using the indexer storage + dict: Data indexed to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: yield res def _index_with_skipping_already_done(self, start, end): """Index not already indexed contents in range [start, end]. Args: - **start** (Union[bytes, str]): Starting range identifier - **end** (Union[bytes, str]): Ending range identifier + start (Union[bytes, str]): Starting range identifier + end (Union[bytes, str]): Ending range identifier Yields: - Content identifier (bytes) present in the range [start, - end] which are not already indexed. + bytes: Content identifier present in the range + ``[start, end]`` which are not already indexed. """ while start: indexed_page = self.indexed_contents_in_range(start, end) contents = indexed_page['ids'] _end = contents[-1] if contents else end yield from self._index_contents( start, _end, contents) start = indexed_page['next'] def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). 
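# Illustrative usage sketch (assumption, not part of this patch): driving a
# range indexer over a sha1 interval; hex string bounds are converted to
# bytes by run() itself. MimetypeRangeIndexer is the concrete subclass
# defined in mimetype.py below.
#
#   indexer = MimetypeRangeIndexer()
#   indexed = indexer.run(start='00' * 20, end='ff' * 20, skip_existing=True)
#   # indexed is True if at least one batch of results was persisted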
Args: - **start** (Union[bytes, str]): Starting range identifier - **end** (Union[bytes, str]): Ending range identifier - **skip_existing** (bool): Skip existing indexed data - (default) or not + start (Union[bytes, str]): Starting range identifier + end (Union[bytes, str]): Ending range identifier + skip_existing (bool): Skip existing indexed data + (default) or not **kwargs: passed to the `index` method Returns: - a boolean. True if data was indexed, False otherwise. + bool: True if data was indexed, False otherwise. """ with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: gen = self._index_with_skipping_already_done(start, end) else: gen = self._index_contents(start, end, indexed=[]) for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True except Exception: self.log.exception( 'Problem when computing metadata.') finally: return with_indexed_data class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update='update-dups', parse_ids=True, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or - (type, url) tuples. + (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates (default) - or ignore them + respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus an optional `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` without + `next_run`, plus an optional `result_name` key. parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ if parse_ids: ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origins %s not found in storage' % list(ids)) continue try: res = self.index(origin, **kwargs) if origin: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % id_) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. 
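# Illustrative sketch, not part of this patch: the two id forms accepted by
# OriginIndexer.run above when parse_ids is True, namely numeric origin ids
# and 'type+url' strings (anything containing ':' is split on the first '+').
# The values are made up.
#
#   origin_indexer.run(['12345', 'git+https://example.org/repo.git'],
#                      policy_update='update-dups')
#   # processed as [12345, ['git', 'https://example.org/repo.git']]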
""" def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore - them + respectively update duplicates or ignore them """ results = [] ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 5ac61ec..7d0e3a4 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,209 +1,209 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io from pygments.lexers import guess_lexer from pygments.util import ClassNotFound from chardet.universaldetector import UniversalDetector from .indexer import ContentIndexer def _cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') def _read_raw(raw_content, size=2048): """Read raw content in chunk. """ bs = io.BytesIO(raw_content) while True: chunk = bs.read(size) if not chunk: break yield chunk def _detect_encoding(raw_content): """Given a raw content, try and detect its encoding. """ detector = UniversalDetector() for chunk in _read_raw(raw_content): detector.feed(chunk) if detector.done: break detector.close() return detector.result['encoding'] def compute_language_from_chunk(encoding, length, raw_content, max_size, log=None): """Determine the raw content's language. Args: encoding (str): Encoding to use to decode the content length (int): raw_content's length raw_content (bytes): raw content to work with max_size (int): max size to split the raw content at Returns: - Dict with keys: - - lang: None if nothing found or the possible language + dict: Dict with keys: + - **lang**: None if nothing found or the possible language """ try: if max_size <= length: raw_content = raw_content[0:max_size] content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except UnicodeDecodeError: raise except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } def compute_language(raw_content, encoding=None, log=None): """Determine the raw content's language. 
Args: raw_content (bytes): raw content to work with Returns: - Dict with keys: - - lang: None if nothing found or the possible language + dict: Dict with keys: + - **lang**: None if nothing found or the possible language """ try: encoding = _detect_encoding(raw_content) content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } class ContentLanguageIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ CONFIG_BASE_FILENAME = 'indexer/language' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, }), } def prepare(self): super().prepare() c = self.config self.max_content_size = c['tools']['configuration']['max_content_size'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_language_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'] } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language + dict: Dict that represents a content_mimetype, with keys: + - id (bytes): content's identifier (sha1) + - lang (bytes): detected language """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'lang': None, } encoding = _detect_encoding(data) if not encoding: return result _len = len(data) for i in range(0, 9): max_size = self.max_content_size + i try: result = compute_language_from_chunk( encoding, _len, data, max_size, log=self.log) except UnicodeDecodeError: self.log.warning( 'Decoding failed on wrong byte chunk at [0-%s]' ', trying again at next ending byte.' % max_size) continue # we found something, so we return it result.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) break return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_language_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 84827fc..4549305 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,337 +1,336 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import itertools import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' def __init__(self, tool, config): # FIXME: Simplify this twisted way to use the exact same # config of RevisionMetadataIndexer object that uses # internally ContentMetadataIndexer self.config = config self.config['tools'] = tool self.results = [] super().__init__() self.tool = self.tools[0] # Tool is now registered (cf. prepare call) def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] result['translated_metadata'] = MAPPINGS[mapping_name] \ .translate(data) except Exception: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/revision_metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } ContentMetadataIndexer = ContentMetadataIndexer def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - - id (str): rev's identifier (sha1_git) - - indexer_configuration_id (bytes): tool used - - translated_metadata: dict of retrieved metadata + - id (str): rev's identifier (sha1_git) + - indexer_configuration_id (bytes): tool used + - translated_metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = self.ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata class OriginMetadataIndexer(OriginIndexer): CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' ADDITIONAL_CONFIG = { 'tools': ('list', []) } def check(self, **kwargs): kwargs['check_tools'] = False super().check(**kwargs) def filter(self, ids): return ids def run(self, origin_head, policy_update): """Expected to be called with the result of RevisionMetadataIndexer as first argument; ie. not a list of ids as other indexers would. Args: - - * `origin_head` (dict): {str(origin_id): rev_id} + origin_head (dict): {str(origin_id): rev_id} keys `origin_id` and `revision_id`, which is the result of OriginHeadIndexer. - * `policy_update`: `'ignore-dups'` or `'update-dups'` + policy_update (str): `'ignore-dups'` or `'update-dups'` """ origin_head_map = {int(origin_id): hashutil.hash_to_bytes(rev_id) for (origin_id, rev_id) in origin_head.items()} # Fix up the argument order. revisions_metadata has to be the # first argument because of celery.chain; the next line calls # run() with the usual order, ie. origin ids first. 
return super().run(ids=list(origin_head_map), policy_update=policy_update, parse_ids=False, origin_head_map=origin_head_map) def index(self, origin, *, origin_head_map): # Get the last revision of the origin. revision_id = origin_head_map[origin['id']] revision_metadata = self.idx_storage \ .revision_metadata_get([revision_id]) results = [] for item in revision_metadata: assert item['id'] == revision_id # Get the metadata of that revision, and return it results.append({ 'origin_id': origin['id'], 'metadata': item['translated_metadata'], 'from_revision': revision_id, 'indexer_configuration_id': item['tool']['id'], }) return results def persist_index_computations(self, results, policy_update): self.idx_storage.origin_intrinsic_metadata_add( list(itertools.chain(*results)), conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--revs', '-i', help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py index 629974a..fb7fc3f 100644 --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -1,60 +1,62 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.indexer.codemeta import compact, expand from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS def detect_metadata(files): """ Detects files potentially containing metadata + Args: - - file_entries (list): list of files + file_entries (list): list of files Returns: - - empty list if nothing was found - - dictionary {mapping_filenames[name]:f['sha1']} + dict: {mapping_filenames[name]:f['sha1']} (may be empty) """ results = {} for (mapping_name, mapping) in MAPPINGS.items(): matches = mapping.detect_metadata_files(files) if matches: results[mapping_name] = matches return results _MINIMAL_PROPERTY_SET = { "developmentStatus", "version", "operatingSystem", "description", "keywords", "issueTracker", "name", "author", "relatedLink", "url", "license", "maintainer", "email", "identifier", "codeRepository"} MINIMAL_METADATA_SET = {make_absolute_uri(prop) for prop in _MINIMAL_PROPERTY_SET} def extract_minimal_metadata_dict(metadata_list): """ Every item in the metadata_list is a dict of translated_metadata in the - CodeMeta vocabulary - we wish to extract a minimal set of terms and keep all values corresponding - to this term without duplication + CodeMeta vocabulary. + + We wish to extract a minimal set of terms and keep all values corresponding + to this term without duplication. 
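# Hedged illustration, not part of this patch: the intended effect of the
# extraction described above on made-up CodeMeta input. Terms outside
# MINIMAL_METADATA_SET are dropped and repeated values are kept only once.
#
#   extract_minimal_metadata_dict([
#       {'name': 'foo', 'datePublished': '2018-01-01'},
#       {'name': 'foo', 'license': 'https://spdx.org/licenses/MIT'},
#   ])
#   # -> roughly {'name': 'foo', 'license': 'https://spdx.org/licenses/MIT'}
#   # (exact form depends on the codemeta compact/expand helpers)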
+ Args: - - metadata_list (list): list of dicts of translated_metadata + metadata_list (list): list of dicts of translated_metadata Returns: - - minimal_dict (dict): one dict with selected values of metadata + dict: minimal_dict; dict with selected values of metadata """ minimal_dict = {} for document in metadata_list: for metadata_item in expand(document): for (term, value) in metadata_item.items(): if term in MINIMAL_METADATA_SET: if term not in minimal_dict: minimal_dict[term] = [value] elif value not in minimal_dict[term]: minimal_dict[term].append(value) return compact(minimal_dict) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index b8e01b9..7e71140 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,284 +1,284 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import re import abc import json import logging import xmltodict from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.codemeta import compact, expand MAPPINGS = {} def register_mapping(cls): MAPPINGS[cls.__name__] = cls() return cls class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self): self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, self.__class__.__name__)) @abc.abstractmethod def detect_metadata_files(self, files): """ Detects files potentially containing metadata + Args: - - file_entries (list): list of files + file_entries (list): list of files Returns: - - empty list if nothing was found - - list of sha1 otherwise + list: list of sha1 (possibly empty) """ pass @abc.abstractmethod def translate(self, file_content): pass def normalize_translation(self, metadata): return compact(metadata) class SingleFileMapping(BaseMapping): """Base class for all mappings that use a single file as input.""" @property @abc.abstractmethod def filename(self): """The .json file to extract metadata from.""" pass def detect_metadata_files(self, file_entries): for entry in file_entries: if entry['name'] == self.filename: return [entry['sha1']] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. 
a shallow JSON dict).""" @property @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" pass def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: - content_dict (dict) + content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for - the indexer + the indexer """ translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key translation_method = getattr(self, 'translate_' + k, None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table # if there is a normalization method, use it on the value normalization_method = getattr(self, 'normalize_' + k, None) if normalization_method: v = normalization_method(v) # set the translation metadata with the normalized value translated_metadata[self.mapping[k]] = v if normalize: return self.normalize_translation(translated_metadata) else: return translated_metadata class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: - raw_content: bytes + raw_content (bytes): raw content to translate Returns: dict: translated metadata in json-friendly form needed for - the indexer + the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning('Error unidecoding %r', raw_content) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning('Error unjsoning %r' % raw_content) return return self.translate_dict(content_dict) @register_mapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' _schema_shortcuts = { 'github': 'https://github.com/', 'gist': 'https://gist.github.com/', 'bitbucket': 'https://bitbucket.org/', 'gitlab': 'https://gitlab.com/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository""" if isinstance(d, dict): return '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: return d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: return self._schema_shortcuts[schema] + rest else: return None else: return self._schema_shortcuts['github'] + d else: return None def normalize_bugs(self, d): return '{url}'.format(**d) _parse_author = re.compile(r'^ *' r'(?P<name>.*?)' r'( +<(?P<email>.*)>)?' r'( +\((?P<url>.*)\))?' 
r' *$') def normalize_author(self, d): 'https://docs.npmjs.com/files/package.json' \ '#people-fields-author-contributors' author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name: author[SCHEMA_URI+'name'] = name if email: author[SCHEMA_URI+'email'] = email if url: author[SCHEMA_URI+'url'] = url return author @register_mapping class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ filename = b'codemeta.json' def translate(self, content): return self.normalize_translation(expand(json.loads(content.decode()))) @register_mapping class MavenMapping(DictMapping, SingleFileMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] def translate(self, content): d = xmltodict.parse(content)['project'] metadata = self.translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) return self.normalize_translation(metadata) _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories""" if 'repositories' not in d: return [self.parse_repository(d, self._default_repository)] else: repositories = d['repositories'].get('repository', []) if not isinstance(repositories, list): repositories = [repositories] results = [] for repo in repositories: res = self.parse_repository(d, repo) if res: results.append(res) return results def parse_repository(self, d, repo): if repo.get('layout', 'default') != 'default': return # TODO ? url = repo['url'] if d['groupId']: url = os.path.join(url, *d['groupId'].split('.')) if d['artifactId']: url = os.path.join(url, d['artifactId']) return url def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = MAPPINGS["NpmMapping"].translate(raw_content) result1 = MAPPINGS["MavenMapping"].translate(raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 1877644..b058cb0 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,153 +1,155 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import magic from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: - A dict with mimetype and encoding key and corresponding values + dict: mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type, 'encoding': r.encoding, } class MixinMimetypeIndexer: """Mixin mimetype indexer. 
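# Hedged sketch, not part of this patch: the shape of
# compute_mimetype_encoding's result via magic.detect_from_content; the
# exact values depend on the libmagic database in use.
#
#   compute_mimetype_encoding(b'#!/bin/sh\necho hello\n')
#   # -> {'mimetype': 'text/x-shellscript', 'encoding': 'us-ascii'}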
See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() self.tool = self.tools[0] def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: + dict: content's mimetype; dict keys being - - id (bytes): content's identifier (sha1) - - mimetype (bytes): mimetype in bytes - - encoding (bytes): encoding in bytes + - **id** (bytes): content's identifier (sha1) + - **mimetype** (bytes): mimetype in bytes + - **encoding** (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - - id (bytes): content's identifier (sha1) - - mimetype (bytes): mimetype in bytes - - encoding (bytes): encoding in bytes + results ([dict]): list of content's mimetype dicts + (see :meth:`.index`) policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): """Mimetype Indexer working on list of content identifiers. It: - - (optionally) filters out content already indexed (cf. :callable:`filter`) + + - (optionally) filters out content already indexed (cf. + :meth:`.filter`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage FIXME: - - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer - - 2. Do we keep it afterwards? ~> i think this can be used with the journal + + 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer + 2. Do we keep it afterwards? ~> i think this can be used with the journal """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): """Mimetype Range Indexer working on range of content identifiers. It: - - (optionally) filters out content already indexed (cf :callable:`range`) + + - (optionally) filters out content already indexed (cf + :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. 
- Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Returns: - a dict with keys: + dict: a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any + this sha1 if any """ return self.idx_storage.content_mimetype_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py index d2697e0..b01b326 100644 --- a/swh/indexer/rehash.py +++ b/swh/indexer/rehash.py @@ -1,172 +1,172 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import itertools from collections import defaultdict from swh.core import utils from swh.core.config import SWHConfig from swh.model import hashutil from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.storage import get_storage class RecomputeChecksums(SWHConfig): """Class in charge of (re)computing content's hashes. Hashes to compute are defined across 2 configuration options: compute_checksums ([str]) list of hash algorithms that py:func:`swh.model.hashutil.MultiHash.from_data` function should be able to deal with. For variable-length checksums, a desired checksum length should also be provided. Their format is : e.g: blake2:512 recompute_checksums (bool) a boolean to notify that we also want to recompute potential existing hashes specified in compute_checksums. Default to False. """ DEFAULT_CONFIG = { # The storage to read from or update metadata to 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/' }, }), # The objstorage to read contents' data from 'objstorage': ('dict', { 'cls': 'pathslicing', 'args': { 'root': '/srv/softwareheritage/objects', 'slicing': '0:2/2:4/4:6', }, }), # the set of checksums that should be computed. # Examples: 'sha1_git', 'blake2b512', 'blake2s256' 'compute_checksums': ( 'list[str]', []), # whether checksums that already exist in the DB should be # recomputed/updated or left untouched 'recompute_checksums': ('bool', False), # Number of contents to retrieve blobs at the same time 'batch_size_retrieve_content': ('int', 10), # Number of contents to update at the same time 'batch_size_update': ('int', 100), } CONFIG_BASE_FILENAME = 'indexer/rehash' def __init__(self): self.config = self.parse_config_file() self.storage = get_storage(**self.config['storage']) self.objstorage = get_objstorage(**self.config['objstorage']) self.compute_checksums = self.config['compute_checksums'] self.recompute_checksums = self.config[ 'recompute_checksums'] self.batch_size_retrieve_content = self.config[ 'batch_size_retrieve_content'] self.batch_size_update = self.config[ 'batch_size_update'] self.log = logging.getLogger('swh.indexer.rehash') if not self.compute_checksums: raise ValueError('Checksums list should not be empty.') def _read_content_ids(self, contents): """Read the content identifiers from the contents. """ for c in contents: h = c['sha1'] if isinstance(h, str): h = hashutil.hash_to_bytes(h) yield h def get_new_contents_metadata(self, all_contents): """Retrieve raw contents and compute new checksums on the contents. Unknown or corrupted contents are skipped. 
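# Illustrative sketch, not part of this patch: the MultiHash call used
# further down to recompute checksums for one raw content; 'sha256' and
# 'blake2s256' stand in for whatever compute_checksums is configured with.
#
#   from swh.model import hashutil
#   new_hashes = hashutil.MultiHash.from_data(
#       raw_content, hash_names=['sha256', 'blake2s256']).digest()
#   # new_hashes maps each requested algorithm name to its digest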
Args: all_contents ([dict]): List of contents as dictionary with - the necessary primary keys + the necessary primary keys checksum_algorithms ([str]): List of checksums to compute Yields: - tuple of: content to update, list of checksums computed + tuple: tuple of (content to update, list of checksums computed) """ content_ids = self._read_content_ids(all_contents) for contents in utils.grouper(content_ids, self.batch_size_retrieve_content): contents_iter = itertools.tee(contents, 2) try: content_metadata = self.storage.content_get_metadata( [s for s in contents_iter[0]]) except Exception: self.log.exception( 'Problem when reading contents metadata.') continue for content in content_metadata: if self.recompute_checksums: # Recompute checksums provided # in compute_checksums options checksums_to_compute = list(self.compute_checksums) else: # Compute checksums provided in compute_checksums # options not already defined for that content checksums_to_compute = [h for h in self.compute_checksums if not content.get(h)] if not checksums_to_compute: # Nothing to recompute continue try: raw_content = self.objstorage.get(content['sha1']) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage!' % content['sha1']) continue content_hashes = hashutil.MultiHash.from_data( raw_content, hash_names=checksums_to_compute).digest() content.update(content_hashes) yield content, checksums_to_compute def run(self, contents): """Given a list of content: - (re)compute a given set of checksums on contents available in our object storage - update those contents with the new metadata Args: contents (dict): contents as dictionary with necessary keys. key present in such dictionary should be the ones defined in the 'primary_key' option. """ for data in utils.grouper( self.get_new_contents_metadata(contents), self.batch_size_update): groups = defaultdict(list) for content, keys_to_update in data: keys = ','.join(keys_to_update) groups[keys].append(content) for keys_to_update, contents in groups.items(): keys = keys_to_update.split(',') try: self.storage.content_update(contents, keys=keys) except Exception: self.log.exception('Problem during update.') continue diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py index 65859fc..177dd53 100644 --- a/swh/indexer/storage/converters.py +++ b/swh/indexer/storage/converters.py @@ -1,138 +1,140 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def ctags_to_db(ctags): """Convert a ctags entry into a ready ctags entry. 
Args: ctags (dict): ctags entry with the following keys: - id (bytes): content's identifier - tool_id (int): tool id used to compute ctags - ctags ([dict]): List of dictionary with the following keys: - name (str): symbol's name - kind (str): symbol's kind - line (int): symbol's line in the content - language (str): language Returns: list: list of ctags entries as dicts with the following keys: - - id (bytes): content's identifier - - name (str): symbol's name - - kind (str): symbol's kind - - language (str): language for that content - - tool_id (int): tool id used to compute ctags + - id (bytes): content's identifier + - name (str): symbol's name + - kind (str): symbol's kind + - line (int): symbol's line in the content + - lang (str): language for that content + - indexer_configuration_id (int): tool id used to compute ctags """ id = ctags['id'] tool_id = ctags['indexer_configuration_id'] for ctag in ctags['ctags']: yield { 'id': id, 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'indexer_configuration_id': tool_id, } def db_to_ctags(ctag): """Convert a ctags entry into a ready ctags entry. Args: ctags (dict): ctags entry with the following keys: - - id (bytes): content's identifier - - ctags ([dict]): List of dictionary with the following keys: - - name (str): symbol's name - - kind (str): symbol's kind - - line (int): symbol's line in the content - - language (str): language + + - id (bytes): content's identifier + - name (str): symbol's name + - kind (str): symbol's kind + - line (int): symbol's line in the content + - lang (str): language + - tool_id (int): tool id used to compute the ctags + - tool_name (str): tool's name + - tool_version (str): tool's version + - tool_configuration (dict): tool's configuration Returns: - List of ctags ready entry (dict with the following keys): + dict: a ready ctags entry with the following keys: + - id (bytes): content's identifier - name (str): symbol's name - kind (str): symbol's kind - language (str): language for that content - tool (dict): tool used to compute the ctags """ return { 'id': ctag['id'], 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'tool': { 'id': ctag['tool_id'], 'name': ctag['tool_name'], 'version': ctag['tool_version'], 'configuration': ctag['tool_configuration'] } } def db_to_mimetype(mimetype): """Convert a ctags entry into a ready ctags output. """ return { 'id': mimetype['id'], 'encoding': mimetype['encoding'], 'mimetype': mimetype['mimetype'], 'tool': { 'id': mimetype['tool_id'], 'name': mimetype['tool_name'], 'version': mimetype['tool_version'], 'configuration': mimetype['tool_configuration'] } } def db_to_language(language): """Convert a language entry into a ready language output. """ return { 'id': language['id'], 'lang': language['lang'], 'tool': { 'id': language['tool_id'], 'name': language['tool_name'], 'version': language['tool_version'], 'configuration': language['tool_configuration'] } } def db_to_metadata(metadata): """Convert a metadata entry into a ready metadata output.
""" metadata['tool'] = { 'id': metadata['tool_id'], 'name': metadata['tool_name'], 'version': metadata['tool_version'], 'configuration': metadata['tool_configuration'] } del metadata['tool_id'], metadata['tool_configuration'] del metadata['tool_version'], metadata['tool_name'] return metadata def db_to_fossology_license(license): return { 'licenses': license['licenses'], 'tool': { 'id': license['tool_id'], 'name': license['tool_name'], 'version': license['tool_version'], 'configuration': license['tool_configuration'], } } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py index 0fea30c..68ee6c9 100644 --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,396 +1,397 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] def _missing_from_list(self, table, data, hash_keys, cur=None): """Read from table the data with hash_keys that are missing. Args: table (str): Table name (e.g content_mimetype, content_language, - etc...) + etc...) data (dict): Dict of data to read from hash_keys ([str]): List of keys to read in the data dict. Yields: The data which is missing from the db. """ cur = self._cursor(cur) keys = ', '.join(hash_keys) equality = ' AND '.join( ('t.%s = c.%s' % (key, key)) for key in hash_keys ) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(%s) where not exists ( select 1 from %s c where %s ) """ % (keys, keys, table, equality), (tuple(m[k] for k in hash_keys) for m in data) ) def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ yield from self._missing_from_list( 'content_mimetype', mimetypes, self.content_mimetype_hash_keys, cur=cur) content_mimetype_cols = [ 'id', 'mimetype', 'encoding', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass def content_mimetype_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) def _convert_key(self, key, main_table='c'): """Convert keys according to specific use in the module. + Args: key (str): Key expression to change according to the alias - used in the query + used in the query main_table (str): Alias to use for the main table. Default - to c for content_{something}. + to c for content_{something}. Expected: Tables content_{something} being aliased as 'c' (something in {language, mimetype, ...}), table indexer_configuration being aliased as 'i'. """ if key == 'id': return '%s.id' % main_table elif key == 'tool_id': return 'i.id as tool_id' elif key == 'licenses': return ''' array(select name from fossology_license where id = ANY( array_agg(%s.license_id))) as licenses''' % main_table return key def _get_from_list(self, table, ids, cols, cur=None, id_col='id'): """Fetches entries from the `table` such that their `id` field (or whatever is given to `id_col`) is in `ids`. Returns the columns `cols`. 
The `cur`sor is used to connect to the database. """ cur = self._cursor(cur) keys = map(self._convert_key, cols) query = """ select {keys} from (values %s) as t(id) inner join {table} c on c.{id_col}=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id; """.format( keys=', '.join(keys), id_col=id_col, table=table) yield from execute_values_to_bytes( cur, query, ((_id,) for _id in ids) ) content_indexer_names = { 'mimetype': 'content_mimetype', 'fossology_license': 'content_fossology_license', } def content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, cur=None): """Retrieve contents with content_type, within range [start, end] bound by limit and associated to the given indexer configuration id. When asking to work on textual content, that filters on the mimetype table with any mimetype that is not binary. """ cur = self._cursor(cur) table = self.content_indexer_names[content_type] if with_textual_data: extra = """inner join content_mimetype cm on (t.id=cm.id and cm.mimetype like 'text/%%')""" else: extra = "" query = """select t.id from %s t inner join indexer_configuration ic on t.indexer_configuration_id=ic.id %s where ic.id=%%s and %%s <= t.id and t.id <= %%s order by t.indexer_configuration_id, t.id limit %%s""" % (table, extra) cur.execute(query, (indexer_configuration_id, start, end, limit)) yield from cursor_to_bytes(cur) def content_mimetype_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_mimetype', ids, self.content_mimetype_cols, cur=cur) content_language_hash_keys = ['id', 'indexer_configuration_id'] def content_language_missing_from_list(self, languages, cur=None): """List missing languages. """ yield from self._missing_from_list( 'content_language', languages, self.content_language_hash_keys, cur=cur) content_language_cols = [ 'id', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) def content_language_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_language', ids, self.content_language_cols, cur=cur) content_ctags_hash_keys = ['id', 'indexer_configuration_id'] def content_ctags_missing_from_list(self, ctags, cur=None): """List missing ctags. 
""" yield from self._missing_from_list( 'content_ctags', ctags, self.content_ctags_hash_keys, cur=cur) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) def content_ctags_get_from_list(self, ids, cur=None): cur = self._cursor(cur) keys = map(self._convert_key, self.content_ctags_cols) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(id) inner join content_ctags c on c.id=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id order by line """ % ', '.join(keys), ((_id,) for _id in ids) ) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) if not last_sha1: query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit)) else: if last_sha1 and isinstance(last_sha1, bytes): last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) elif last_sha1: last_sha1 = '\\x%s' % last_sha1 query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit, last_sha1)) yield from cursor_to_bytes(cur) content_fossology_license_cols = [ 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', 'licenses'] @stored_procedure('swh_mktemp_content_fossology_license') def mktemp_content_fossology_license(self, cur=None): pass def content_fossology_license_add_from_temp(self, conflict_update, cur=None): """Add new licenses per content. """ self._cursor(cur).execute( "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) def content_fossology_license_get_from_list(self, ids, cur=None): """Retrieve licenses per id. """ cur = self._cursor(cur) keys = map(self._convert_key, self.content_fossology_license_cols) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(id) inner join content_fossology_license c on t.id=c.id inner join indexer_configuration i on i.id=c.indexer_configuration_id group by c.id, i.id, i.tool_name, i.tool_version, i.tool_configuration; """ % ', '.join(keys), ((_id,) for _id in ids) ) content_metadata_hash_keys = ['id', 'indexer_configuration_id'] def content_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. """ yield from self._missing_from_list( 'content_metadata', metadata, self.content_metadata_hash_keys, cur=cur) content_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) def content_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] def revision_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. 
""" yield from self._missing_from_list( 'revision_metadata', metadata, self.revision_metadata_hash_keys, cur=cur) revision_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) def revision_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ 'origin_id', 'metadata', 'from_revision', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' """The dictionary used to normalize 'metadata' and queries. 'pg_catalog.simple' provides no stopword, so it should be suitable for proper names and non-English content. When updating this value, make sure to add a new index on origin_intrinsic_metadata.metadata.""" @stored_procedure('swh_mktemp_origin_intrinsic_metadata') def mktemp_origin_intrinsic_metadata(self, cur=None): pass def origin_intrinsic_metadata_add_from_temp( self, conflict_update, cur=None): cur = self._cursor(cur) cur.execute( "SELECT swh_origin_intrinsic_metadata_add(%s)", (conflict_update, )) def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None): yield from self._get_from_list( 'origin_intrinsic_metadata', orig_ids, self.origin_intrinsic_metadata_cols, cur=cur, id_col='origin_id') def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur=None): regconfig = self.origin_intrinsic_metadata_regconfig tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig for _ in terms) tsquery_args = [(term,) for term in terms] keys = map(self._convert_key, self.origin_intrinsic_metadata_cols) query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim " "INNER JOIN indexer_configuration AS i " "ON oim.indexer_configuration_id=i.id " "JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true " "WHERE to_tsvector('{regconfig}', metadata) @@ tsq " "ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC " "LIMIT %s;" ).format(keys=', '.join(keys), regconfig=regconfig, tsquery_template=tsquery_template) cur.execute(query, tsquery_args + [limit]) yield from cursor_to_bytes(cur) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_indexer_configuration') def mktemp_indexer_configuration(self, cur=None): pass def indexer_configuration_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( ','.join(self.indexer_configuration_cols), )) yield from cursor_to_bytes(cur) def indexer_configuration_get(self, tool_name, tool_version, tool_configuration, cur=None): cur = self._cursor(cur) cur.execute('''select %s from indexer_configuration where tool_name=%%s and tool_version=%%s and tool_configuration=%%s''' % ( ','.join(self.indexer_configuration_cols)), (tool_name, tool_version, tool_configuration)) data = cur.fetchone() if not data: return None return line_to_bytes(data) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index 1398330..715ecb9 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,249 +1,249 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at 
the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import json class MetadataStorage: """Implements missing/get/add logic for both content_metadata and revision_metadata.""" def __init__(self, tools): self._tools = tools self._metadata = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] def _transform_tool(self, tool): return { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } def missing(self, ids): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): """Retrieve metadata per id. Args: ids (iterable): sha1 checksums Yields: - dictionaries with the following keys: + dict: dictionaries with the following keys: - id (bytes) - translated_metadata (str): associated metadata - tool (dict): tool used to compute metadata + - **id** (bytes) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, 'tool': self._transform_tool(self._tools[tool_id]), 'translated_metadata': self._metadata[key], } def add(self, metadata, conflict_update): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - - **id**: sha1 - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results + - **id**: sha1 + - **translated_metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute the + results - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false) + conflict_update (bool): Flag to determine if we want to overwrite + (true) or skip duplicates (false) """ for item in metadata: tool_id = item['indexer_configuration_id'] data = item['translated_metadata'] id_ = item['id'] if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) self._metadata[key] = data self._tools_per_id[id_].add(tool_id) class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} self._content_metadata = MetadataStorage(self._tools) self._revision_metadata = MetadataStorage(self._tools) def content_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute + the results Yields: missing sha1s """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): """Retrieve metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - id (bytes) - translated_metadata (str): associated metadata - tool (dict): tool used to compute metadata + - **id** (bytes) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - - **id**: sha1 - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results + - **id**: sha1 + - **translated_metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute the + results conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._content_metadata.add(metadata, conflict_update) def revision_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results + - **id** (bytes): sha1_git revision identifier + - **indexer_configuration_id** (int): tool used to compute + the results Yields: missing ids """ yield from self._revision_metadata.missing(metadata) def revision_metadata_get(self, ids): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - - **id** (bytes) - - **translated_metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata + - **id** (bytes) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata """ yield from self._revision_metadata.get(ids) def revision_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata + - **id**: sha1_git of revision + - **translated_metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ self._revision_metadata.add(metadata, conflict_update) def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to - insert in the db. Dictionary with the following keys: + insert in the db. Dictionary with the following keys: - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) + - **tool_name** (str): tool's name + - **tool_version** (str): tool's version + - **tool_configuration** (dict): tool's configuration + (free form dict) Returns: - List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match + list: List of dict inserted in the db (holding the id key as + well). The order of the list is not guaranteed to match the order of the initial list. """ inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): """Retrieve tool information. 
Args: tool (dict): Dictionary representing a tool with the - following keys: + following keys: - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) + - **tool_name** (str): tool's name + - **tool_version** (str): tool's version + - **tool_configuration** (dict): tool's configuration + (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return (tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True))
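
Note (not part of the patch): the following is an illustrative sketch of how the in-memory IndexerStorage from swh/indexer/storage/in_memory.py might be exercised, for instance in a unit test, using only the methods whose signatures appear in the diff above. The tool name, sha1 value and metadata payload are placeholder values chosen for the example, and the import path is assumed from the file location shown in the diff header.

from swh.indexer.storage.in_memory import IndexerStorage

storage = IndexerStorage()

# Register a tool; returned entries carry a generated 'id' key.
tool = storage.indexer_configuration_add([{
    'tool_name': 'example-metadata-tool',  # placeholder name
    'tool_version': '0.0.1',               # placeholder version
    'tool_configuration': {},
}])[0]

sha1 = b'\x01' * 20  # made-up content identifier

# Before anything is added, the id is reported as missing for that tool.
missing = list(storage.content_metadata_missing([
    {'id': sha1, 'indexer_configuration_id': tool['id']},
]))
assert missing == [sha1]

# Add one metadata entry, then read it back.
storage.content_metadata_add([{
    'id': sha1,
    'translated_metadata': {'name': 'example'},
    'indexer_configuration_id': tool['id'],
}])
result = list(storage.content_metadata_get([sha1]))[0]
assert result['translated_metadata'] == {'name': 'example'}
assert result['tool']['name'] == 'example-metadata-tool'

The same add/get/missing pattern applies to the revision_metadata_* methods, since both families are backed by the shared MetadataStorage helper.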