diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index 1ad6022..b37732e 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,156 +1,149 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil from .language import compute_language from .indexer import ContentIndexer, DiskIndexer # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. Args: path: path to the file lang: language for that path (optional) Yields: dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer, DiskIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... 
}) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] - self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) - def compute_ctags(self, path, lang): - """Compute ctags on file at path with language lang. - - """ - return run_ctags(path, lang=lang) - def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: a dict representing a content_mimetype with keys: - **id** (bytes): content's identifier (sha1) - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) self.cleanup(content_path) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 3a5cefb..0860a40 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,192 +1,176 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer def compute_license(path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: dict: A dict with the following keys: - licenses ([str]): associated detected licenses to path - path (bytes): content filepath """ try: properties = subprocess.check_output(['nomossa', path], universal_newlines=True) if properties: res = properties.rstrip().split(' contains license(s) ') licenses = res[1].split(',') else: licenses = [] return { 'licenses': licenses, 'path': path, } except subprocess.CalledProcessError: if log: from os import path as __path log.exception('Problem during license detection for sha1 %s' % __path.basename(path)) return { 'licenses': [], 'path': path, } class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. 
See :class:`FossologyLicenseIndexer` and :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] - self.tool = self.tools[0] - - def compute_license(self, path, log=None): - """Determine license from file at path. - - Args: - path: filepath to determine the license - - Returns: - dict: A dict with the following keys: - - - licenses ([str]): associated detected licenses to path - - path (bytes): content filepath - - """ - return compute_license(path, log=log) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier raw_content (bytes): associated raw content to content id Returns: dict: A dict, representing a content_license, with keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path - indexer_configuration_id (int): tool used to compute the output """ assert isinstance(id, bytes) content_path = self.write_to_temp( filename=hashutil.hash_to_hex(id), # use the id as pathname data=data) try: - properties = self.compute_license(path=content_path, log=self.log) + properties = compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) finally: self.cleanup(content_path) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_license, dict with the following keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) class FossologyLicenseIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {license, encoding} from that content - store result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class FossologyLicenseRangeIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): """FossologyLicense Range Indexer working on range of content identifiers. - filters out the non textual content - (optionally) filters out content already indexed (cf :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Returns: dict: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 201924f..766d278 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,621 +1,637 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import ast import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. 
Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: :meth:`~BaseIndexer.run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: :meth:`~BaseIndexer.filter`: filter out data already indexed (in storage). :meth:`~BaseIndexer.index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :meth:`~BaseIndexer.persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :meth:`~BaseIndexer.prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :meth:`~BaseIndexer.check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :meth:`~BaseIndexer.register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. 
""" CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} - def __init__(self): + USE_TOOLS = True + + def __init__(self, config=None, **kw): """Prepare and check that the indexer is ready to run. """ super().__init__() + if config is not None: + self.config = config + else: + config_keys = ('base_filename', 'config_filename', + 'additional_configs', 'global_config') + config_args = {k: v for k, v in kw.items() if k in config_keys} + self.config = self.parse_config_file(**config_args) self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ - # HACK to deal with edge case (e.g revision metadata indexer) - if not hasattr(self, 'config'): - self.config = self.parse_config_file( - additional_configs=[self.ADDITIONAL_CONFIG]) config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) + objstorage = self.config['objstorage'] - self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) + self.objstorage = get_objstorage(objstorage['cls'], + objstorage['args']) + idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') - self.tools = list(self.register_tools(self.config['tools'])) - def check(self, *, check_tools=True): + if self.USE_TOOLS: + self.tools = list(self.register_tools( + self.config.get('tools', []))) + self.results = [] + + @property + def tool(self): + return self.tools[0] + + def check(self): """Check the indexer's configuration is ok 
before proceeding. If ok, does nothing. If not raise error. """ - if check_tools and not self.tools: + if self.USE_TOOLS and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: list: List of dicts with additional id key. Raises: ValueError: if not a list nor a dict. """ if isinstance(tools, list): tools = list(map(self._prepare_tool, tools)) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') if tools: return self.idx_storage.indexer_configuration_add(tools) else: return [] @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: dict: a dict that makes sense for the :meth:`.persist_index_computations` method. """ pass + def filter(self, ids): + """Filter missing ids for that particular indexer. + + Args: + ids ([bytes]): list of ids + + Yields: + iterator of missing ids + + """ + yield from ids + @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. 
policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. task (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name', None) task['next_run'] = datetime.datetime.now() if result_name: task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ - @abc.abstractmethod - def filter(self, ids): - """Filter missing ids for that particular indexer. 
- - Args: - ids ([bytes]): list of ids - - Yields: - iterator of missing ids - - """ - pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids (Iterable[Union[bytes, str]]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. **kwargs: passed to the `index` method """ ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. 
Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: bytes: Identifier of contents to index. """ if not isinstance(start, bytes) or not isinstance(end, bytes): raise TypeError('identifiers must be bytes, not %r and %r.' % (start, end)) while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: _id = hashutil.hash_to_bytes(c['sha1']) if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier indexed (Set[bytes]): Set of content already indexed. Yields: dict: Data indexed to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: if not isinstance(res['id'], bytes): raise TypeError( '%r.index should return ids as bytes, not %r' % (self.__class__.__name__, res['id'])) yield res def _index_with_skipping_already_done(self, start, end): """Index not already indexed contents in range [start, end]. Args: start** (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier Yields: bytes: Content identifier present in the range ``[start, end]`` which are not already indexed. 
""" while start: indexed_page = self.indexed_contents_in_range(start, end) contents = indexed_page['ids'] _end = contents[-1] if contents else end yield from self._index_contents( start, _end, contents) start = indexed_page['next'] def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). Args: start (Union[bytes, str]): Starting range identifier end (Union[bytes, str]): Ending range identifier skip_existing (bool): Skip existing indexed data (default) or not **kwargs: passed to the `index` method Returns: bool: True if data was indexed, False otherwise. """ with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: gen = self._index_with_skipping_already_done(start, end) else: gen = self._index_contents(start, end, indexed=[]) for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True except Exception: self.log.exception( 'Problem when computing metadata.') finally: return with_indexed_data class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. 
""" def run(self, ids, policy_update='update-dups', parse_ids=True, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or (type, url) tuples. policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ if parse_ids: ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: if isinstance(id_, str): # Data coming from JSON, which requires string keys, so # one extra level of deserialization is needed id_ = ast.literal_eval(id_) if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: self.log.warning('Origin %s not found in storage' % list(id_)) continue try: res = self.index(origin, **kwargs) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing origin %s' % (id_,)) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. 
To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 5076de8..c69b1dc 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,209 +1,207 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io from pygments.lexers import guess_lexer from pygments.util import ClassNotFound from chardet.universaldetector import UniversalDetector from .indexer import ContentIndexer def _cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') def _read_raw(raw_content, size=2048): """Read raw content in chunk. 
""" bs = io.BytesIO(raw_content) while True: chunk = bs.read(size) if not chunk: break yield chunk def _detect_encoding(raw_content): """Given a raw content, try and detect its encoding. """ detector = UniversalDetector() for chunk in _read_raw(raw_content): detector.feed(chunk) if detector.done: break detector.close() return detector.result['encoding'] def compute_language_from_chunk(encoding, length, raw_content, max_size, log=None): """Determine the raw content's language. Args: encoding (str): Encoding to use to decode the content length (int): raw_content's length raw_content (bytes): raw content to work with max_size (int): max size to split the raw content at Returns: dict: Dict with keys: - **lang**: None if nothing found or the possible language """ try: if max_size <= length: raw_content = raw_content[0:max_size] content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except UnicodeDecodeError: raise except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } def compute_language(raw_content, encoding=None, log=None): """Determine the raw content's language. 
Args: raw_content (bytes): raw content to work with Returns: dict: Dict with keys: - **lang**: None if nothing found or the possible language """ try: encoding = _detect_encoding(raw_content) content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } class LanguageIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ CONFIG_BASE_FILENAME = 'indexer/language' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, }), } - def prepare(self): - super().prepare() - c = self.config - self.max_content_size = c['tools']['configuration']['max_content_size'] - self.tool = self.tools[0] + @property + def max_content_size(self): + return self.tool['tool_configuration']['max_content_size'] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_language_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'] } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. 
Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: Dict that represents a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'lang': None, } encoding = _detect_encoding(data) if not encoding: return result _len = len(data) for i in range(0, 9): max_size = self.max_content_size + i try: result = compute_language_from_chunk( encoding, _len, data, max_size, log=self.log) except UnicodeDecodeError: self.log.warning( 'Decoding failed on wrong byte chunk at [0-%s]' ', trying again at next ending byte.' % max_size) continue # we found something, so we return it result.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) break return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_language_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 08dcf08..b6f8b3f 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,339 +1,321 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import itertools import logging +from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from 
swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' - def __init__(self, tool, config): - # FIXME: Simplify this twisted way to use the exact same - # config of RevisionMetadataIndexer object that uses - # internally ContentMetadataIndexer - self.config = config - self.config['tools'] = tool - self.results = [] - super().__init__() - self.tool = self.tools[0] # Tool is now registered (cf. prepare call) - def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] result['translated_metadata'] = MAPPINGS[mapping_name] \ .translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id)) if result['translated_metadata'] is None: return None return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/revision_metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } - ContentMetadataIndexer = ContentMetadataIndexer - - def prepare(self): - super().prepare() - self.tool = self.tools[0] - def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. 
""" yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - translated_metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } + config['tools'] = [tool] for context in detected_files.keys(): - tool['configuration']['context'] = context - c_metadata_indexer = self.ContentMetadataIndexer(tool, config) + cfg = deepcopy(config) + cfg['tools'][0]['configuration']['context'] = context + c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: 
c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata class OriginMetadataIndexer(OriginIndexer): CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' ADDITIONAL_CONFIG = { 'tools': ('list', []) } - def check(self, **kwargs): - kwargs['check_tools'] = False - super().check(**kwargs) - - def filter(self, ids): - return ids + USE_TOOLS = False def run(self, origin_head, policy_update): """Expected to be called with the result of RevisionMetadataIndexer as first argument; ie. not a list of ids as other indexers would. Args: origin_head (dict): {str(origin_id): rev_id} keys `origin_id` and `revision_id`, which is the result of OriginHeadIndexer. policy_update (str): `'ignore-dups'` or `'update-dups'` """ origin_head_map = {origin_id: hashutil.hash_to_bytes(rev_id) for (origin_id, rev_id) in origin_head.items()} # Fix up the argument order. revisions_metadata has to be the # first argument because of celery.chain; the next line calls # run() with the usual order, ie. origin ids first. return super().run(ids=list(origin_head_map), policy_update=policy_update, parse_ids=False, origin_head_map=origin_head_map) def index(self, origin, *, origin_head_map): # Get the last revision of the origin. 
revision_id = origin_head_map[str(origin['id'])] revision_metadata = self.idx_storage \ .revision_metadata_get([revision_id]) results = [] for item in revision_metadata: assert item['id'] == revision_id # Get the metadata of that revision, and return it results.append({ 'origin_id': origin['id'], 'metadata': item['translated_metadata'], 'from_revision': revision_id, 'indexer_configuration_id': item['tool']['id'], }) return results def persist_index_computations(self, results, policy_update): self.idx_storage.origin_intrinsic_metadata_add( list(itertools.chain(*results)), conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--revs', '-i', help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index af957c3..bcdac02 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,150 +1,146 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import magic from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: dict: mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type, 'encoding': r.encoding, } class MixinMimetypeIndexer: """Mixin mimetype indexer. 
See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/mimetype' - def prepare(self): - super().prepare() - self.tool = self.tools[0] - def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: content's mimetype; dict keys being - **id** (bytes): content's identifier (sha1) - **mimetype** (bytes): mimetype in bytes - **encoding** (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content's mimetype dicts (see :meth:`.index`) policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): """Mimetype Indexer working on list of content identifiers. It: - (optionally) filters out content already indexed (cf. :meth:`.filter`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. 
""" yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): """Mimetype Range Indexer working on range of content identifiers. It: - (optionally) filters out content already indexed (cf :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. Args: start (bytes): Starting bound from range identifier end (bytes): End range identifier Returns: dict: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self.idx_storage.content_mimetype_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 35ea767..ce63708 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,221 +1,218 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import click import logging from swh.scheduler import get_scheduler from swh.scheduler.utils import create_task_dict from swh.indexer.indexer import OriginIndexer from swh.model.hashutil import hash_to_hex class OriginHeadIndexer(OriginIndexer): """Origin-level indexer. This indexer is in charge of looking up the revision that acts as the "head" of an origin. 
In git, this is usually the commit pointed to by the 'master' branch.""" ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'origin-metadata', 'version': '0.0.1', 'configuration': {}, }), 'tasks': ('dict', { 'revision_metadata': 'revision_metadata', 'origin_intrinsic_metadata': 'origin_metadata', }) } CONFIG_BASE_FILENAME = 'indexer/origin_head' - def filter(self, ids): - yield from ids - def persist_index_computations(self, results, policy_update): """Do nothing. The indexer's results are not persistent, they should only be piped to another indexer.""" pass def next_step(self, results, task): """Once the head is found, call the RevisionMetadataIndexer on these revisions, then call the OriginMetadataIndexer with both the origin_id and the revision metadata, so it can copy the revision metadata to the origin's metadata. Args: results (Iterable[dict]): Iterable of return values from `index`. """ super().next_step(results, task) revision_metadata_task = self.config['tasks']['revision_metadata'] origin_intrinsic_metadata_task = self.config['tasks'][ 'origin_intrinsic_metadata'] if revision_metadata_task is None and \ origin_intrinsic_metadata_task is None: return assert revision_metadata_task is not None assert origin_intrinsic_metadata_task is not None # Second task to run after this one: copy the revision's metadata # to the origin sub_task = create_task_dict( origin_intrinsic_metadata_task, 'oneshot', origin_head={ str(result['origin_id']): hash_to_hex(result['revision_id']) for result in results}, policy_update='update-dups', ) del sub_task['next_run'] # Not json-serializable # First task to run after this one: index the metadata of the # revision task = create_task_dict( revision_metadata_task, 'oneshot', ids=[hash_to_hex(res['revision_id']) for res in results], policy_update='update-dups', next_step=sub_task, ) if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) scheduler.create_tasks([task]) 
# Dispatch def index(self, origin): origin_id = origin['id'] latest_snapshot = self.storage.snapshot_get_latest(origin_id) method = getattr(self, '_try_get_%s_head' % origin['type'], None) if method is None: method = self._try_get_head_generic rev_id = method(latest_snapshot) if rev_id is None: return None result = { 'origin_id': origin_id, 'revision_id': rev_id, } return result # VCSs def _try_get_vcs_head(self, snapshot): try: if isinstance(snapshot, dict): branches = snapshot['branches'] if branches[b'HEAD']['target_type'] == 'revision': return branches[b'HEAD']['target'] except KeyError: return None _try_get_hg_head = _try_get_git_head = _try_get_vcs_head # Tarballs _archive_filename_re = re.compile( rb'^' rb'(?P<pkgname>.*)[-_]' rb'(?P<version>[0-9]+(\.[0-9])*)' rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?' rb'(?P<extension>(\.[a-zA-Z0-9]+)+)' rb'$') @classmethod def _parse_version(cls, filename): """Extracts the release version from an archive filename, to get an ordering whose maximum is likely to be the last version of the software >>> OriginHeadIndexer._parse_version(b'foo') (-inf,) >>> OriginHeadIndexer._parse_version(b'foo.tar.gz') (-inf,) >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz') (0, 0, 1, 0) >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') (0, 0, 1, -1, 'beta2') >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') (0, 0, 1, 1, 'foobar') """ res = cls._archive_filename_re.match(filename) if res is None: return (float('-infinity'),) version = [int(n) for n in res.group('version').decode().split('.')] if res.group('preversion') is None: version.append(0) else: preversion = res.group('preversion').decode() if preversion.startswith('-'): version.append(-1) version.append(preversion[1:]) elif preversion.startswith('+'): version.append(1) version.append(preversion[1:]) else: assert False, res.group('preversion') return tuple(version) def _try_get_ftp_head(self, snapshot): archive_names = list(snapshot['branches']) max_archive_name =
max(archive_names, key=self._parse_version) r = self._try_resolve_target(snapshot['branches'], max_archive_name) return r # Generic def _try_get_head_generic(self, snapshot): # Works on 'deposit', 'svn', and 'pypi'. try: if isinstance(snapshot, dict): branches = snapshot['branches'] except KeyError: return None else: return ( self._try_resolve_target(branches, b'HEAD') or self._try_resolve_target(branches, b'master') ) def _try_resolve_target(self, branches, target_name): try: target = branches[target_name] while target['target_type'] == 'alias': target = branches[target['target']] if target['target_type'] == 'revision': return target['target'] elif target['target_type'] == 'content': return None # TODO elif target['target_type'] == 'directory': return None # TODO elif target['target_type'] == 'release': return None # TODO else: assert False except KeyError: return None @click.command() @click.option('--origins', '-i', help='Origins to lookup, in the "type+url" format', multiple=True) def main(origins): rev_metadata_indexer = OriginHeadIndexer() rev_metadata_indexer.run(origins) if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index 83fed1e..c47be8b 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,194 +1,208 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from unittest.mock import patch +from swh.indexer import fossology_license from swh.indexer.fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer, compute_license ) from swh.indexer.tests.utils import ( SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest, 
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer, BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) class BasicTest(unittest.TestCase): @patch('swh.indexer.fossology_license.subprocess') def test_compute_license(self, mock_subprocess): """Computing licenses from a raw content should return results """ for path, intermediary_result, output in [ (b'some/path', None, []), (b'some/path/2', [], []), (b'other/path', ' contains license(s) GPL,AGPL', ['GPL', 'AGPL'])]: mock_subprocess.check_output.return_value = intermediary_result actual_result = compute_license(path, log=None) self.assertEqual(actual_result, { 'licenses': output, 'path': path, }) -class InjectLicenseIndexer: - """Override license computations. +def mock_compute_license(path, log=None): + """path is the content identifier """ - def compute_license(self, path, log=None): - """path is the content identifier - - """ - if isinstance(id, bytes): - path = path.decode('utf-8') - return { - 'licenses': SHA1_TO_LICENSES.get(path) - } + if isinstance(path, bytes): + path = path.decode('utf-8') + return { + 'licenses': SHA1_TO_LICENSES.get(path) + } class FossologyLicenseTestIndexer( - NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseIndexer): + NoDiskIndexer, FossologyLicenseIndexer): """Specific fossology license whose configuration is enough to satisfy the indexing checks.
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'workdir': '/nowhere', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, } class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): """Language indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ def get_indexer_results(self, ids): yield from self.idx_storage.content_fossology_license_get(ids) def setUp(self): super().setUp() + # replace actual license computation with a mock + self.orig_compute_license = fossology_license.compute_license + fossology_license.compute_license = mock_compute_license + self.indexer = FossologyLicenseTestIndexer() self.idx_storage = self.indexer.idx_storage fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content tool = {k.replace('tool_', ''): v for (k, v) in self.indexer.tool.items()} # then self.expected_results = { self.id0: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id0], }, self.id1: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id1], }, self.id2: { 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id2], } } + def tearDown(self): + super().tearDown() + fossology_license.compute_license = self.orig_compute_license + class FossologyLicenseRangeIndexerTest( - NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): + NoDiskIndexer, FossologyLicenseRangeIndexer): """Testing the range indexer on fossology license. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'workdir': '/nowhere', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, 'write_batch_size': 100, } class TestFossologyLicenseRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Fossology License Indexer tests. - new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): super().setUp() + + # replace actual license computation with a mock + self.orig_compute_license = fossology_license.compute_license + fossology_license.compute_license = mock_compute_license + self.indexer = FossologyLicenseRangeIndexerTest() fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id0] }, self.id1: { 'id': self.id1, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id1] }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id2] } } + def tearDown(self): + super().tearDown() + fossology_license.compute_license = self.orig_compute_license + class FossologyLicenseIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseTestIndexer): """Fossology license indexer with wrong configuration""" class FossologyLicenseRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseRangeIndexerTest): """Fossology license range indexer with wrong configuration""" class TestFossologyLicenseIndexersErrors( CommonIndexerWithErrorsTest, 
unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = FossologyLicenseIndexerUnknownToolTestStorage RangeIndexer = FossologyLicenseRangeIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 84d43d3..9bfa6b6 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,764 +1,765 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. 
""" ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 
'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping - metadata_indexer = ContentMetadataTestIndexer( - tool=TRANSLATOR_TOOL, config=BASE_TEST_CONFIG.copy()) + config = 
BASE_TEST_CONFIG.copy() + config['tools'] = [TRANSLATOR_TOOL] + metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) # "invalid" dictionary package_json 
= b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'issueTracker': 'https://github.com/owner/project/issues', 'type': 'SoftwareSourceCode', }) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', }) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'type': 'SoftwareSourceCode', }) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = MAPPINGS["NpmMapping"].translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://github.com/npm/cli.git', 'type': 'SoftwareSourceCode', } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = 
MAPPINGS["NpmMapping"].translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', 'codeRepository': 'git+https://gitlab.com/user/repo.git', 'type': 'SoftwareSourceCode', }) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": 
"https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" 
} result = MAPPINGS["CodemetaMapping"].translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', 'license': [], }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven 
Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) def test_compute_metadata_pkginfo(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' ' ========\n' ' \n' " core library for swh's modules:\n" ' - config parser\n' ' - hash computations\n' ' - serialization\n' ' - logging mechanism\n' ' '], result) del result['description'] self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', 'name': 'swh.core', 'author': [{ 'type': 'Person', 'name': 'Software Heritage developers', 'email': 
'swh-devel@inria.fr', }], 'version': '0.0.49', }) def test_compute_metadata_pkginfo_license(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo License: MIT """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'license': 'MIT', }) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'author': ['Andrew Nesbitt'], 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list(metadata_indexer.idx_storage.revision_metadata_get( sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'author': ['Andrew Nesbitt'], 'license': 'AGPL-3.0', 'version': '1.0.0', 
'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results)