diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 380c658..69b3349 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,46 +1,40 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v2.4.0
   hooks:
   - id: trailing-whitespace
   - id: flake8
   - id: check-json
   - id: check-yaml
 
 - repo: https://github.com/codespell-project/codespell
   rev: v1.16.0
   hooks:
   - id: codespell
 
 - repo: local
   hooks:
   - id: mypy
     name: mypy
     entry: mypy
     args: [swh]
     pass_filenames: false
     language: system
     types: [python]
 
+- repo: https://github.com/python/black
+  rev: 19.10b0
+  hooks:
+  - id: black
+
 # unfortunately, we are far from being able to enable this...
 # - repo: https://github.com/PyCQA/pydocstyle.git
 #   rev: 4.0.0
 #   hooks:
 #   - id: pydocstyle
 #     name: pydocstyle
 #     description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
 #     entry: pydocstyle --convention=google
 #     language: python
 #     types: [python]
 
-# black requires py3.6+
-#- repo: https://github.com/python/black
-#  rev: 19.3b0
-#  hooks:
-#  - id: black
-#    language_version: python3
-#- repo: https://github.com/asottile/blacken-docs
-#  rev: v1.0.0-1
-#  hooks:
-#  - id: blacken-docs
-#    additional_dependencies: [black==19.3b0]
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..8d79b7e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,6 @@
+[flake8]
+# E203: whitespaces before ':'
+# E231: missing whitespace after ','
+# W503: line break before binary operator
+ignore = E203,E231,W503
+max-line-length = 88
diff --git a/setup.py b/setup.py
index 7d4fd32..69e80c8 100755
--- a/setup.py
+++ b/setup.py
@@ -1,69 +1,69 @@
 #!/usr/bin/env python3
 # Copyright (C) 2015-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from setuptools import setup, find_packages
 
 from os import path
 from io import open
 
 here = path.abspath(path.dirname(__file__))
 
 # Get the long description from the README file
-with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 
 def parse_requirements(name=None):
     if name:
-        reqf = 'requirements-%s.txt' % name
+        reqf = "requirements-%s.txt" % name
     else:
-        reqf = 'requirements.txt'
+        reqf = "requirements.txt"
 
     requirements = []
     if not path.exists(reqf):
         return requirements
 
     with open(reqf) as f:
         for line in f.readlines():
             line = line.strip()
-            if not line or line.startswith('#'):
+            if not line or line.startswith("#"):
                 continue
             requirements.append(line)
     return requirements
 
 
 setup(
-    name='swh.search',
-    description='Software Heritage search service',
+    name="swh.search",
+    description="Software Heritage search service",
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    author='Software Heritage developers',
-    author_email='swh-devel@inria.fr',
-    url='https://forge.softwareheritage.org/diffusion/DSEA',
+    long_description_content_type="text/markdown",
+    author="Software Heritage developers",
+    author_email="swh-devel@inria.fr",
+    url="https://forge.softwareheritage.org/diffusion/DSEA",
     packages=find_packages(),  # packages's modules
-    install_requires=parse_requirements() + parse_requirements('swh'),
-    tests_require=parse_requirements('test'),
-    entry_points='''
+    install_requires=parse_requirements() + parse_requirements("swh"),
+    tests_require=parse_requirements("test"),
+    entry_points="""
         [swh.cli.subcommands]
         search=swh.search.cli:cli
-    ''',
-    setup_requires=['vcversioner'],
-    extras_require={'testing': parse_requirements('test')},
+    """,
+    setup_requires=["vcversioner"],
+    extras_require={"testing": parse_requirements("test")},
     vcversioner={},
     include_package_data=True,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
         "Operating System :: OS Independent",
         "Development Status :: 3 - Alpha",
     ],
     project_urls={
-        'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
-        'Funding': 'https://www.softwareheritage.org/donate',
-        'Source': 'https://forge.softwareheritage.org/source/swh-search',
+        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
+        "Funding": "https://www.softwareheritage.org/donate",
+        "Source": "https://forge.softwareheritage.org/source/swh-search",
     },
 )
diff --git a/swh/search/__init__.py b/swh/search/__init__.py
index 7474665..d66b553 100644
--- a/swh/search/__init__.py
+++ b/swh/search/__init__.py
@@ -1,32 +1,32 @@
 # Copyright (C) 2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 def get_search(cls, args):
     """Get an search object of class `search_class` with arguments
     `search_args`.
 
     Args:
         cls (str): search's class, either 'local' or 'remote'
         args (dict): dictionary of arguments passed to the
             search class constructor
 
     Returns:
         an instance of swh.search's classes (either local or remote)
 
     Raises:
         ValueError if passed an unknown search class.
 
     """
-    if cls == 'remote':
+    if cls == "remote":
         from .api.client import RemoteSearch as Search
-    elif cls == 'elasticsearch':
+    elif cls == "elasticsearch":
         from .elasticsearch import ElasticSearch as Search
-    elif cls == 'memory':
+    elif cls == "memory":
         from .in_memory import InMemorySearch as Search
     else:
-        raise ValueError('Unknown indexer search class `%s`' % cls)
+        raise ValueError("Unknown indexer search class `%s`" % cls)
 
     return Search(**args)
diff --git a/swh/search/api/client.py b/swh/search/api/client.py
index fbe2433..786efad 100644
--- a/swh/search/api/client.py
+++ b/swh/search/api/client.py
@@ -1,13 +1,14 @@
 # Copyright (C) 2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.core.api import RPCClient
 
 from ..elasticsearch import ElasticSearch
 
 
 class RemoteSearch(RPCClient):
     """Proxy to a remote search API"""
+
     backend_class = ElasticSearch
diff --git a/swh/search/api/server.py b/swh/search/api/server.py
index adf0402..bf994dc 100644
--- a/swh/search/api/server.py
+++ b/swh/search/api/server.py
@@ -1,90 +1,86 @@
 # Copyright (C) 2019  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 import os
 
 from swh.core import config
-from swh.core.api import (RPCServerApp, error_handler,
-                          encode_data_server as encode_data)
+from swh.core.api import RPCServerApp, error_handler, encode_data_server as encode_data
 
 from .. import get_search
 from ..elasticsearch import ElasticSearch
 
 
 def _get_search():
     global search
     if not search:
-        search = get_search(**app.config['search'])
+        search = get_search(**app.config["search"])
     return search
 
 
-app = RPCServerApp(__name__,
-                   backend_class=ElasticSearch,
-                   backend_factory=_get_search)
+app = RPCServerApp(__name__, backend_class=ElasticSearch, backend_factory=_get_search)
 
 search = None
 
 
 @app.errorhandler(Exception)
 def my_error_handler(exception):
     return error_handler(exception, encode_data)
 
 
-@app.route('/')
+@app.route("/")
 def index():
-    return 'SWH Search API server'
+    return "SWH Search API server"
 
 
 api_cfg = None
 
 
-def load_and_check_config(config_file, type='elasticsearch'):
+def load_and_check_config(config_file, type="elasticsearch"):
     """Check the minimal configuration is set to run the api or raise an
        error explanation.
 
     Args:
         config_file (str): Path to the configuration file to load
         type (str): configuration type. For 'local' type, more checks are
                     done.
 
     Raises:
         Error if the setup is not as expected
 
     Returns:
         configuration as a dict
 
     """
     if not config_file:
-        raise EnvironmentError('Configuration file must be defined')
+        raise EnvironmentError("Configuration file must be defined")
 
     if not os.path.exists(config_file):
-        raise FileNotFoundError('Configuration file %s does not exist' % (
-            config_file, ))
+        raise FileNotFoundError("Configuration file %s does not exist" % (config_file,))
 
     cfg = config.read(config_file)
-    if 'search' not in cfg:
+    if "search" not in cfg:
         raise KeyError("Missing 'search' configuration")
 
     return cfg
 
 
 def make_app_from_configfile():
     """Run the WSGI app from the webserver, loading the configuration from
        a configuration file.
 
        SWH_CONFIG_FILENAME environment variable defines the
        configuration path to load.
 
     """
     global api_cfg
     if not api_cfg:
-        config_file = os.environ.get('SWH_CONFIG_FILENAME')
+        config_file = os.environ.get("SWH_CONFIG_FILENAME")
         api_cfg = load_and_check_config(config_file)
         app.config.update(api_cfg)
     handler = logging.StreamHandler()
     app.logger.addHandler(handler)
     return app
diff --git a/swh/search/cli.py b/swh/search/cli.py
index b43d113..0926d90 100644
--- a/swh/search/cli.py
+++ b/swh/search/cli.py
@@ -1,89 +1,98 @@
 # Copyright (C) 2019-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import functools
 
 import click
 
 from swh.core import config
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.journal.cli import get_journal_client
 
 from . import get_search
 from .journal_client import process_journal_objects
 from .api.server import load_and_check_config, app
 
 
-@click.group(name='search', context_settings=CONTEXT_SETTINGS)
-@click.option('--config-file', '-C', default=None,
-              type=click.Path(exists=True, dir_okay=False,),
-              help="Configuration file.")
+@click.group(name="search", context_settings=CONTEXT_SETTINGS)
+@click.option(
+    "--config-file",
+    "-C",
+    default=None,
+    type=click.Path(exists=True, dir_okay=False,),
+    help="Configuration file.",
+)
 @click.pass_context
 def cli(ctx, config_file):
-    '''Software Heritage Search tools.'''
+    """Software Heritage Search tools."""
     ctx.ensure_object(dict)
 
     conf = config.read(config_file)
-    ctx.obj['config'] = conf
+    ctx.obj["config"] = conf
 
 
-@cli.command('initialize')
+@cli.command("initialize")
 @click.pass_context
 def initialize(ctx):
     """Creates Elasticsearch indices."""
-    search = get_search(**ctx.obj['config']['search'])
+    search = get_search(**ctx.obj["config"]["search"])
     search.initialize()
-    print('Done.')
+    print("Done.")
 
 
-@cli.group('journal-client')
+@cli.group("journal-client")
 @click.pass_context
 def journal_client(ctx):
     """"""
     pass
 
 
-@journal_client.command('objects')
-@click.option('--stop-after-objects', '-s', default=None, type=int,
-              help='Maximum number of objects to replay. Default is to '
-                   'run forever.')
+@journal_client.command("objects")
+@click.option(
+    "--stop-after-objects",
+    "-s",
+    default=None,
+    type=int,
+    help="Maximum number of objects to replay. Default is to " "run forever.",
+)
 @click.pass_context
 def journal_client_objects(ctx, stop_after_objects):
     """Listens for new objects from the SWH Journal, and schedules tasks
     to run relevant indexers (currently, only origin)
     on these new objects."""
     client = get_journal_client(
-        ctx, object_types=['origin', 'origin_visit'],
-        stop_after_objects=stop_after_objects)
-    search = get_search(**ctx.obj['config']['search'])
-
-    worker_fn = functools.partial(
-        process_journal_objects,
-        search=search,
+        ctx,
+        object_types=["origin", "origin_visit"],
+        stop_after_objects=stop_after_objects,
     )
+    search = get_search(**ctx.obj["config"]["search"])
+
+    worker_fn = functools.partial(process_journal_objects, search=search,)
     nb_messages = 0
     try:
         nb_messages = client.process(worker_fn)
-        print('Processed %d messages.' % nb_messages)
+        print("Processed %d messages."
% nb_messages) except KeyboardInterrupt: ctx.exit(0) else: - print('Done.') + print("Done.") finally: client.close() -@cli.command('rpc-serve') -@click.argument('config-path', required=True) -@click.option('--host', default='0.0.0.0', help="Host to run the server") -@click.option('--port', default=5010, type=click.INT, - help="Binding port of the server") -@click.option('--debug/--nodebug', default=True, - help="Indicates if the server should run in debug mode") +@cli.command("rpc-serve") +@click.argument("config-path", required=True) +@click.option("--host", default="0.0.0.0", help="Host to run the server") +@click.option("--port", default=5010, type=click.INT, help="Binding port of the server") +@click.option( + "--debug/--nodebug", + default=True, + help="Indicates if the server should run in debug mode", +) def rpc_server(config_path, host, port, debug): """Starts a Software Heritage Indexer RPC HTTP server.""" - api_cfg = load_and_check_config(config_path, type='any') + api_cfg = load_and_check_config(config_path, type="any") app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index 5365db3..9a74266 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,231 +1,213 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from typing import Any, Iterable, Dict, List, Iterator, Optional from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk, scan import msgpack from swh.core.api import remote_api_endpoint from swh.model import model from swh.model.identifiers import origin_identifier def _sanitize_origin(origin): origin = origin.copy() - res = { - 'url': origin.pop('url') - } - for field_name in ('intrinsic_metadata', 'has_visits'): + res = {"url": origin.pop("url")} + for field_name in ("intrinsic_metadata", "has_visits"): if field_name in origin: res[field_name] = origin.pop(field_name) return res class ElasticSearch: def __init__(self, hosts: List[str]): self._backend = Elasticsearch(hosts=hosts) - @remote_api_endpoint('check') + @remote_api_endpoint("check") def check(self): return self._backend.ping() def deinitialize(self) -> None: """Removes all indices from the Elasticsearch backend""" - self._backend.indices.delete(index='*') + self._backend.indices.delete(index="*") def initialize(self) -> None: """Declare Elasticsearch indices and mappings""" - if not self._backend.indices.exists(index='origin'): - self._backend.indices.create(index='origin') + if not self._backend.indices.exists(index="origin"): + self._backend.indices.create(index="origin") self._backend.indices.put_mapping( - index='origin', + index="origin", body={ - 'properties': { - 'sha1': { - 'type': 'keyword', - 'doc_values': True, - }, - 'url': { - 'type': 'text', + "properties": { + "sha1": {"type": "keyword", "doc_values": True,}, + "url": { + "type": "text", # To split URLs into token on any character # that is not alphanumerical - 'analyzer': 'simple', - 'fields': { - 'as_you_type': { - 'type': 'search_as_you_type', - 'analyzer': 'simple', + "analyzer": "simple", + "fields": { + "as_you_type": { + "type": "search_as_you_type", + "analyzer": "simple", } - } - }, - 'has_visits': { - 'type': 'boolean', + }, }, - 'intrinsic_metadata': { - 'type': 'nested', - 'properties': { 
- '@context': { + "has_visits": {"type": "boolean",}, + "intrinsic_metadata": { + "type": "nested", + "properties": { + "@context": { # don't bother indexing tokens - 'type': 'keyword', + "type": "keyword", } }, }, } - } + }, ) - @remote_api_endpoint('flush') + @remote_api_endpoint("flush") def flush(self) -> None: """Blocks until all previous calls to _update() are completely applied.""" - self._backend.indices.refresh(index='_all') + self._backend.indices.refresh(index="_all") - @remote_api_endpoint('origin/update') + @remote_api_endpoint("origin/update") def origin_update(self, documents: Iterable[dict]) -> None: documents = map(_sanitize_origin, documents) - documents_with_sha1 = ((origin_identifier(document), document) - for document in documents) + documents_with_sha1 = ( + (origin_identifier(document), document) for document in documents + ) actions = [ { - '_op_type': 'update', - '_id': sha1, - '_index': 'origin', - 'doc': { - **document, - 'sha1': sha1, - }, - 'doc_as_upsert': True, + "_op_type": "update", + "_id": sha1, + "_index": "origin", + "doc": {**document, "sha1": sha1,}, + "doc_as_upsert": True, } for (sha1, document) in documents_with_sha1 ] - bulk(self._backend, actions, index='origin') + bulk(self._backend, actions, index="origin") def origin_dump(self) -> Iterator[model.Origin]: """Returns all content in Elasticsearch's index. Not exposed publicly; but useful for tests.""" - results = scan(self._backend, index='*') + results = scan(self._backend, index="*") for hit in results: - yield self._backend.termvectors( - index='origin', id=hit['_id'], - fields=['*']) + yield self._backend.termvectors(index="origin", id=hit["_id"], fields=["*"]) - @remote_api_endpoint('origin/search') + @remote_api_endpoint("origin/search") def origin_search( - self, *, - url_pattern: str = None, metadata_pattern: str = None, - with_visit: bool = False, - page_token: str = None, count: int = 50 - ) -> Dict[str, object]: + self, + *, + url_pattern: str = None, + metadata_pattern: str = None, + with_visit: bool = False, + page_token: str = None, + count: int = 50 + ) -> Dict[str, object]: """Searches for origins matching the `url_pattern`. Args: url_pattern (str): Part of thr URL to search for with_visit (bool): Whether origins with no visit are to be filtered out page_token (str): Opaque value used for pagination. count (int): number of results to return. Returns: a dictionary with keys: * `next_page_token`: opaque value used for fetching more results. `None` if there are no more result. 
* `results`: list of dictionaries with key: * `url`: URL of a matching origin """ query_clauses = [] # type: List[Dict[str, Any]] if url_pattern: - query_clauses.append({ - 'multi_match': { - 'query': url_pattern, - 'type': 'bool_prefix', - 'operator': 'and', - 'fields': [ - 'url.as_you_type', - 'url.as_you_type._2gram', - 'url.as_you_type._3gram', - ] + query_clauses.append( + { + "multi_match": { + "query": url_pattern, + "type": "bool_prefix", + "operator": "and", + "fields": [ + "url.as_you_type", + "url.as_you_type._2gram", + "url.as_you_type._3gram", + ], + } } - }) + ) if metadata_pattern: - query_clauses.append({ - 'nested': { - 'path': 'intrinsic_metadata', - 'query': { - 'multi_match': { - 'query': metadata_pattern, - 'operator': 'and', - 'fields': ['intrinsic_metadata.*'] - } - }, + query_clauses.append( + { + "nested": { + "path": "intrinsic_metadata", + "query": { + "multi_match": { + "query": metadata_pattern, + "operator": "and", + "fields": ["intrinsic_metadata.*"], + } + }, + } } - }) + ) if not query_clauses: raise ValueError( - 'At least one of url_pattern and metadata_pattern ' - 'must be provided.') + "At least one of url_pattern and metadata_pattern " "must be provided." + ) if with_visit: - query_clauses.append({ - 'term': { - 'has_visits': True, - } - }) + query_clauses.append({"term": {"has_visits": True,}}) body = { - 'query': { - 'bool': { - 'must': query_clauses, - } - }, - 'size': count, - 'sort': [ - {'_score': 'desc'}, - {'sha1': 'asc'}, - ] + "query": {"bool": {"must": query_clauses,}}, + "size": count, + "sort": [{"_score": "desc"}, {"sha1": "asc"},], } if page_token: # TODO: use ElasticSearch's scroll API? - page_token_content = msgpack.loads( - base64.b64decode(page_token), raw=True) - body['search_after'] = \ - [page_token_content[b'score'], - page_token_content[b'sha1'].decode('ascii')] - - res = self._backend.search( - index='origin', - body=body, - size=count, - ) + page_token_content = msgpack.loads(base64.b64decode(page_token), raw=True) + body["search_after"] = [ + page_token_content[b"score"], + page_token_content[b"sha1"].decode("ascii"), + ] - hits = res['hits']['hits'] + res = self._backend.search(index="origin", body=body, size=count,) + + hits = res["hits"]["hits"] if len(hits) == count: last_hit = hits[-1] next_page_token_content = { - b'score': last_hit['_score'], - b'sha1': last_hit['_source']['sha1'], + b"score": last_hit["_score"], + b"sha1": last_hit["_source"]["sha1"], } - next_page_token = base64.b64encode(msgpack.dumps( - next_page_token_content)) # type: Optional[bytes] + next_page_token = base64.b64encode( + msgpack.dumps(next_page_token_content) + ) # type: Optional[bytes] else: next_page_token = None return { - 'next_page_token': next_page_token, - 'results': [ + "next_page_token": next_page_token, + "results": [ { # TODO: also add 'id'? 
- 'url': hit['_source']['url'], + "url": hit["_source"]["url"], } for hit in hits - ] + ], } diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py index 01ac4ff..f5fc665 100644 --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -1,128 +1,127 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from collections import defaultdict import itertools import re from typing import Any, Dict, Iterable, Iterator, List, Optional import msgpack from swh.core.api import remote_api_endpoint from swh.model.identifiers import origin_identifier def _sanitize_origin(origin): origin = origin.copy() - res = { - 'url': origin.pop('url') - } - for field_name in ('type', 'intrinsic_metadata'): + res = {"url": origin.pop("url")} + for field_name in ("type", "intrinsic_metadata"): if field_name in origin: res[field_name] = origin.pop(field_name) return res class InMemorySearch: def __init__(self): pass - @remote_api_endpoint('check') + @remote_api_endpoint("check") def check(self): return True def deinitialize(self) -> None: - if hasattr(self, '_origins'): + if hasattr(self, "_origins"): del self._origins del self._origin_ids def initialize(self) -> None: self._origins = defaultdict(dict) # type: Dict[str, Dict[str, Any]] self._origin_ids = [] # type: List[str] def flush(self) -> None: pass - _url_splitter = re.compile(r'\W') + _url_splitter = re.compile(r"\W") - @remote_api_endpoint('origin/update') + @remote_api_endpoint("origin/update") def origin_update(self, documents: Iterable[dict]) -> None: for document in documents: document = document.copy() id_ = origin_identifier(document) - if 'url' in document: - document['_url_tokens'] = \ - set(self._url_splitter.split(document['url'])) + if "url" in document: + document["_url_tokens"] = set(self._url_splitter.split(document["url"])) self._origins[id_].update(document) if id_ not in self._origin_ids: self._origin_ids.append(id_) - @remote_api_endpoint('origin/search') + @remote_api_endpoint("origin/search") def origin_search( - self, *, - url_pattern: str = None, metadata_pattern: str = None, - with_visit: bool = False, - page_token: str = None, count: int = 50 - ) -> Dict[str, object]: - matches = \ - (self._origins[id_] - for id_ in self._origin_ids) # type: Iterator[Dict[str, Any]] + self, + *, + url_pattern: str = None, + metadata_pattern: str = None, + with_visit: bool = False, + page_token: str = None, + count: int = 50 + ) -> Dict[str, object]: + matches = ( + self._origins[id_] for id_ in self._origin_ids + ) # type: Iterator[Dict[str, Any]] if url_pattern: tokens = set(self._url_splitter.split(url_pattern)) def predicate(match): - missing_tokens = tokens - match['_url_tokens'] + missing_tokens = tokens - match["_url_tokens"] if len(missing_tokens) == 0: return True elif len(missing_tokens) > 1: return False else: # There is one missing token, look up by prefix. (missing_token,) = missing_tokens - return any(token.startswith(missing_token) - for token in match['_url_tokens']) + return any( + token.startswith(missing_token) + for token in match["_url_tokens"] + ) matches = filter(predicate, matches) if metadata_pattern: raise NotImplementedError( - 'Metadata search is not implemented in the in-memory backend.') + "Metadata search is not implemented in the in-memory backend." 
+ ) if not url_pattern and not metadata_pattern: raise ValueError( - 'At least one of url_pattern and metadata_pattern ' - 'must be provided.') + "At least one of url_pattern and metadata_pattern " "must be provided." + ) if with_visit: - matches = filter(lambda o: o.get('has_visits'), matches) + matches = filter(lambda o: o.get("has_visits"), matches) if page_token: - page_token_content = msgpack.loads( - base64.b64decode(page_token)) - start_at_index = page_token_content[b'start_at_index'] + page_token_content = msgpack.loads(base64.b64decode(page_token)) + start_at_index = page_token_content[b"start_at_index"] else: start_at_index = 0 - hits = list(itertools.islice( - matches, start_at_index, start_at_index+count)) + hits = list(itertools.islice(matches, start_at_index, start_at_index + count)) if len(hits) == count: next_page_token_content = { - b'start_at_index': start_at_index+count, + b"start_at_index": start_at_index + count, } - next_page_token = base64.b64encode(msgpack.dumps( - next_page_token_content)) # type: Optional[bytes] + next_page_token = base64.b64encode( + msgpack.dumps(next_page_token_content) + ) # type: Optional[bytes] else: next_page_token = None return { - 'next_page_token': next_page_token, - 'results': [ - {'url': hit['url']} - for hit in hits - ] + "next_page_token": next_page_token, + "results": [{"url": hit["url"]} for hit in hits], } diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py index 4e38c5e..660a0f9 100644 --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -1,59 +1,63 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging EXPECTED_MESSAGE_TYPES = { - 'origin', 'origin_visit', 'origin_intrinsic_metadata', + "origin", + "origin_visit", + "origin_intrinsic_metadata", } def process_journal_objects(messages, *, search): """Worker function for `JournalClient.process(worker_fn)`, after currification of `scheduler` and `task_names`.""" assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages) - if 'origin' in messages: - process_origins(messages['origin'], search) + if "origin" in messages: + process_origins(messages["origin"], search) - if 'origin_visit' in messages: - process_origin_visits(messages['origin_visit'], search) + if "origin_visit" in messages: + process_origin_visits(messages["origin_visit"], search) - if 'origin_intrinsic_metadata' in messages: - process_origin_intrinsic_metadata( - messages['origin_intrinsic_metadata'], search) + if "origin_intrinsic_metadata" in messages: + process_origin_intrinsic_metadata(messages["origin_intrinsic_metadata"], search) def process_origins(origins, search): - logging.debug('processing origins %r', origins) + logging.debug("processing origins %r", origins) search.origin_update(origins) def process_origin_visits(visits, search): - logging.debug('processing origin visits %r', visits) - - search.origin_update([ - { - 'url': (visit['origin'] if isinstance(visit['origin'], str) - else visit['origin']['url']), - 'has_visits': True - } - for visit in visits - ]) + logging.debug("processing origin visits %r", visits) + + search.origin_update( + [ + { + "url": ( + visit["origin"] + if isinstance(visit["origin"], str) + else visit["origin"]["url"] + ), + "has_visits": True, + } + for visit in visits + ] + ) def process_origin_intrinsic_metadata(origin_metadata, 
search): - logging.debug('processing origin intrinsic_metadata %r', origin_metadata) + logging.debug("processing origin intrinsic_metadata %r", origin_metadata) origin_metadata = [ - { - 'url': item['origin_url'], - 'intrinsic_metadata': item['metadata'], - } - for item in origin_metadata] + {"url": item["origin_url"], "intrinsic_metadata": item["metadata"],} + for item in origin_metadata + ] search.origin_update(origin_metadata) diff --git a/swh/search/tests/conftest.py b/swh/search/tests/conftest.py index ba1c0f8..9077aa0 100644 --- a/swh/search/tests/conftest.py +++ b/swh/search/tests/conftest.py @@ -1,108 +1,111 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import socket import subprocess import time import elasticsearch import pytest def free_port(): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(('127.0.0.1', 0)) + sock.bind(("127.0.0.1", 0)) port = sock.getsockname()[1] sock.close() return port def wait_for_peer(addr, port): while True: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect((addr, port)) except ConnectionRefusedError: time.sleep(0.1) else: sock.close() break -CONFIG_TEMPLATE = ''' +CONFIG_TEMPLATE = """ node.name: node-1 path.data: {data} path.logs: {logs} network.host: 127.0.0.1 http.port: {http_port} transport.port: {transport_port} -''' +""" -def _run_elasticsearch( - conf_dir, data_dir, logs_dir, http_port, transport_port): - es_home = '/usr/share/elasticsearch' +def _run_elasticsearch(conf_dir, data_dir, logs_dir, http_port, transport_port): + es_home = "/usr/share/elasticsearch" - with open(conf_dir + '/elasticsearch.yml', 'w') as fd: - fd.write(CONFIG_TEMPLATE.format( - data=data_dir, - logs=logs_dir, - http_port=http_port, - transport_port=transport_port)) + with open(conf_dir + "/elasticsearch.yml", "w") as fd: + fd.write( + CONFIG_TEMPLATE.format( + data=data_dir, + logs=logs_dir, + http_port=http_port, + transport_port=transport_port, + ) + ) - with open(conf_dir + '/log4j2.properties', 'w') as fd: + with open(conf_dir + "/log4j2.properties", "w") as fd: pass cmd = [ - '/usr/share/elasticsearch/jdk/bin/java', - '-Des.path.home={}'.format(es_home), - '-Des.path.conf={}'.format(conf_dir), - '-Des.bundled_jdk=true', - '-Dlog4j2.disable.jmx=true', - '-cp', '{}/lib/*'.format(es_home), - 'org.elasticsearch.bootstrap.Elasticsearch', + "/usr/share/elasticsearch/jdk/bin/java", + "-Des.path.home={}".format(es_home), + "-Des.path.conf={}".format(conf_dir), + "-Des.bundled_jdk=true", + "-Dlog4j2.disable.jmx=true", + "-cp", + "{}/lib/*".format(es_home), + "org.elasticsearch.bootstrap.Elasticsearch", ] - host = '127.0.0.1:{}'.format(http_port) + host = "127.0.0.1:{}".format(http_port) - with open(logs_dir + '/output.txt', 'w') as fd: + with open(logs_dir + "/output.txt", "w") as fd: p = subprocess.Popen(cmd) - wait_for_peer('127.0.0.1', http_port) + wait_for_peer("127.0.0.1", http_port) client = elasticsearch.Elasticsearch([host]) assert client.ping() return p -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def elasticsearch_session(tmpdir_factory): - tmpdir = tmpdir_factory.mktemp('elasticsearch') - es_conf = tmpdir.mkdir('conf') + tmpdir = tmpdir_factory.mktemp("elasticsearch") + es_conf = tmpdir.mkdir("conf") http_port = free_port() transport_port = free_port() p = _run_elasticsearch( conf_dir=str(es_conf), 
- data_dir=str(tmpdir.mkdir('data')), - logs_dir=str(tmpdir.mkdir('logs')), + data_dir=str(tmpdir.mkdir("data")), + logs_dir=str(tmpdir.mkdir("logs")), http_port=http_port, transport_port=transport_port, ) - yield '127.0.0.1:{}'.format(http_port) + yield "127.0.0.1:{}".format(http_port) # Check ES didn't stop assert p.returncode is None, p.returncode p.kill() p.wait() -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def elasticsearch_host(elasticsearch_session): yield elasticsearch_session diff --git a/swh/search/tests/test_api_client.py b/swh/search/tests/test_api_client.py index c38cb8d..a1fe8e3 100644 --- a/swh/search/tests/test_api_client.py +++ b/swh/search/tests/test_api_client.py @@ -1,48 +1,43 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.core.api.tests.server_testing import ServerTestFixture from swh.search import get_search from swh.search.api.server import app from .test_search import CommonSearchTest class TestRemoteSearch(CommonSearchTest, ServerTestFixture, unittest.TestCase): @pytest.fixture(autouse=True) def _instantiate_search(self, elasticsearch_host): self._elasticsearch_host = elasticsearch_host def setUp(self): self.config = { - 'search': { - 'cls': 'elasticsearch', - 'args': { - 'hosts': [self._elasticsearch_host], - } + "search": { + "cls": "elasticsearch", + "args": {"hosts": [self._elasticsearch_host],}, } } self.app = app super().setUp() self.reset() - self.search = get_search('remote', { - 'url': self.url(), - }) + self.search = get_search("remote", {"url": self.url(),}) def reset(self): - search = get_search('elasticsearch', { - 'hosts': [self._elasticsearch_host], - }) + search = get_search("elasticsearch", {"hosts": [self._elasticsearch_host],}) search.deinitialize() search.initialize() - @pytest.mark.skip('Elasticsearch also returns close matches, ' - 'so this test would fail') + @pytest.mark.skip( + "Elasticsearch also returns close matches, " "so this test would fail" + ) def test_origin_url_paging(self, count): pass diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py index 677b8af..dc1d77a 100644 --- a/swh/search/tests/test_cli.py +++ b/swh/search/tests/test_cli.py @@ -1,133 +1,125 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile from unittest.mock import patch, MagicMock from click.testing import CliRunner from swh.journal.serializers import value_to_kafka from swh.journal.tests.utils import MockedKafkaConsumer from swh.search.cli import cli from .test_elasticsearch import BaseElasticsearchTest -CLI_CONFIG = ''' +CLI_CONFIG = """ search: cls: elasticsearch args: hosts: - '{elasticsearch_host}' -''' +""" -JOURNAL_OBJECTS_CONFIG = ''' +JOURNAL_OBJECTS_CONFIG = """ journal: brokers: - 192.0.2.1 prefix: swh.journal.objects group_id: test-consumer -''' +""" class MockedKafkaConsumerWithTopics(MockedKafkaConsumer): def list_topics(self, timeout=None): return { - 'swh.journal.objects.origin', - 'swh.journal.objects.origin_visit', + "swh.journal.objects.origin", + "swh.journal.objects.origin_visit", } -def invoke(catch_exceptions, args, config='', *, elasticsearch_host): +def 
invoke(catch_exceptions, args, config="", *, elasticsearch_host): runner = CliRunner() - with tempfile.NamedTemporaryFile('a', suffix='.yml') as config_fd: - config_fd.write((CLI_CONFIG + config).format( - elasticsearch_host=elasticsearch_host - )) + with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd: + config_fd.write( + (CLI_CONFIG + config).format(elasticsearch_host=elasticsearch_host) + ) config_fd.seek(0) - result = runner.invoke(cli, ['-C' + config_fd.name] + args) + result = runner.invoke(cli, ["-C" + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result class CliTestCase(BaseElasticsearchTest): def test__journal_client__origin(self): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" - topic = 'swh.journal.objects.origin' - value = value_to_kafka({ - 'url': 'http://foobar.baz', - }) + topic = "swh.journal.objects.origin" + value = value_to_kafka({"url": "http://foobar.baz",}) message = MagicMock() message.error.return_value = None message.topic.return_value = topic message.value.return_value = value mock_consumer = MockedKafkaConsumerWithTopics([message]) - with patch('swh.journal.client.Consumer', - return_value=mock_consumer): - result = invoke(False, [ - 'journal-client', 'objects', - '--stop-after-objects', '1', - ], JOURNAL_OBJECTS_CONFIG, - elasticsearch_host=self._elasticsearch_host) + with patch("swh.journal.client.Consumer", return_value=mock_consumer): + result = invoke( + False, + ["journal-client", "objects", "--stop-after-objects", "1",], + JOURNAL_OBJECTS_CONFIG, + elasticsearch_host=self._elasticsearch_host, + ) # Check the output - expected_output = ( - 'Processed 1 messages.\n' - 'Done.\n' - ) + expected_output = "Processed 1 messages.\n" "Done.\n" assert result.exit_code == 0, result.output assert result.output == expected_output self.search.flush() - results = self.search.origin_search(url_pattern='foobar') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foobar.baz'}]} + results = self.search.origin_search(url_pattern="foobar") + assert results == { + "next_page_token": None, + "results": [{"url": "http://foobar.baz"}], + } - results = self.search.origin_search(url_pattern='foobar', - with_visit=True) - assert results == {'next_page_token': None, 'results': []} + results = self.search.origin_search(url_pattern="foobar", with_visit=True) + assert results == {"next_page_token": None, "results": []} def test__journal_client__origin_visit(self): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" - topic = 'swh.journal.objects.origin_visit' - value = value_to_kafka({ - 'origin': 'http://foobar.baz', - }) + topic = "swh.journal.objects.origin_visit" + value = value_to_kafka({"origin": "http://foobar.baz",}) message = MagicMock() message.error.return_value = None message.topic.return_value = topic message.value.return_value = value mock_consumer = MockedKafkaConsumerWithTopics([message]) - with patch('swh.journal.client.Consumer', - return_value=mock_consumer): - result = invoke(False, [ - 'journal-client', 'objects', - '--stop-after-objects', '1', - ], JOURNAL_OBJECTS_CONFIG, - elasticsearch_host=self._elasticsearch_host) + with patch("swh.journal.client.Consumer", return_value=mock_consumer): + result = invoke( + False, + ["journal-client", "objects", "--stop-after-objects", "1",], + JOURNAL_OBJECTS_CONFIG, + elasticsearch_host=self._elasticsearch_host, + ) # Check 
the output - expected_output = ( - 'Processed 1 messages.\n' - 'Done.\n' - ) + expected_output = "Processed 1 messages.\n" "Done.\n" assert result.exit_code == 0, result.output assert result.output == expected_output self.search.flush() - results = self.search.origin_search(url_pattern='foobar', - with_visit=True) - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foobar.baz'}]} + results = self.search.origin_search(url_pattern="foobar", with_visit=True) + assert results == { + "next_page_token": None, + "results": [{"url": "http://foobar.baz"}], + } diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py index 73b4486..c5c185b 100644 --- a/swh/search/tests/test_elasticsearch.py +++ b/swh/search/tests/test_elasticsearch.py @@ -1,31 +1,29 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.search import get_search from .test_search import CommonSearchTest class BaseElasticsearchTest(unittest.TestCase): @pytest.fixture(autouse=True) def _instantiate_search(self, elasticsearch_host): self._elasticsearch_host = elasticsearch_host - self.search = get_search('elasticsearch', { - 'hosts': [elasticsearch_host], - }) + self.search = get_search("elasticsearch", {"hosts": [elasticsearch_host],}) def setUp(self): self.reset() def reset(self): self.search.deinitialize() self.search.initialize() class TestElasticsearchSearch(CommonSearchTest, BaseElasticsearchTest): pass diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py index 57312bb..f48eb02 100644 --- a/swh/search/tests/test_in_memory.py +++ b/swh/search/tests/test_in_memory.py @@ -1,40 +1,40 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.search import get_search from .test_search import CommonSearchTest class InmemorySearchTest(unittest.TestCase, CommonSearchTest): @pytest.fixture(autouse=True) def _instantiate_search(self): - self.search = get_search('memory', {}) + self.search = get_search("memory", {}) def setUp(self): self.reset() def reset(self): self.search.deinitialize() self.search.initialize() - @pytest.mark.skip('Not implemented in the in-memory search') + @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_description(self): pass - @pytest.mark.skip('Not implemented in the in-memory search') + @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_all_terms(self): pass - @pytest.mark.skip('Not implemented in the in-memory search') + @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_nested(self): pass - @pytest.mark.skip('Not implemented in the in-memory search') + @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_paging(self): pass diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py index dcb4566..b8d92f9 100644 --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -1,82 +1,71 @@ # Copyright (C) 2019 The Software Heritage 
developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import unittest from unittest.mock import MagicMock from swh.search.journal_client import process_journal_objects class SearchJournalClientTest(unittest.TestCase): def test_origin_from_journal(self): search_mock = MagicMock() - worker_fn = functools.partial( - process_journal_objects, - search=search_mock, - ) + worker_fn = functools.partial(process_journal_objects, search=search_mock,) - worker_fn({'origin': [ - {'url': 'http://foobar.baz'}, - ]}) - search_mock.origin_update.assert_called_once_with([ - {'url': 'http://foobar.baz'}, - ]) + worker_fn({"origin": [{"url": "http://foobar.baz"},]}) + search_mock.origin_update.assert_called_once_with( + [{"url": "http://foobar.baz"},] + ) search_mock.reset_mock() - worker_fn({'origin': [ - {'url': 'http://foobar.baz'}, - {'url': 'http://barbaz.qux'}, - ]}) - search_mock.origin_update.assert_called_once_with([ - {'url': 'http://foobar.baz'}, - {'url': 'http://barbaz.qux'}, - ]) + worker_fn( + {"origin": [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]} + ) + search_mock.origin_update.assert_called_once_with( + [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},] + ) def test_origin_visit_from_journal(self): search_mock = MagicMock() - worker_fn = functools.partial( - process_journal_objects, - search=search_mock, - ) + worker_fn = functools.partial(process_journal_objects, search=search_mock,) - worker_fn({'origin_visit': [ - { - 'origin': {'url': 'http://foobar.baz'}, - } - ]}) - search_mock.origin_update.assert_called_once_with([ - {'url': 'http://foobar.baz', 'has_visits': True}, - ]) + worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]}) + search_mock.origin_update.assert_called_once_with( + [{"url": "http://foobar.baz", "has_visits": True},] + ) def test_origin_metadata_from_journal(self): search_mock = MagicMock() - worker_fn = functools.partial( - process_journal_objects, - search=search_mock, - ) + worker_fn = functools.partial(process_journal_objects, search=search_mock,) - worker_fn({'origin_intrinsic_metadata': [ - { - 'origin_url': 'http://foobar.baz', - 'metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'foo bar', - }, - }, - ]}) - search_mock.origin_update.assert_called_once_with([ + worker_fn( { - 'url': 'http://foobar.baz', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'foo bar', + "origin_intrinsic_metadata": [ + { + "origin_url": "http://foobar.baz", + "metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + }, + }, + ] + } + ) + search_mock.origin_update.assert_called_once_with( + [ + { + "url": "http://foobar.baz", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + }, }, - }, - ]) + ] + ) diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py index 0105777..b0f0914 100644 --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1,294 +1,320 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given, 
strategies, settings from swh.search.utils import stream_results class CommonSearchTest: def test_origin_url_unique_word_prefix(self): - self.search.origin_update([ - {'url': 'http://foobar.baz'}, - {'url': 'http://barbaz.qux'}, - {'url': 'http://qux.quux'}, - ]) + self.search.origin_update( + [ + {"url": "http://foobar.baz"}, + {"url": "http://barbaz.qux"}, + {"url": "http://qux.quux"}, + ] + ) self.search.flush() - results = self.search.origin_search(url_pattern='foobar') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foobar.baz'}]} + results = self.search.origin_search(url_pattern="foobar") + assert results == { + "next_page_token": None, + "results": [{"url": "http://foobar.baz"}], + } - results = self.search.origin_search(url_pattern='barb') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://barbaz.qux'}]} + results = self.search.origin_search(url_pattern="barb") + assert results == { + "next_page_token": None, + "results": [{"url": "http://barbaz.qux"}], + } # 'bar' is part of 'foobar', but is not the beginning of it - results = self.search.origin_search(url_pattern='bar') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://barbaz.qux'}]} - - results = self.search.origin_search(url_pattern='barbaz') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://barbaz.qux'}]} + results = self.search.origin_search(url_pattern="bar") + assert results == { + "next_page_token": None, + "results": [{"url": "http://barbaz.qux"}], + } + + results = self.search.origin_search(url_pattern="barbaz") + assert results == { + "next_page_token": None, + "results": [{"url": "http://barbaz.qux"}], + } def test_origin_url_unique_word_prefix_multiple_results(self): - self.search.origin_update([ - {'url': 'http://foobar.baz'}, - {'url': 'http://barbaz.qux'}, - {'url': 'http://qux.quux'}, - ]) + self.search.origin_update( + [ + {"url": "http://foobar.baz"}, + {"url": "http://barbaz.qux"}, + {"url": "http://qux.quux"}, + ] + ) self.search.flush() - results = self.search.origin_search(url_pattern='qu') - assert results['next_page_token'] is None + results = self.search.origin_search(url_pattern="qu") + assert results["next_page_token"] is None - results = [res['url'] for res in results['results']] - expected_results = ['http://qux.quux', 'http://barbaz.qux'] + results = [res["url"] for res in results["results"]] + expected_results = ["http://qux.quux", "http://barbaz.qux"] assert sorted(results) == sorted(expected_results) - results = self.search.origin_search(url_pattern='qux') - assert results['next_page_token'] is None + results = self.search.origin_search(url_pattern="qux") + assert results["next_page_token"] is None - results = [res['url'] for res in results['results']] - expected_results = ['http://barbaz.qux', 'http://qux.quux'] + results = [res["url"] for res in results["results"]] + expected_results = ["http://barbaz.qux", "http://qux.quux"] assert sorted(results) == sorted(expected_results) def test_origin_url_all_terms(self): - self.search.origin_update([ - {'url': 'http://foo.bar/baz'}, - {'url': 'http://foo.bar/foo.bar'}, - ]) + self.search.origin_update( + [{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},] + ) self.search.flush() # Only results containing all terms should be returned. 
- results = self.search.origin_search(url_pattern='foo bar baz') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foo.bar/baz'}, - ]} + results = self.search.origin_search(url_pattern="foo bar baz") + assert results == { + "next_page_token": None, + "results": [{"url": "http://foo.bar/baz"},], + } def test_origin_with_visit(self): - self.search.origin_update([ - {'url': 'http://foobar.baz', 'has_visits': True}, - ]) + self.search.origin_update( + [{"url": "http://foobar.baz", "has_visits": True},] + ) self.search.flush() - results = self.search.origin_search( - url_pattern='foobar', with_visit=True) - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foobar.baz'}]} + results = self.search.origin_search(url_pattern="foobar", with_visit=True) + assert results == { + "next_page_token": None, + "results": [{"url": "http://foobar.baz"}], + } def test_origin_with_visit_added(self): - self.search.origin_update([ - {'url': 'http://foobar.baz'}, - ]) + self.search.origin_update( + [{"url": "http://foobar.baz"},] + ) self.search.flush() - results = self.search.origin_search( - url_pattern='foobar', with_visit=True) - assert results == {'next_page_token': None, 'results': []} + results = self.search.origin_search(url_pattern="foobar", with_visit=True) + assert results == {"next_page_token": None, "results": []} - self.search.origin_update([ - {'url': 'http://foobar.baz', 'has_visits': True}, - ]) + self.search.origin_update( + [{"url": "http://foobar.baz", "has_visits": True},] + ) self.search.flush() - results = self.search.origin_search( - url_pattern='foobar', with_visit=True) - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://foobar.baz'}]} + results = self.search.origin_search(url_pattern="foobar", with_visit=True) + assert results == { + "next_page_token": None, + "results": [{"url": "http://foobar.baz"}], + } def test_origin_intrinsic_metadata_description(self): - self.search.origin_update([ - { - 'url': 'http://origin1', - 'intrinsic_metadata': {}, - }, - { - 'url': 'http://origin2', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'foo bar', + self.search.origin_update( + [ + {"url": "http://origin1", "intrinsic_metadata": {},}, + { + "url": "http://origin2", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + }, + }, + { + "url": "http://origin3", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "bar baz", + }, }, - }, - { - 'url': 'http://origin3', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'bar baz', - } - }, - ]) + ] + ) self.search.flush() - results = self.search.origin_search(metadata_pattern='foo') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin2'}]} + results = self.search.origin_search(metadata_pattern="foo") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin2"}], + } - results = self.search.origin_search(metadata_pattern='foo bar') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin2'}]} + results = self.search.origin_search(metadata_pattern="foo bar") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin2"}], + } - results = self.search.origin_search(metadata_pattern='bar baz') - assert results == {'next_page_token': None, 'results': [ - 
{'url': 'http://origin3'}]} + results = self.search.origin_search(metadata_pattern="bar baz") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin3"}], + } def test_origin_intrinsic_metadata_all_terms(self): - self.search.origin_update([ - { - 'url': 'http://origin1', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'foo bar foo bar', + self.search.origin_update( + [ + { + "url": "http://origin1", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar foo bar", + }, }, - }, - { - 'url': 'http://origin3', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'description': 'foo bar baz', - } - }, - ]) + { + "url": "http://origin3", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar baz", + }, + }, + ] + ) self.search.flush() - results = self.search.origin_search(metadata_pattern='foo bar baz') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin3'}]} + results = self.search.origin_search(metadata_pattern="foo bar baz") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin3"}], + } def test_origin_intrinsic_metadata_nested(self): - self.search.origin_update([ - { - 'url': 'http://origin1', - 'intrinsic_metadata': {}, - }, - { - 'url': 'http://origin2', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'keywords': ['foo', 'bar'], + self.search.origin_update( + [ + {"url": "http://origin1", "intrinsic_metadata": {},}, + { + "url": "http://origin2", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "keywords": ["foo", "bar"], + }, + }, + { + "url": "http://origin3", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "keywords": ["bar", "baz"], + }, }, - }, - { - 'url': 'http://origin3', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'keywords': ['bar', 'baz'], - } - }, - ]) + ] + ) self.search.flush() - results = self.search.origin_search(metadata_pattern='foo') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin2'}]} + results = self.search.origin_search(metadata_pattern="foo") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin2"}], + } - results = self.search.origin_search(metadata_pattern='foo bar') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin2'}]} + results = self.search.origin_search(metadata_pattern="foo bar") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin2"}], + } - results = self.search.origin_search(metadata_pattern='bar baz') - assert results == {'next_page_token': None, 'results': [ - {'url': 'http://origin3'}]} + results = self.search.origin_search(metadata_pattern="bar baz") + assert results == { + "next_page_token": None, + "results": [{"url": "http://origin3"}], + } # TODO: add more tests with more codemeta terms # TODO: add more tests with edge cases @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) def test_origin_url_paging(self, count): # TODO: no hypothesis self.reset() - self.search.origin_update([ - {'url': 'http://origin1/foo'}, - {'url': 'http://origin2/foo/bar'}, - {'url': 'http://origin3/foo/bar/baz'}, - ]) + self.search.origin_update( + [ + 
{"url": "http://origin1/foo"}, + {"url": "http://origin2/foo/bar"}, + {"url": "http://origin3/foo/bar/baz"}, + ] + ) self.search.flush() results = stream_results( - self.search.origin_search, - url_pattern='foo bar baz', count=count) - results = [res['url'] for res in results] + self.search.origin_search, url_pattern="foo bar baz", count=count + ) + results = [res["url"] for res in results] expected_results = [ - 'http://origin3/foo/bar/baz', + "http://origin3/foo/bar/baz", ] - assert sorted(results[0:len(expected_results)]) == \ - sorted(expected_results) + assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) results = stream_results( - self.search.origin_search, - url_pattern='foo bar', count=count) + self.search.origin_search, url_pattern="foo bar", count=count + ) expected_results = [ - 'http://origin2/foo/bar', - 'http://origin3/foo/bar/baz', + "http://origin2/foo/bar", + "http://origin3/foo/bar/baz", ] - results = [res['url'] for res in results] - assert sorted(results[0:len(expected_results)]) == \ - sorted(expected_results) + results = [res["url"] for res in results] + assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) results = stream_results( - self.search.origin_search, - url_pattern='foo', count=count) + self.search.origin_search, url_pattern="foo", count=count + ) expected_results = [ - 'http://origin1/foo', - 'http://origin2/foo/bar', - 'http://origin3/foo/bar/baz', + "http://origin1/foo", + "http://origin2/foo/bar", + "http://origin3/foo/bar/baz", ] - results = [res['url'] for res in results] - assert sorted(results[0:len(expected_results)]) == \ - sorted(expected_results) + results = [res["url"] for res in results] + assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) def test_origin_intrinsic_metadata_paging(self, count): # TODO: no hypothesis self.reset() - self.search.origin_update([ - { - 'url': 'http://origin1', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'keywords': ['foo'], + self.search.origin_update( + [ + { + "url": "http://origin1", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "keywords": ["foo"], + }, }, - }, - { - 'url': 'http://origin2', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'keywords': ['foo', 'bar'], + { + "url": "http://origin2", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "keywords": ["foo", "bar"], + }, }, - }, - { - 'url': 'http://origin3', - 'intrinsic_metadata': { - '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', - 'keywords': ['foo', 'bar', 'baz'], - } - }, - ]) + { + "url": "http://origin3", + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "keywords": ["foo", "bar", "baz"], + }, + }, + ] + ) self.search.flush() results = stream_results( - self.search.origin_search, - metadata_pattern='foo bar baz', count=count) - assert list(results) == [ - {'url': 'http://origin3'}] + self.search.origin_search, metadata_pattern="foo bar baz", count=count + ) + assert list(results) == [{"url": "http://origin3"}] results = stream_results( - self.search.origin_search, - metadata_pattern='foo bar', count=count) - assert list(results) == [ - {'url': 'http://origin2'}, - {'url': 'http://origin3'}] + self.search.origin_search, metadata_pattern="foo bar", count=count + ) + assert 
list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}] results = stream_results( - self.search.origin_search, - metadata_pattern='foo', count=count) + self.search.origin_search, metadata_pattern="foo", count=count + ) assert list(results) == [ - {'url': 'http://origin1'}, - {'url': 'http://origin2'}, - {'url': 'http://origin3'}] + {"url": "http://origin1"}, + {"url": "http://origin2"}, + {"url": "http://origin3"}, + ] diff --git a/swh/search/utils.py b/swh/search/utils.py index b224573..fce8c4e 100644 --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -1,16 +1,16 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def stream_results(f, *args, **kwargs): - if 'page_token' in kwargs: + if "page_token" in kwargs: raise TypeError('stream_results has no argument "page_token".') page_token = None while True: results = f(*args, page_token=page_token, **kwargs) - yield from results['results'] - page_token = results['next_page_token'] + yield from results["results"] + page_token = results["next_page_token"] if page_token is None: break diff --git a/tox.ini b/tox.ini index 04fb628..df58667 100644 --- a/tox.ini +++ b/tox.ini @@ -1,27 +1,34 @@ [tox] -envlist=flake8,mypy,py3 +envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = pytest-cov commands = pytest --cov={envsitepackagesdir}/swh/search \ {envsitepackagesdir}/swh/search \ --cov-branch {posargs} +[testenv:black] +skip_install = true +deps = + black +commands = + {envpython} -m black --check swh + [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 [testenv:mypy] extras = testing deps = mypy commands = mypy swh