@app.before_first_request
def initialized_index():
    """Initialize the search backend before the first request is served.

    Registered through ``app.before_first_request``: fetches the backend with
    ``_get_search()``, logs its ``origin_config`` and calls ``initialize()``
    on it.
    """
    # Local name differs from the module-level ``search`` global on purpose,
    # so the cached backend is not shadowed inside this function.
    backend = _get_search()
    # The message must carry a %s placeholder: without it the logging module
    # cannot merge the argument ("not all arguments converted during string
    # formatting") and emits a logging error instead of the message.
    logger.info("Initializing indexes with configuration: %s", backend.origin_config)
    backend.initialize()
def make_app_from_configfile():
    """Return the WSGI app, configured from a configuration file.

    The SWH_CONFIG_FILENAME environment variable defines the path of the
    configuration file to load.

    The configuration is loaded, and the stream log handler attached, only on
    the first call (while ``api_cfg`` is still unset); later calls return the
    already configured ``app`` untouched.

    Returns:
        the module-level ``app`` object

    Raises:
        Errors raised by ``load_and_check_config`` propagate unchanged.
    """
    global api_cfg
    if not api_cfg:
        config_file = os.environ.get("SWH_CONFIG_FILENAME")
        api_cfg = load_and_check_config(config_file)
        app.config.update(api_cfg)
        # Attach the handler together with the one-time configuration load:
        # adding it on every call would stack one more StreamHandler each
        # time and duplicate every log line.
        handler = logging.StreamHandler()
        app.logger.addHandler(handler)
    return app
@search_cli_group.command(name="rpc-serve")
@click.option(
    "--host",
    default="0.0.0.0",
    metavar="IP",
    show_default=True,
    help="Host ip address to bind the server on",
)
@click.option(
    "--port",
    default=5010,
    type=click.INT,
    metavar="PORT",
    show_default=True,
    help="Binding port of the server",
)
@click.option(
    "--debug/--no-debug",
    default=True,
    help="Indicates if the server should run in debug mode",
)
@click.pass_context
def serve(ctx, host, port, debug):
    """Software Heritage Search RPC server.

    Do NOT use this in a production environment.
    """
    # NOTE: the docstring above is the help text shown by the CLI; it used to
    # advertise the "Storage" server although this command runs the search app.

    # Imported here rather than at module level to keep CLI startup time
    # under control.
    from swh.search.api.server import app

    # Only tune werkzeug's verbosity when a log level was stored in the context.
    if "log_level" in ctx.obj:
        logging.getLogger("werkzeug").setLevel(ctx.obj["log_level"])
    # The configuration read by the parent command group becomes the app config.
    app.config.update(ctx.obj["config"])
    app.run(host, port=int(port), debug=bool(debug))
import get_search from .journal_client import process_journal_objects config = ctx.obj["config"] journal_cfg = config["journal"] journal_cfg["object_types"] = object_type or journal_cfg.get("object_types", []) journal_cfg["prefix"] = prefix or journal_cfg.get("prefix") journal_cfg["stop_after_objects"] = stop_after_objects or journal_cfg.get( "stop_after_objects" ) if len(journal_cfg["object_types"]) == 0: raise ValueError("'object_types' must be specified by cli or configuration") if journal_cfg["prefix"] is None: raise ValueError("'prefix' must be specified by cli or configuration") - client = get_journal_client(cls="kafka", **journal_cfg,) + client = get_journal_client( + cls="kafka", + **journal_cfg, + ) search = get_search(**config["search"]) storage = get_storage(**config["storage"]) worker_fn = functools.partial( process_journal_objects, search=search, storage=storage ) nb_messages = 0 try: nb_messages = client.process(worker_fn) print("Processed %d messages." % nb_messages) except KeyboardInterrupt: ctx.exit(0) else: print("Done.") finally: client.close() diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index e83053e..3f156e1 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,554 +1,582 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from collections import Counter import logging import pprint from textwrap import dedent from typing import Any, Dict, Iterable, List, Optional from elasticsearch import Elasticsearch, helpers import msgpack from swh.indexer import codemeta from swh.model import model from swh.model.hashutil import hash_to_hex from swh.search.interface import ( SORT_BY_OPTIONS, MinimalOriginDict, OriginDict, PagedResult, ) from swh.search.metrics import send_metric, timed from 
swh.search.translator import Translator from swh.search.utils import escape, get_expansion, parse_and_format_date logger = logging.getLogger(__name__) INDEX_NAME_PARAM = "index" READ_ALIAS_PARAM = "read_alias" WRITE_ALIAS_PARAM = "write_alias" ORIGIN_DEFAULT_CONFIG = { INDEX_NAME_PARAM: "origin", READ_ALIAS_PARAM: "origin-read", WRITE_ALIAS_PARAM: "origin-write", } ORIGIN_MAPPING = { "dynamic_templates": [ { "booleans_as_string": { # All fields stored as string in the metadata # even the booleans "match_mapping_type": "boolean", "path_match": "intrinsic_metadata.*", "mapping": {"type": "keyword"}, } } ], "date_detection": False, "properties": { # sha1 of the URL; used as the document id - "sha1": {"type": "keyword", "doc_values": True,}, + "sha1": { + "type": "keyword", + "doc_values": True, + }, # Used both to search URLs, and as the result to return # as a response to queries "url": { "type": "text", # To split URLs into token on any character # that is not alphanumerical "analyzer": "simple", # 2-gram and partial-3-gram search (ie. 
with the end of the # third word potentially missing) "fields": { - "as_you_type": {"type": "search_as_you_type", "analyzer": "simple",} + "as_you_type": { + "type": "search_as_you_type", + "analyzer": "simple", + } }, }, "visit_types": {"type": "keyword"}, # used to filter out origins that were never visited - "has_visits": {"type": "boolean",}, + "has_visits": { + "type": "boolean", + }, "nb_visits": {"type": "integer"}, "snapshot_id": {"type": "keyword"}, "last_visit_date": {"type": "date"}, "last_eventful_visit_date": {"type": "date"}, "last_release_date": {"type": "date"}, "last_revision_date": {"type": "date"}, "intrinsic_metadata": { "type": "nested", "properties": { "@context": { # don't bother indexing tokens in these URIs, as the # are used as namespaces "type": "keyword", }, "http://schema": { "properties": { "org/dateCreated": { - "properties": {"@value": {"type": "date",}} + "properties": { + "@value": { + "type": "date", + } + } }, "org/dateModified": { - "properties": {"@value": {"type": "date",}} + "properties": { + "@value": { + "type": "date", + } + } }, "org/datePublished": { - "properties": {"@value": {"type": "date",}} + "properties": { + "@value": { + "type": "date", + } + } }, } }, }, }, # Has this origin been taken down? 
- "blocklisted": {"type": "boolean",}, + "blocklisted": { + "type": "boolean", + }, }, } # painless script that will be executed when updating an origin document ORIGIN_UPDATE_SCRIPT = dedent( """ // utility function to get and parse date ZonedDateTime getDate(def ctx, String date_field) { String default_date = "0001-01-01T00:00:00Z"; String date = ctx._source.getOrDefault(date_field, default_date); return ZonedDateTime.parse(date); } // backup current visit_types field value List visit_types = ctx._source.getOrDefault("visit_types", []); int nb_visits = ctx._source.getOrDefault("nb_visits", 0); ZonedDateTime last_visit_date = getDate(ctx, "last_visit_date"); String snapshot_id = ctx._source.getOrDefault("snapshot_id", ""); ZonedDateTime last_eventful_visit_date = getDate(ctx, "last_eventful_visit_date"); ZonedDateTime last_revision_date = getDate(ctx, "last_revision_date"); ZonedDateTime last_release_date = getDate(ctx, "last_release_date"); // update origin document with new field values ctx._source.putAll(params); // restore previous visit types after visit_types field overriding if (ctx._source.containsKey("visit_types")) { for (int i = 0; i < visit_types.length; ++i) { if (!ctx._source.visit_types.contains(visit_types[i])) { ctx._source.visit_types.add(visit_types[i]); } } } // Undo overwrite if incoming nb_visits is smaller if (ctx._source.containsKey("nb_visits")) { int incoming_nb_visits = ctx._source.getOrDefault("nb_visits", 0); if(incoming_nb_visits < nb_visits){ ctx._source.nb_visits = nb_visits; } } // Undo overwrite if incoming last_visit_date is older if (ctx._source.containsKey("last_visit_date")) { ZonedDateTime incoming_last_visit_date = getDate(ctx, "last_visit_date"); int difference = // returns -1, 0 or 1 incoming_last_visit_date.compareTo(last_visit_date); if(difference < 0){ ctx._source.last_visit_date = last_visit_date; } } // Undo update of last_eventful_date and snapshot_id if // snapshot_id hasn't changed OR 
def _sanitize_origin(origin):
    """Return the subset of ``origin`` that gets saved in Elasticsearch.

    Only ``url`` (mandatory) and the whitelisted fields listed below are kept.
    When ``intrinsic_metadata`` is present, its ``dateCreated``,
    ``dateModified`` and ``datePublished`` entries are replaced by the output
    of ``parse_and_format_date`` (entries for which it returns ``None`` are
    dropped), and the result is passed through ``codemeta.expand``.

    Neither ``origin`` nor its ``intrinsic_metadata`` value is modified.

    Args:
        origin: origin document, as a dict with at least a ``url`` key

    Returns:
        a new dict holding only the whitelisted fields

    Raises:
        KeyError: if ``origin`` has no ``url`` key
    """
    # Whitelist fields to be saved in Elasticsearch
    res = {"url": origin["url"]}
    for field_name in (
        "blocklisted",
        "has_visits",
        "intrinsic_metadata",
        "visit_types",
        "nb_visits",
        "snapshot_id",
        "last_visit_date",
        "last_eventful_visit_date",
        "last_revision_date",
        "last_release_date",
    ):
        if field_name in origin:
            res[field_name] = origin[field_name]

    # Run the JSON-LD expansion algorithm to normalize the Codemeta metadata.
    # This is required as Elasticsearch needs each field to have a consistent
    # type across documents to be searchable; and non-expanded JSON-LD documents
    # can have various types in the same field. For example, all these are
    # equivalent in JSON-LD:
    # * {"author": "Jane Doe"}
    # * {"author": ["Jane Doe"]}
    # * {"author": {"@value": "Jane Doe"}}
    # * {"author": [{"@value": "Jane Doe"}]}
    # and JSON-LD expansion will convert them all to the last one.
    if "intrinsic_metadata" in res:
        # Work on a copy: the date clean-up below rewrites and removes keys,
        # and that must not leak into the document owned by the caller.
        intrinsic_metadata = res["intrinsic_metadata"].copy()

        for date_field in ["dateCreated", "dateModified", "datePublished"]:
            if date_field in intrinsic_metadata:
                date = intrinsic_metadata[date_field]

                # If date{Created,Modified,Published} value isn't parsable
                # It gets rejected and isn't stored (unlike other fields)
                formatted_date = parse_and_format_date(date)
                if formatted_date is None:
                    intrinsic_metadata.pop(date_field)
                else:
                    intrinsic_metadata[date_field] = formatted_date

        res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata)

    return res
@timed
def origin_search(
    self,
    *,
    query: str = "",
    url_pattern: Optional[str] = None,
    metadata_pattern: Optional[str] = None,
    with_visit: bool = False,
    visit_types: Optional[List[str]] = None,
    min_nb_visits: int = 0,
    min_last_visit_date: str = "",
    min_last_eventful_visit_date: str = "",
    min_last_revision_date: str = "",
    min_last_release_date: str = "",
    min_date_created: str = "",
    min_date_modified: str = "",
    min_date_published: str = "",
    programming_languages: Optional[List[str]] = None,
    licenses: Optional[List[str]] = None,
    keywords: Optional[List[str]] = None,
    sort_by: Optional[List[str]] = None,
    page_token: Optional[str] = None,
    limit: int = 50,
) -> PagedResult[MinimalOriginDict]:
    """Search origins through the read alias of the origin index.

    Each keyword filter is rendered as a clause of the search query language;
    the clauses are joined with ``and`` (and with ``query`` when it is given)
    and the resulting expression is parsed by the translator into the
    Elasticsearch filters. Documents whose ``blocklisted`` field is true are
    always excluded.

    Args:
        query: query-language expression; the parsed result may carry its
            own ``sortBy`` and ``limit`` settings
        url_pattern: adds an ``origin : <pattern>`` clause
        metadata_pattern: adds a ``metadata : <pattern>`` clause
        with_visit: when true, adds a ``visited = true`` clause
        visit_types: adds a ``visit_type = <list>`` clause whenever it is not
            ``None`` (an empty list included)
        min_nb_visits: adds a ``visits >= <n>`` clause
        min_last_visit_date, min_last_eventful_visit_date,
        min_last_revision_date, min_last_release_date, min_date_created,
        min_date_modified, min_date_published: lower bounds rendered as
            ``>=`` clauses; every ``Z`` in the value is rewritten to
            ``+00:00``
        programming_languages, licenses, keywords: rendered as ``language in``,
            ``license in`` and ``keyword in`` clauses
        sort_by: fields to sort on, a leading ``-`` meaning descending order;
            the list is never modified. Sort options found in the parsed query
            are appended after it. Fields that are neither one of the
            ``date_*`` metadata fields nor in ``SORT_BY_OPTIONS`` are ignored
        page_token: token returned by a previous call, to get the next page
        limit: maximum number of results; replaced by the ``limit`` of the
            parsed query when that one is set

    Returns:
        a ``PagedResult`` whose results are ``{"url": ...}`` dicts, and whose
        ``next_page_token`` is set when exactly ``limit`` hits came back
    """
    # Private copy: sort options coming from the query are appended below and
    # must not end up in the list object owned by the caller.
    sort_by = list(sort_by) if sort_by else []

    query_clauses: List[Dict[str, Any]] = []
    query_filters = []

    if url_pattern:
        query_filters.append(f"origin : {escape(url_pattern)}")

    if metadata_pattern:
        query_filters.append(f"metadata : {escape(metadata_pattern)}")

    if with_visit:
        query_filters.append("visited = true")
    if min_nb_visits:
        query_filters.append(f"visits >= {min_nb_visits}")
    if min_last_visit_date:
        query_filters.append(
            f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}"
        )
    if min_last_eventful_visit_date:
        query_filters.append(
            "last_eventful_visit >= "
            f"{min_last_eventful_visit_date.replace('Z', '+00:00')}"
        )
    if min_last_revision_date:
        query_filters.append(
            f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}"
        )
    if min_last_release_date:
        query_filters.append(
            f"last_release >= {min_last_release_date.replace('Z', '+00:00')}"
        )
    if keywords:
        query_filters.append(f"keyword in {escape(keywords)}")
    if licenses:
        query_filters.append(f"license in {escape(licenses)}")
    if programming_languages:
        query_filters.append(f"language in {escape(programming_languages)}")
    if min_date_created:
        query_filters.append(
            f"created >= {min_date_created.replace('Z', '+00:00')}"
        )
    if min_date_modified:
        query_filters.append(
            f"modified >= {min_date_modified.replace('Z', '+00:00')}"
        )
    if min_date_published:
        query_filters.append(
            f"published >= {min_date_published.replace('Z', '+00:00')}"
        )
    if visit_types is not None:
        query_filters.append(f"visit_type = {escape(visit_types)}")

    # Keyword filters come first, the free-form query (if any) last.
    combined_filters = " and ".join(query_filters)
    if combined_filters and query:
        query = f"{combined_filters} and {query}"
    else:
        query = combined_filters or query

    parsed_query = self._translator.parse_query(query)
    query_clauses.append(parsed_query["filters"])

    # Query-language sort names -> names used in ``sort_by``
    field_map = {
        "visits": "nb_visits",
        "last_visit": "last_visit_date",
        "last_eventful_visit": "last_eventful_visit_date",
        "last_revision": "last_revision_date",
        "last_release": "last_release_date",
        "created": "date_created",
        "modified": "date_modified",
        "published": "date_published",
    }

    if "sortBy" in parsed_query:
        for sort_by_option in parsed_query["sortBy"]:
            if sort_by_option[0] == "-":
                sort_by.append("-" + field_map[sort_by_option[1:]])
            else:
                sort_by.append(field_map[sort_by_option])
    if parsed_query.get("limit", 0):
        limit = parsed_query["limit"]

    sorting_params: List[Dict[str, Any]] = []

    for field in sort_by:
        order = "asc"
        if field and field[0] == "-":
            field = field[1:]
            order = "desc"

        if field in ["date_created", "date_modified", "date_published"]:
            # These live inside the nested intrinsic_metadata documents.
            sorting_params.append(
                {
                    get_expansion(field, "."): {
                        "nested_path": "intrinsic_metadata",
                        "order": order,
                    }
                }
            )
        elif field in SORT_BY_OPTIONS:
            sorting_params.append({field: order})

    # Score, then sha1, are always the last sort keys.
    sorting_params.extend(
        [
            {"_score": "desc"},
            {"sha1": "asc"},
        ]
    )

    body = {
        "query": {
            "bool": {
                "must": query_clauses,
                "must_not": [{"term": {"blocklisted": True}}],
            }
        },
        "sort": sorting_params,
    }

    if page_token:
        # TODO: use ElasticSearch's scroll API?
        # NOTE(review): the token only carries (score, sha1) while "sort" may
        # hold more keys when sort_by is used; confirm that search_after
        # accepts this, or store the hit's full sort values in the token.
        page_token_content = token_decode(page_token)
        body["search_after"] = [
            page_token_content[b"score"],
            page_token_content[b"sha1"].decode("ascii"),
        ]

    if logger.isEnabledFor(logging.DEBUG):
        formatted_body = pprint.pformat(body)
        logger.debug("Search query body: %s", formatted_body)

    res = self._backend.search(
        index=self._get_origin_read_alias(), body=body, size=limit
    )

    hits = res["hits"]["hits"]

    next_page_token: Optional[str] = None

    if len(hits) == limit:
        # There are more results after this page; return a pagination token
        # to get them in a future query
        last_hit = hits[-1]
        next_page_token_content = {
            b"score": last_hit["_score"],
            b"sha1": last_hit["_source"]["sha1"],
        }
        next_page_token = token_encode(next_page_token_content)

    assert len(hits) <= limit

    return PagedResult(
        results=[{"url": hit["_source"]["url"]} for hit in hits],
        next_page_token=next_page_token,
    )
_words_regexp = re.compile(r"\w+")


def _dict_words_set(d):
    """Recursively extract the set of words found in the content of ``d``.

    Dicts are traversed through their values (keys are not collected) and
    lists through their items; any other object is converted with ``str()``,
    lowercased and split into ``\\w+`` words.

    Args:
        d: the object to scan, typically a dict

    Returns:
        the set of lowercased words found
    """
    words = set()

    def collect(obj):
        if isinstance(obj, dict):
            # Only values carry content; keys are deliberately not collected.
            for value in obj.values():
                collect(value)
        elif isinstance(obj, list):
            for item in obj:
                collect(item)
        else:
            words.update(_words_regexp.findall(str(obj).lower()))

    collect(d)
    return words
def _get_sorting_key(origin, field):
    """Get value of the field from an origin for sorting origins.

    Here field should be "score" or a member of SORT_BY_OPTIONS.

    If "-" is present at the start of field then invert the value
    in a way that it reverses the sorting order.

    Args:
        origin: origin document (a dict)
        field: name of the field to sort on, optionally prefixed with "-"

    Returns:
        * for "score" and "nb_visits": the number stored in the origin (0 when
          missing), negated for a descending sort;
        * for the date_created/date_modified/date_published fields and for the
          other members of SORT_BY_OPTIONS: the parsed ``datetime`` for an
          ascending sort, and its distance to ``datetime.max`` (a
          ``timedelta``) for a descending one;
        * ``None`` for any other field.
    """
    descending = field[0] == "-"
    if descending:
        field = field[1:]

    if field in ("score", "nb_visits"):
        # unlike other options, these are plain numbers
        value = origin.get(field, 0)
        return -value if descending else value

    if field in ("date_created", "date_modified", "date_published"):
        # Day-precision dates from the metadata, parsed as naive datetimes;
        # a missing value sorts as the earliest possible date.
        date = datetime.strptime(
            _nested_get(origin, get_expansion(field), "0001-01-01")[0], "%Y-%m-%d"
        )
        return datetime.max - date if descending else date

    if field in SORT_BY_OPTIONS:
        # Full timestamps; "Z" is rewritten so fromisoformat() yields an
        # aware datetime, hence the aware maximum used for the inversion.
        date = datetime.fromisoformat(
            origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00")
        )
        if descending:
            return datetime.max.replace(tzinfo=timezone.utc) - date
        return date

    # Unknown field: kept as an explicit None for backward compatibility.
    return None
datetime.fromisoformat( self._origins[id_] - .get("last_visit_date", "0001-01-01T00:00:00.000000Z",) + .get( + "last_visit_date", + "0001-01-01T00:00:00.000000Z", + ) .replace("Z", "+00:00") ), ).isoformat() if "snapshot_id" in document and "last_eventful_visit_date" in document: incoming_date = datetime.fromisoformat( document["last_eventful_visit_date"] ) current_date = datetime.fromisoformat( self._origins[id_] - .get("last_eventful_visit_date", "0001-01-01T00:00:00Z",) + .get( + "last_eventful_visit_date", + "0001-01-01T00:00:00Z", + ) .replace("Z", "+00:00") ) incoming_snapshot_id = document["snapshot_id"] current_snapshot_id = self._origins[id_].get("snapshot_id", "") if ( incoming_snapshot_id == current_snapshot_id or incoming_date < current_date ): # update not required so override the incoming_values document["snapshot_id"] = current_snapshot_id document["last_eventful_visit_date"] = current_date.isoformat() if "last_revision_date" in document: document["last_revision_date"] = max( datetime.fromisoformat(document["last_revision_date"]), datetime.fromisoformat( self._origins[id_] - .get("last_revision_date", "0001-01-01T00:00:00Z",) + .get( + "last_revision_date", + "0001-01-01T00:00:00Z", + ) .replace("Z", "+00:00") ), ).isoformat() if "last_release_date" in document: document["last_release_date"] = max( datetime.fromisoformat(document["last_release_date"]), datetime.fromisoformat( self._origins[id_] - .get("last_release_date", "0001-01-01T00:00:00Z",) + .get( + "last_release_date", + "0001-01-01T00:00:00Z", + ) .replace("Z", "+00:00") ), ).isoformat() if "intrinsic_metadata" in document: intrinsic_metadata = document["intrinsic_metadata"] for date_field in ["dateCreated", "dateModified", "datePublished"]: if date_field in intrinsic_metadata: date = intrinsic_metadata[date_field] # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) formatted_date = parse_and_format_date(date) if formatted_date 
is None: intrinsic_metadata.pop(date_field) else: intrinsic_metadata[date_field] = formatted_date document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) if len(document["intrinsic_metadata"]) != 1: continue metadata = document["intrinsic_metadata"][0] if "http://schema.org/license" in metadata: metadata["http://schema.org/license"] = [ {"@id": license["@id"].lower()} for license in metadata["http://schema.org/license"] ] if "http://schema.org/programmingLanguage" in metadata: metadata["http://schema.org/programmingLanguage"] = [ {"@value": license["@value"].lower()} for license in metadata["http://schema.org/programmingLanguage"] ] self._origins[id_].update(document) if id_ not in self._origin_ids: self._origin_ids.append(id_) def origin_search( self, *, query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, visit_types: Optional[List[str]] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", min_date_created: str = "", min_date_modified: str = "", min_date_published: str = "", programming_languages: Optional[List[str]] = None, licenses: Optional[List[str]] = None, keywords: Optional[List[str]] = None, sort_by: Optional[List[str]] = None, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: hits = self._get_hits() if url_pattern: tokens = set(self._url_splitter.split(url_pattern)) def predicate(match): missing_tokens = tokens - match["_url_tokens"] if len(missing_tokens) == 0: return True elif len(missing_tokens) > 1: return False else: # There is one missing token, look up by prefix. 
(missing_token,) = missing_tokens return any( token.startswith(missing_token) for token in match["_url_tokens"] ) hits = filter(predicate, hits) if metadata_pattern: metadata_pattern_words = set( _words_regexp.findall(metadata_pattern.lower()) ) def predicate(match): if "intrinsic_metadata" not in match: return False return metadata_pattern_words.issubset( _dict_words_set(match["intrinsic_metadata"]) ) hits = filter(predicate, hits) if not url_pattern and not metadata_pattern: raise ValueError( "At least one of url_pattern and metadata_pattern must be provided." ) next_page_token: Optional[str] = None if with_visit: hits = filter(lambda o: o.get("has_visits"), hits) if min_nb_visits: hits = filter(lambda o: o.get("nb_visits", 0) >= min_nb_visits, hits) if min_last_visit_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_visit_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_visit_date), hits, ) if min_last_eventful_visit_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_eventful_visit_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_eventful_visit_date), hits, ) if min_last_revision_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_revision_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_revision_date), hits, ) if min_last_release_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_release_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_release_date), hits, ) if min_date_created: min_date_created_obj = datetime.strptime(min_date_created, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( _nested_get(o, get_expansion("date_created"))[0], "%Y-%m-%d" ) >= min_date_created_obj, hits, ) if min_date_modified: min_date_modified_obj = datetime.strptime(min_date_modified, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( 
_nested_get(o, get_expansion("date_modified"))[0], "%Y-%m-%d" ) >= min_date_modified_obj, hits, ) if min_date_published: min_date_published_obj = datetime.strptime(min_date_published, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( _nested_get(o, get_expansion("date_published"))[0], "%Y-%m-%d" ) >= min_date_published_obj, hits, ) if licenses: queried_licenses = [license_keyword.lower() for license_keyword in licenses] hits = filter( lambda o: any( # If any of the queried licenses are found, include the origin any( # returns True if queried_license_keyword is found # in any of the licenses of the origin queried_license_keyword in origin_license for origin_license in _nested_get(o, get_expansion("licenses")) ) for queried_license_keyword in queried_licenses ), hits, ) if programming_languages: queried_programming_languages = [ lang_keyword.lower() for lang_keyword in programming_languages ] hits = filter( lambda o: any( # If any of the queried languages are found, include the origin any( # returns True if queried_lang_keyword is found # in any of the langs of the origin queried_lang_keyword in origin_lang for origin_lang in _nested_get( o, get_expansion("programming_languages") ) ) for queried_lang_keyword in queried_programming_languages ), hits, ) if keywords: if sort_by: sort_by.append("-score") else: sort_by = ["-score"] from copy import deepcopy hits_list = deepcopy(list(hits)) for origin in hits_list: origin_keywords = [ _tokenize(keyword) for keyword in _nested_get(origin, get_expansion("keywords")) ] origin_descriptions = [ _tokenize(description) for description in _nested_get( origin, get_expansion("descriptions") ) ] for q_keyword in keywords: for origin_keyword_tokens in origin_keywords: if q_keyword in origin_keyword_tokens: origin["score"] = origin.get("score", 0) + 2 for origin_description_token in origin_descriptions: if q_keyword in origin_description_token: origin["score"] = origin.get("score", 0) + 1 hits = (origin for origin in hits_list if 
origin.get("score", 0) > 0) if visit_types is not None: visit_types_set = set(visit_types) hits = filter( lambda o: visit_types_set.intersection(o.get("visit_types", set())), hits, ) hits_list = list(hits) if sort_by: sort_by_list = list(sort_by) hits_list.sort( key=lambda o: tuple( _get_sorting_key(o, field) for field in sort_by_list ) ) start_at_index = int(page_token) if page_token else 0 origins = [ {"url": hit["url"]} for hit in hits_list[start_at_index : start_at_index + limit] ] if len(origins) == limit: next_page_token = str(start_at_index + limit) assert len(origins) <= limit - return PagedResult(results=origins, next_page_token=next_page_token,) + return PagedResult( + results=origins, + next_page_token=next_page_token, + ) def visit_types_count(self) -> Counter: hits = self._get_hits() return Counter(chain(*[hit.get("visit_types", []) for hit in hits])) def _get_hits(self) -> Iterator[Dict[str, Any]]: return ( self._origins[id_] for id_ in self._origin_ids if not self._origins[id_].get("blocklisted") ) diff --git a/swh/search/interface.py b/swh/search/interface.py index 03d148e..bdad43e 100644 --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -1,142 +1,137 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from typing import Iterable, List, Optional, TypeVar from typing_extensions import TypedDict from swh.core.api import remote_api_endpoint from swh.core.api.classes import PagedResult as CorePagedResult TResult = TypeVar("TResult") PagedResult = CorePagedResult[TResult, str] SORT_BY_OPTIONS = [ "nb_visits", "last_visit_date", "last_eventful_visit_date", "last_revision_date", "last_release_date", "date_created", "date_modified", "date_published", ] class MinimalOriginDict(TypedDict): """Mandatory keys of an 
:class:`OriginDict`""" url: str class OriginDict(MinimalOriginDict, total=False): """Argument passed to :meth:`SearchInterface.origin_update`.""" visit_types: List[str] has_visits: bool class SearchInterface: @remote_api_endpoint("check") def check(self): - """Dedicated method to execute some specific check per implementation. - - """ + """Dedicated method to execute some specific check per implementation.""" ... @remote_api_endpoint("flush") def flush(self) -> None: """Blocks until all previous calls to _update() are completely applied. """ ... @remote_api_endpoint("origin/update") def origin_update(self, documents: Iterable[OriginDict]) -> None: - """Persist documents to the search backend. - - """ + """Persist documents to the search backend.""" ... @remote_api_endpoint("origin/search") def origin_search( self, *, query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, visit_types: Optional[List[str]] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", min_date_created: str = "", min_date_modified: str = "", min_date_published: str = "", programming_languages: Optional[List[str]] = None, licenses: Optional[List[str]] = None, keywords: Optional[List[str]] = None, sort_by: Optional[List[str]] = None, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: """Searches for origins matching the `url_pattern`. Args: query: Find origins according the queries written as per the swh-search query language syntax. url_pattern: Part of the URL to search for metadata_pattern: Keywords to look for (across all the fields of intrinsic_metadata) with_visit: Whether origins with no visits are to be filtered out visit_types: Only origins having any of the provided visit types (e.g. 
git, svn, pypi) will be returned min_nb_visits: Filter origins that have number of visits >= the provided value min_last_visit_date: Filter origins that have last_visit_date on or after the provided date(ISO format) min_last_eventful_visit_date: Filter origins that have last_eventful_visit_date (eventful = snapshot_id changed) on or after the provided date(ISO format) min_last_revision_date: Filter origins that have last_revision_date on or after the provided date(ISO format) min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) min_date_created: Filter origins that have date_created from intrinsic_metadata on or after the provided date min_date_modified: Filter origins that have date_modified from intrinsic_metadata on or after the provided date min_date_published: Filter origins that have date_published from intrinsic_metadata on or after the provided date programming_languages: Filter origins with programming languages present in the given list (based on instrinsic_metadata) licenses: Filter origins with licenses present in the given list (based on instrinsic_metadata) keywords: Filter origins having description/keywords (extracted from instrinsic_metadata) that match given values sort_by: Sort results based on a list of fields mentioned in SORT_BY_OPTIONS (nb_visits,last_visit_date, last_eventful_visit_date, last_revision_date, last_release_date). Return results in descending order if "-" is present at the beginning otherwise in ascending order. page_token: Opaque value used for pagination limit: number of results to return Returns: PagedResult of origin dicts matching the search criteria. If next_page_token is None, there is no longer data to retrieve. """ ... @remote_api_endpoint("visit_types_count") def visit_types_count(self) -> Counter: - """Returns origin counts per visit type (git, hg, svn, ...). - """ + """Returns origin counts per visit type (git, hg, svn, ...).""" ... 
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py index 2884e10..a3c6e40 100644 --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -1,131 +1,134 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import sys from typing import Dict, Optional from swh.model.model import TargetType from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.interface import StorageInterface EXPECTED_MESSAGE_TYPES = { "origin", "origin_visit_status", "origin_intrinsic_metadata", } def fetch_last_revision_release_date( snapshot_id: bytes, storage: StorageInterface ) -> Dict[str, str]: if "pytest" not in sys.modules: # FIXME: This function is too slow to be reasonably used in the journal-client # (at least the main one), we need to figure out a solution before this can # be enabled again. 
return {} if not snapshot_id: return {} snapshot = snapshot_get_all_branches(storage, snapshot_id) if not snapshot: return {} branches = snapshot.branches.values() tip_revision_ids = [] tip_release_ids = [] for branch in branches: if branch.target_type == TargetType.REVISION: tip_revision_ids.append(branch.target) elif branch.target_type == TargetType.RELEASE: tip_release_ids.append(branch.target) revision_datetimes = [ revision.date.to_datetime() for revision in storage.revision_get(tip_revision_ids) if revision and revision.date ] release_datetimes = [ release.date.to_datetime() for release in storage.release_get(tip_release_ids) if release and release.date ] ret = {} if revision_datetimes: ret["last_revision_date"] = max(revision_datetimes).isoformat() if release_datetimes: ret["last_release_date"] = max(release_datetimes).isoformat() return ret def process_journal_objects(messages, *, search, storage=None): """Worker function for `JournalClient.process(worker_fn)`, after currification of `scheduler` and `task_names`.""" assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages) if "origin" in messages: process_origins(messages["origin"], search) if "origin_visit_status" in messages: process_origin_visit_statuses(messages["origin_visit_status"], search, storage) if "origin_intrinsic_metadata" in messages: process_origin_intrinsic_metadata(messages["origin_intrinsic_metadata"], search) def process_origins(origins, search): logging.debug("processing origins %r", origins) search.origin_update(origins) def process_origin_visit_statuses(visit_statuses, search, storage): logging.debug("processing origin visit statuses %r", visit_statuses) def hexify(b: Optional[bytes]) -> Optional[str]: if b is None: return None return b.hex() processed_visit_statuses = [] for visit_status in visit_statuses: processed_status = { "url": visit_status["origin"], "visit_types": [visit_status["type"]], } if visit_status["status"] == "full": processed_status.update( { "has_visits": True, 
"nb_visits": visit_status["visit"], "snapshot_id": hexify(visit_status.get("snapshot")), "last_visit_date": visit_status["date"].isoformat(), "last_eventful_visit_date": visit_status["date"].isoformat(), **fetch_last_revision_release_date( visit_status.get("snapshot"), storage ), } ) processed_visit_statuses.append(processed_status) if processed_visit_statuses: search.origin_update(processed_visit_statuses) def process_origin_intrinsic_metadata(origin_metadata, search): logging.debug("processing origin intrinsic_metadata %r", origin_metadata) origin_metadata = [ - {"url": item["id"], "intrinsic_metadata": item["metadata"],} + { + "url": item["id"], + "intrinsic_metadata": item["metadata"], + } for item in origin_metadata ] search.origin_update(origin_metadata) diff --git a/swh/search/metrics.py b/swh/search/metrics.py index f8afee0..fb7e964 100644 --- a/swh/search/metrics.py +++ b/swh/search/metrics.py @@ -1,63 +1,61 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from functools import wraps import logging from swh.core.statsd import statsd OPERATIONS_METRIC = "swh_search_operations_total" DURATION_METRIC = "swh_search_request_duration_seconds" def timed(f): - """Time that function! - - """ + """Time that function!""" @wraps(f) def d(*a, **kw): with statsd.timed(DURATION_METRIC, tags={"endpoint": f.__name__}): return f(*a, **kw) return d def send_metric(metric: str, count: int, method_name: str) -> bool: """Send statsd metric with count for method `method_name` If count is 0, the metric is discarded. If the metric is not parseable, the metric is discarded with a log message. 
Args: metric: Metric's name (e.g content:add, content:add:bytes) count: Associated value for the metric method_name: Method's name Returns: Bool to explicit if metric has been set or not """ if count == 0: return False metric_type = metric.split(":") _length = len(metric_type) if _length == 2: object_type, operation = metric_type metric_name = OPERATIONS_METRIC else: logging.warning("Skipping unknown metric {%s: %s}" % (metric, count)) return False statsd.increment( metric_name, count, tags={ "endpoint": method_name, "object_type": object_type, "operation": operation, }, ) return True diff --git a/swh/search/tests/test_api_client.py b/swh/search/tests/test_api_client.py index c8cf385..68f01e1 100644 --- a/swh/search/tests/test_api_client.py +++ b/swh/search/tests/test_api_client.py @@ -1,64 +1,67 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.core.api.tests.server_testing import ServerTestFixture from swh.search import get_search from swh.search.api.server import app from .test_elasticsearch import CommonElasticsearchSearchTest class TestRemoteSearch( CommonElasticsearchSearchTest, ServerTestFixture, unittest.TestCase ): @pytest.fixture(autouse=True) def _instantiate_search(self, elasticsearch_host): self._elasticsearch_host = elasticsearch_host def setUp(self): self.config = { "search": { "cls": "elasticsearch", "args": { "hosts": [self._elasticsearch_host], "indexes": { "origin": { "index": "test", "read_alias": "test-read", "write_alias": "test-write", } }, }, } } self.app = app super().setUp() self.reset() - self.search = get_search("remote", url=self.url(),) + self.search = get_search( + "remote", + url=self.url(), + ) def reset(self): search = get_search( "elasticsearch", hosts=[self._elasticsearch_host], indexes={ 
"origin": { "index": "test", "read_alias": "test-read", "write_alias": "test-write", } }, ) search.deinitialize() search.initialize() @pytest.mark.skip( "Elasticsearch also returns close matches, so this test would fail" ) def test_origin_url_paging(self, count): pass diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py index d4d2392..df25d8d 100644 --- a/swh/search/tests/test_cli.py +++ b/swh/search/tests/test_cli.py @@ -1,397 +1,398 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy from datetime import datetime, timezone import tempfile from click.testing import CliRunner from confluent_kafka import Producer import pytest import yaml from swh.journal.serializers import value_to_kafka from swh.model.hashutil import hash_to_bytes from swh.search import get_search from swh.search.cli import search_cli_group CLI_CONFIG = """ search: cls: elasticsearch hosts: - '%(elasticsearch_host)s' indexes: origin: index: test read_alias: test-read write_alias: test-write storage: cls: memory """ JOURNAL_OBJECTS_CONFIG_TEMPLATE = """ journal: brokers: - {broker} prefix: {prefix} group_id: {group_id} """ def invoke(catch_exceptions, args, config="", *, elasticsearch_host): runner = CliRunner() with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd: config_fd.write( (CLI_CONFIG + config) % {"elasticsearch_host": elasticsearch_host} ) config_fd.seek(0) result = runner.invoke(search_cli_group, ["-C" + config_fd.name] + args) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test__journal_client__origin( swh_search, elasticsearch_host: str, kafka_prefix: str, kafka_server ): """Tests the re-indexing when origin_batch_size*task_batch_size is a divisor of nb_origins.""" producer = 
Producer( { "bootstrap.servers": kafka_server, "client.id": "test search origin producer", "acks": "all", } ) origin_foobar_baz = { "url": "http://foobar.baz", } value = value_to_kafka(origin_foobar_baz) topic = f"{kafka_prefix}.origin" producer.produce(topic=topic, key=b"bogus-origin", value=value) producer.flush() journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format( broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer" ) result = invoke( False, [ "journal-client", "objects", "--stop-after-objects", "1", "--object-type", "origin", "--prefix", kafka_prefix, ], journal_objects_config, elasticsearch_host=elasticsearch_host, ) # Check the output expected_output = "Processed 1 messages.\nDone.\n" assert result.exit_code == 0, result.output assert result.output == expected_output swh_search.flush() # searching origin without visit as requirement actual_page = swh_search.origin_search(url_pattern="foobar") # We find it assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] # It's an origin with no visit, searching for it with visit actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True) # returns nothing assert actual_page.next_page_token is None assert actual_page.results == [] def test__journal_client__origin_visit_status( swh_search, elasticsearch_host, kafka_prefix: str, kafka_server ): - """Subscribing to origin-visit-status should result in swh-search indexation - - """ + """Subscribing to origin-visit-status should result in swh-search indexation""" origin_foobar = {"url": "http://baz.foobar"} producer = Producer( { "bootstrap.servers": kafka_server, "client.id": "test search origin visit status producer", "acks": "all", } ) topic = f"{kafka_prefix}.origin_visit_status" value = value_to_kafka( { "origin": origin_foobar["url"], "visit": 1, "type": "git", "date": datetime.now(tz=timezone.utc), "snapshot": None, "status": "full", } ) producer.produce(topic=topic, 
key=b"bogus-origin-visit-status", value=value) producer.flush() journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format( broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer" ) result = invoke( False, [ "journal-client", "objects", "--stop-after-objects", "1", "--prefix", kafka_prefix, "--object-type", "origin_visit_status", ], journal_objects_config, elasticsearch_host=elasticsearch_host, ) # Check the output expected_output = "Processed 1 messages.\nDone.\n" assert result.exit_code == 0, result.output assert result.output == expected_output swh_search.flush() # Both search returns the visit actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=False) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar] actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar] def test__journal_client__origin_intrinsic_metadata( swh_search, elasticsearch_host, kafka_prefix: str, kafka_server ): - """Subscribing to origin-intrinsic-metadata should result in swh-search indexation - - """ + """Subscribing to origin-intrinsic-metadata should result in swh-search indexation""" origin_foobar = {"url": "https://github.com/clojure/clojure"} origin_intrinsic_metadata = { "id": origin_foobar["url"], "metadata": { "name": "clojure", "type": "SoftwareSourceCode", "license": "http://opensource.org/licenses/eclipse-1.0.php", "version": "1.10.2-master-SNAPSHOT", "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "identifier": "org.clojure", "description": "Clojure core environment and runtime library.", "codeRepository": "https://repo.maven.apache.org/maven2/org/clojure/clojure", # noqa }, "indexer_configuration_id": 1, "from_revision": hash_to_bytes("f47c139e20970ee0852166f48ee2a4626632b86e"), "mappings": ["maven"], } producer = Producer( { "bootstrap.servers": kafka_server, "client.id": "test search 
origin intrinsic metadata producer", "acks": "all", } ) topic = f"{kafka_prefix}.origin_intrinsic_metadata" value = value_to_kafka(origin_intrinsic_metadata) producer.produce(topic=topic, key=b"bogus-origin-intrinsic-metadata", value=value) producer.flush() journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format( broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer" ) result = invoke( False, [ "journal-client", "objects", "--stop-after-objects", "1", "--object-type", "origin_intrinsic_metadata", ], journal_objects_config, elasticsearch_host=elasticsearch_host, ) # Check the output expected_output = "Processed 1 messages.\nDone.\n" assert result.exit_code == 0, result.output assert result.output == expected_output swh_search.flush() # search without visit returns the metadata actual_page = swh_search.origin_search(url_pattern="clojure", with_visit=False) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar] # no visit associated so it does not return anything actual_page = swh_search.origin_search(url_pattern="clojure", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [] def test__journal_client__missing_main_journal_config_key(elasticsearch_host): """Missing configuration on journal should raise""" with pytest.raises(KeyError, match="journal"): invoke( catch_exceptions=False, - args=["journal-client", "objects", "--stop-after-objects", "1",], + args=[ + "journal-client", + "objects", + "--stop-after-objects", + "1", + ], config="", # missing config will make it raise elasticsearch_host=elasticsearch_host, ) def test__journal_client__missing_journal_config_keys(elasticsearch_host): """Missing configuration on mandatory journal keys should raise""" kafka_prefix = "swh.journal.objects" journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format( broker="192.0.2.1", prefix=kafka_prefix, group_id="test-consumer" ) journal_config = yaml.safe_load(journal_objects_config) 
for key in journal_config["journal"].keys(): if key == "prefix": # optional continue cfg = copy.deepcopy(journal_config) del cfg["journal"][key] # make config incomplete yaml_cfg = yaml.dump(cfg) with pytest.raises(TypeError, match=f"{key}"): invoke( catch_exceptions=False, args=[ "journal-client", "objects", "--stop-after-objects", "1", "--prefix", kafka_prefix, "--object-type", "origin_visit_status", ], config=yaml_cfg, # incomplete config will make the cli raise elasticsearch_host=elasticsearch_host, ) def test__journal_client__missing_prefix_config_key( swh_search, elasticsearch_host, kafka_server ): """Missing configuration on mandatory prefix key should raise""" journal_cfg_template = """ journal: brokers: - {broker} group_id: {group_id} """ journal_cfg = journal_cfg_template.format( broker=kafka_server, group_id="test-consumer" ) with pytest.raises(ValueError, match="prefix"): invoke( False, # Missing --prefix (and no config key) will make the cli raise [ "journal-client", "objects", "--stop-after-objects", "1", "--object-type", "origin_visit_status", ], journal_cfg, elasticsearch_host=elasticsearch_host, ) def test__journal_client__missing_object_types_config_key( swh_search, elasticsearch_host, kafka_server ): """Missing configuration on mandatory object-types key should raise""" journal_cfg_template = """ journal: brokers: - {broker} prefix: swh.journal.objects group_id: {group_id} """ journal_cfg = journal_cfg_template.format( broker=kafka_server, group_id="test-consumer" ) with pytest.raises(ValueError, match="object_types"): invoke( False, # Missing --object-types (and no config key) will make the cli raise ["journal-client", "objects", "--stop-after-objects", "1"], journal_cfg, elasticsearch_host=elasticsearch_host, ) def test__initialize__with_index_name(elasticsearch_host): """Initializing the index with an index name should create the right index""" search = get_search( "elasticsearch", hosts=[elasticsearch_host], indexes={"origin": {"index": 
"test"}}, ) assert search._get_origin_index() == "test" assert search._get_origin_read_alias() == "origin-read" assert search._get_origin_write_alias() == "origin-write" def test__initialize__with_read_alias(elasticsearch_host): """Initializing the index with a search alias name should create - the right search alias""" + the right search alias""" search = get_search( "elasticsearch", hosts=[elasticsearch_host], indexes={"origin": {"read_alias": "test"}}, ) assert search._get_origin_index() == "origin" assert search._get_origin_read_alias() == "test" assert search._get_origin_write_alias() == "origin-write" def test__initialize__with_write_alias(elasticsearch_host): """Initializing the index with an indexing alias name should create - the right indexing alias""" + the right indexing alias""" search = get_search( "elasticsearch", hosts=[elasticsearch_host], indexes={"origin": {"write_alias": "test"}}, ) assert search._get_origin_index() == "origin" assert search._get_origin_read_alias() == "origin-read" assert search._get_origin_write_alias() == "test" diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py index 147317d..4483afd 100644 --- a/swh/search/tests/test_elasticsearch.py +++ b/swh/search/tests/test_elasticsearch.py @@ -1,281 +1,286 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from textwrap import dedent import types import unittest from elasticsearch.helpers.errors import BulkIndexError import pytest from swh.search.exc import SearchQuerySyntaxError from swh.search.metrics import OPERATIONS_METRIC from .test_search import CommonSearchTest now = datetime.now(tz=timezone.utc).isoformat() now_minus_5_days = (datetime.now(tz=timezone.utc) - 
timedelta(days=5)).isoformat() now_plus_5_days = (datetime.now(tz=timezone.utc) + timedelta(days=5)).isoformat() ORIGINS = [ { "url": "http://foobar.1.com", "nb_visits": 1, "last_visit_date": now_minus_5_days, "last_eventful_visit_date": now_minus_5_days, }, { "url": "http://foobar.2.com", "nb_visits": 2, "last_visit_date": now, "last_eventful_visit_date": now, }, { "url": "http://foobar.3.com", "nb_visits": 3, "last_visit_date": now_plus_5_days, "last_eventful_visit_date": now_minus_5_days, }, { "url": "http://barbaz.4.com", "nb_visits": 3, "last_visit_date": now_plus_5_days, "last_eventful_visit_date": now_minus_5_days, }, ] class CommonElasticsearchSearchTest(CommonSearchTest): """Tests shared between this module (direct ES backend test) and test_api_client.py (ES backend via HTTP test)""" def test_sort_by_and_limit_query(self): self.search.origin_update(ORIGINS) self.search.flush() def _check_results(query, origin_indices): page = self.search.origin_search(url_pattern="foobar", query=query) results = [r["url"] for r in page.results] assert results == [ORIGINS[index]["url"] for index in origin_indices] _check_results("sort_by = [-visits]", [2, 1, 0]) _check_results("sort_by = [last_visit]", [0, 1, 2]) _check_results("sort_by = [-last_eventful_visit, visits]", [1, 0, 2]) _check_results("sort_by = [last_eventful_visit,-last_visit]", [2, 0, 1]) _check_results("sort_by = [-visits] limit = 1", [2]) _check_results("sort_by = [last_visit] and limit = 2", [0, 1]) _check_results("sort_by = [-last_eventful_visit, visits] limit = 3", [1, 0, 2]) def test_search_ql_simple(self): self.search.origin_update(ORIGINS) self.search.flush() results = { r["url"] for r in self.search.origin_search(query='origin : "foobar"').results } assert results == { "http://foobar.1.com", "http://foobar.2.com", "http://foobar.3.com", } def test_search_ql_datetimes(self): self.search.origin_update(ORIGINS) self.search.flush() now_minus_5_minutes = ( datetime.now(tz=timezone.utc) - 
timedelta(minutes=5) ).isoformat() now_plus_5_minutes = ( datetime.now(tz=timezone.utc) + timedelta(minutes=5) ).isoformat() results = { r["url"] for r in self.search.origin_search( query=( f"last_visit < {now_minus_5_minutes} " f"or last_visit > {now_plus_5_minutes}" ) ).results } assert results == { "http://foobar.1.com", "http://foobar.3.com", "http://barbaz.4.com", } def test_search_ql_dates(self): self.search.origin_update(ORIGINS) self.search.flush() now_minus_2_days = ( (datetime.now(tz=timezone.utc) - timedelta(days=2)).date().isoformat() ) now_plus_2_days = ( (datetime.now(tz=timezone.utc) + timedelta(days=2)).date().isoformat() ) results = { r["url"] for r in self.search.origin_search( query=( f"last_visit < {now_minus_2_days} " f"or last_visit > {now_plus_2_days}" ) ).results } assert results == { "http://foobar.1.com", "http://foobar.3.com", "http://barbaz.4.com", } def test_search_ql_visited(self): self.search.origin_update( [ { "url": "http://foobar.1.com", "has_visits": True, "nb_visits": 1, "last_visit_date": now_minus_5_days, "last_eventful_visit_date": now_minus_5_days, }, - {"url": "http://foobar.2.com",}, - {"url": "http://foobar.3.com", "has_visits": False,}, + { + "url": "http://foobar.2.com", + }, + { + "url": "http://foobar.3.com", + "has_visits": False, + }, ] ) self.search.flush() assert { r["url"] for r in self.search.origin_search(query="visited = true").results } == {"http://foobar.1.com"} assert { r["url"] for r in self.search.origin_search(query="visited = false").results } == {"http://foobar.2.com", "http://foobar.3.com"} assert ( self.search.origin_search( query="visited = true and visited = false" ).results == [] ) assert ( self.search.origin_search(query="visited = false", with_visit=True).results == [] ) def test_query_syntax_error(self): self.search.origin_update(ORIGINS) self.search.flush() with pytest.raises(SearchQuerySyntaxError): self.search.origin_search(query="foobar") class 
TestElasticsearchSearch(CommonElasticsearchSearchTest, unittest.TestCase): @pytest.fixture(autouse=True) def _instantiate_search(self, swh_search, elasticsearch_host, mocker): self._elasticsearch_host = elasticsearch_host self.search = swh_search self.mocker = mocker # override self.search.origin_update to catch painless script errors # and pretty print them origin_update = self.search.origin_update def _origin_update(self, *args, **kwargs): script_error = False error_detail = "" try: origin_update(*args, **kwargs) except BulkIndexError as e: error = e.errors[0].get("update", {}).get("error", {}).get("caused_by") if error and "script_stack" in error: script_error = True error_detail = dedent( f""" Painless update script failed ({error.get('reason')}). error type: {error.get('caused_by', {}).get('type')} error reason: {error.get('caused_by', {}).get('reason')} script stack: """ ) error_detail += "\n".join(error["script_stack"]) else: raise e assert script_error is False, error_detail[1:] self.search.origin_update = types.MethodType(_origin_update, self.search) def reset(self): self.search.deinitialize() self.search.initialize() def test_metrics_update_duration(self): mock = self.mocker.patch("swh.search.metrics.statsd.timing") for url in ["http://foobar.bar", "http://foobar.baz"]: self.search.origin_update([{"url": url}]) assert mock.call_count == 2 def test_metrics_search_duration(self): mock = self.mocker.patch("swh.search.metrics.statsd.timing") for url_pattern in ["foobar", "foobaz"]: self.search.origin_search(url_pattern=url_pattern, with_visit=True) assert mock.call_count == 2 def test_metrics_indexation_counters(self): mock_es = self.mocker.patch("elasticsearch.helpers.bulk") mock_es.return_value = 2, ["error"] mock_metrics = self.mocker.patch("swh.search.metrics.statsd.increment") self.search.origin_update([{"url": "http://foobar.baz"}]) assert mock_metrics.call_count == 2 mock_metrics.assert_any_call( OPERATIONS_METRIC, 2, tags={ "endpoint": 
"origin_update", "object_type": "document", "operation": "index", }, ) mock_metrics.assert_any_call( OPERATIONS_METRIC, 1, tags={ "endpoint": "origin_update", "object_type": "document", "operation": "index_error", }, ) def test_write_alias_usage(self): mock = self.mocker.patch("elasticsearch.helpers.bulk") mock.return_value = 2, ["result"] self.search.origin_update([{"url": "http://foobar.baz"}]) assert mock.call_args[1]["index"] == "test-write" def test_read_alias_usage(self): mock = self.mocker.patch("elasticsearch.Elasticsearch.search") mock.return_value = {"hits": {"hits": []}} self.search.origin_search(url_pattern="foobar.baz") assert mock.call_args[1]["index"] == "test-read" diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py index f1bd668..6c21531 100644 --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -1,291 +1,319 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone import functools from unittest.mock import MagicMock import pytest from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.search.journal_client import ( fetch_last_revision_release_date, process_journal_objects, ) from swh.storage import get_storage DATES = [ TimestampWithTimezone.from_datetime( datetime(2009, 2, 14, 1, 31, 31, tzinfo=timezone(timedelta(hours=2))) ), TimestampWithTimezone.from_datetime( datetime(2009, 2, 14, 1, 31, 32, tzinfo=timezone(timedelta(hours=2))) ), TimestampWithTimezone.from_datetime( datetime(2009, 2, 14, 1, 31, 33, tzinfo=timezone(timedelta(hours=2))) ), TimestampWithTimezone.from_datetime( 
datetime(2009, 2, 14, 1, 31, 34, tzinfo=timezone(timedelta(hours=2))) ), ] COMMITTERS = [ Person(fullname=b"foo", name=b"foo", email=b""), Person(fullname=b"bar", name=b"bar", email=b""), ] REVISIONS = [ Revision( message=b"revision_1_message", date=DATES[0], committer=COMMITTERS[0], author=COMMITTERS[0], committer_date=DATES[0], type=RevisionType.GIT, directory=b"\x01" * 20, synthetic=False, metadata=None, parents=( hash_to_bytes("9b918dd063cec85c2bc63cc7f167e29f5894dcbc"), hash_to_bytes("757f38bdcd8473aaa12df55357f5e2f1a318e672"), ), ), Revision( message=b"revision_2_message", date=DATES[1], committer=COMMITTERS[1], author=COMMITTERS[1], committer_date=DATES[1], type=RevisionType.MERCURIAL, directory=b"\x02" * 20, synthetic=False, metadata=None, parents=(), extra_headers=((b"foo", b"bar"),), ), Revision( message=b"revision_3_message", date=DATES[2], committer=COMMITTERS[0], author=COMMITTERS[0], committer_date=DATES[2], type=RevisionType.GIT, directory=b"\x03" * 20, synthetic=False, metadata=None, parents=(), ), ] RELEASES = [ Release( name=b"v0.0.1", date=DATES[1], author=COMMITTERS[0], target_type=ObjectType.REVISION, target=b"\x04" * 20, message=b"foo", synthetic=False, ), Release( name=b"v0.0.2", date=DATES[2], author=COMMITTERS[1], target_type=ObjectType.REVISION, target=b"\x05" * 20, message=b"bar", synthetic=False, ), Release( name=b"v0.0.3", date=DATES[3], author=COMMITTERS[1], target_type=ObjectType.REVISION, target=b"\x05" * 20, message=b"foobar", synthetic=False, ), ] SNAPSHOTS = [ Snapshot( branches={ b"target/revision1": SnapshotBranch( - target_type=TargetType.REVISION, target=REVISIONS[0].id, + target_type=TargetType.REVISION, + target=REVISIONS[0].id, ), b"target/revision2": SnapshotBranch( - target_type=TargetType.REVISION, target=REVISIONS[1].id, + target_type=TargetType.REVISION, + target=REVISIONS[1].id, ), b"target/revision3": SnapshotBranch( - target_type=TargetType.REVISION, target=REVISIONS[2].id, + target_type=TargetType.REVISION, + 
target=REVISIONS[2].id, ), b"target/release1": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[0].id ), b"target/release2": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[1].id ), b"target/release3": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[2].id ), b"target/alias": SnapshotBranch( target_type=TargetType.ALIAS, target=b"target/revision1" ), }, ), Snapshot( branches={ b"target/revision1": SnapshotBranch( - target_type=TargetType.REVISION, target=REVISIONS[0].id, + target_type=TargetType.REVISION, + target=REVISIONS[0].id, ) }, ), Snapshot( branches={ b"target/release1": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[0].id ) }, ), Snapshot(branches={}), ] @pytest.fixture def storage(): storage = get_storage("memory") storage.revision_add(REVISIONS) storage.release_add(RELEASES) storage.snapshot_add(SNAPSHOTS) return storage def test_journal_client_origin_from_journal(): search_mock = MagicMock() - worker_fn = functools.partial(process_journal_objects, search=search_mock,) + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) - worker_fn({"origin": [{"url": "http://foobar.baz"},]}) + worker_fn( + { + "origin": [ + {"url": "http://foobar.baz"}, + ] + } + ) search_mock.origin_update.assert_called_once_with( - [{"url": "http://foobar.baz"},] + [ + {"url": "http://foobar.baz"}, + ] ) search_mock.reset_mock() - worker_fn({"origin": [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]}) + worker_fn( + { + "origin": [ + {"url": "http://foobar.baz"}, + {"url": "http://barbaz.qux"}, + ] + } + ) search_mock.origin_update.assert_called_once_with( - [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},] + [ + {"url": "http://foobar.baz"}, + {"url": "http://barbaz.qux"}, + ] ) def test_journal_client_origin_visit_status_from_journal(storage): search_mock = MagicMock() worker_fn = functools.partial( process_journal_objects, search=search_mock, storage=storage ) 
current_datetime = datetime.now(tz=timezone.utc) worker_fn( { "origin_visit_status": [ { "origin": "http://foobar.baz", "status": "full", "type": "git", "visit": 5, "date": current_datetime, "snapshot": SNAPSHOTS[0].id, } # full visits ok ] } ) search_mock.origin_update.assert_called_once_with( [ { "url": "http://foobar.baz", "visit_types": ["git"], "has_visits": True, "nb_visits": 5, "snapshot_id": SNAPSHOTS[0].id.hex(), "last_visit_date": current_datetime.isoformat(), "last_eventful_visit_date": current_datetime.isoformat(), "last_revision_date": "2009-02-14T01:31:33+02:00", "last_release_date": "2009-02-14T01:31:34+02:00", }, ] ) search_mock.reset_mock() # non-full visits only set the visit_types attribute worker_fn( { "origin_visit_status": [ { "origin": "http://foobar.baz", "type": "git", "status": "partial", "visit": 5, "date": current_datetime, } ] } ) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz", "visit_types": ["git"]}] ) def test_journal_client_origin_metadata_from_journal(): search_mock = MagicMock() - worker_fn = functools.partial(process_journal_objects, search=search_mock,) + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) worker_fn( { "origin_intrinsic_metadata": [ { "id": "http://foobar.baz", "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", "license": "MIT", }, }, ] } ) search_mock.origin_update.assert_called_once_with( [ { "url": "http://foobar.baz", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", "license": "MIT", }, }, ] ) def test_fetch_last_revision_release_date(storage): for snapshot in SNAPSHOTS: assert fetch_last_revision_release_date(snapshot.id, storage) is not None diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py index 5653685..b092cce 100644 --- 
a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1,1235 +1,1259 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from datetime import datetime, timedelta, timezone from itertools import permutations from hypothesis import given, settings, strategies import pytest from swh.core.api.classes import stream_results class CommonSearchTest: def test_origin_url_unique_word_prefix(self): origin_foobar_baz = {"url": "http://foobar.baz"} origin_barbaz_qux = {"url": "http://barbaz.qux"} origin_qux_quux = {"url": "http://qux.quux"} origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] self.search.origin_update(origins) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar") assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] actual_page = self.search.origin_search(url_pattern="barb") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] # 'bar' is part of 'foobar', but is not the beginning of it actual_page = self.search.origin_search(url_pattern="bar") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] actual_page = self.search.origin_search(url_pattern="barbaz") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] def test_origin_url_unique_word_prefix_multiple_results(self): origin_foobar_baz = {"url": "http://foobar.baz"} origin_barbaz_qux = {"url": "http://barbaz.qux"} origin_qux_quux = {"url": "http://qux.quux"} self.search.origin_update( [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="qu") assert actual_page.next_page_token is None 
results = [r["url"] for r in actual_page.results] expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] assert sorted(results) == sorted(expected_results) actual_page = self.search.origin_search(url_pattern="qux") assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] assert sorted(results) == sorted(expected_results) def test_origin_url_all_terms(self): origin_foo_bar_baz = {"url": "http://foo.bar/baz"} origin_foo_bar_foo_bar = {"url": "http://foo.bar/foo.bar"} origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar] self.search.origin_update(origins) self.search.flush() # Only results containing all terms should be returned. actual_page = self.search.origin_search(url_pattern="foo bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin_foo_bar_baz] def test_origin_with_visit(self): origin_foobar_baz = {"url": "http://foobar/baz"} self.search.origin_update( [{**o, "has_visits": True} for o in [origin_foobar_baz]] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] def test_origin_with_visit_added(self): origin_foobar_baz = {"url": "http://foobar.baz"} self.search.origin_update([origin_foobar_baz]) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [] self.search.origin_update( [{**o, "has_visits": True} for o in [origin_foobar_baz]] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] def test_origin_no_visit_types_search(self): origins = [{"url": "http://foobar.baz"}] 
self.search.origin_update(origins) self.search.flush() actual_page = self.search.origin_search(url_pattern="http", visit_types=["git"]) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [] assert sorted(results) == sorted(expected_results) actual_page = self.search.origin_search(url_pattern="http", visit_types=None) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin["url"] for origin in origins] assert sorted(results) == sorted(expected_results) def test_origin_visit_types_search(self): origins = [ {"url": "http://foobar.baz", "visit_types": ["git"]}, {"url": "http://barbaz.qux", "visit_types": ["svn"]}, {"url": "http://qux.quux", "visit_types": ["hg"]}, ] self.search.origin_update(origins) self.search.flush() for origin in origins: actual_page = self.search.origin_search( url_pattern="http", visit_types=origin["visit_types"] ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin["url"]] assert sorted(results) == sorted(expected_results) actual_page = self.search.origin_search(url_pattern="http", visit_types=None) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin["url"] for origin in origins] assert sorted(results) == sorted(expected_results) def test_origin_visit_types_update_search(self): origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url}]) self.search.flush() def _add_visit_type(visit_type): self.search.origin_update( [{"url": origin_url, "visit_types": [visit_type]}] ) self.search.flush() def _check_visit_types(visit_types_list): for visit_types in visit_types_list: actual_page = self.search.origin_search( url_pattern="http", visit_types=visit_types ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = 
[origin_url] assert sorted(results) == sorted(expected_results) _add_visit_type("git") _check_visit_types([["git"], ["git", "hg"]]) _add_visit_type("svn") _check_visit_types([["git"], ["svn"], ["svn", "git"], ["git", "hg", "svn"]]) _add_visit_type("hg") _check_visit_types( [ ["git"], ["svn"], ["hg"], ["svn", "git"], ["hg", "git"], ["hg", "svn"], ["git", "hg", "svn"], ] ) def test_origin_nb_visits_update_search(self): origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url}]) self.search.flush() def _update_nb_visits(nb_visits): self.search.origin_update([{"url": origin_url, "nb_visits": nb_visits}]) self.search.flush() def _check_min_nb_visits(min_nb_visits): actual_page = self.search.origin_search( - url_pattern=origin_url, min_nb_visits=min_nb_visits, + url_pattern=origin_url, + min_nb_visits=min_nb_visits, ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin_url] assert sorted(results) == sorted(expected_results) _update_nb_visits(2) _check_min_nb_visits(2) # Works for = 2 _check_min_nb_visits(1) # Works for < 2 with pytest.raises(AssertionError): _check_min_nb_visits( 5 ) # No results for nb_visits >= 5 (should throw error) _update_nb_visits(5) _check_min_nb_visits(5) # Works for = 5 _check_min_nb_visits(3) # Works for < 5 def test_origin_last_visit_date_update_search(self): origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url}]) self.search.flush() def _update_last_visit_date(last_visit_date): self.search.origin_update( [{"url": origin_url, "last_visit_date": last_visit_date}] ) self.search.flush() def _check_min_last_visit_date(min_last_visit_date): actual_page = self.search.origin_search( - url_pattern=origin_url, min_last_visit_date=min_last_visit_date, + url_pattern=origin_url, + min_last_visit_date=min_last_visit_date, ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = 
[origin_url] assert sorted(results) == sorted(expected_results) now = datetime.now(tz=timezone.utc).isoformat() now_minus_5_hours = ( datetime.now(tz=timezone.utc) - timedelta(hours=5) ).isoformat() now_plus_5_hours = ( datetime.now(tz=timezone.utc) + timedelta(hours=5) ).isoformat() _update_last_visit_date(now) _check_min_last_visit_date(now) # Works for = _check_min_last_visit_date(now_minus_5_hours) # Works for < with pytest.raises(AssertionError): _check_min_last_visit_date(now_plus_5_hours) # Fails for > _update_last_visit_date(now_plus_5_hours) _check_min_last_visit_date(now_plus_5_hours) # Works for = _check_min_last_visit_date(now) # Works for < def test_journal_client_origin_visit_status_permutation(self): NOW = datetime.now(tz=timezone.utc).isoformat() NOW_MINUS_5_HOURS = ( datetime.now(tz=timezone.utc) - timedelta(hours=5) ).isoformat() NOW_PLUS_5_HOURS = ( datetime.now(tz=timezone.utc) + timedelta(hours=5) ).isoformat() VISIT_STATUSES = [ { "url": "http://foobar.baz", "snapshot_id": "SNAPSHOT_1", "last_eventful_visit_date": NOW, }, { "url": "http://foobar.baz", "snapshot_id": "SNAPSHOT_1", "last_eventful_visit_date": NOW_MINUS_5_HOURS, }, { "url": "http://foobar.baz", "snapshot_id": "SNAPSHOT_2", "last_eventful_visit_date": NOW_PLUS_5_HOURS, }, ] for visit_statuses in permutations(VISIT_STATUSES, len(VISIT_STATUSES)): self.search.origin_update(visit_statuses) self.search.flush() origin_url = "http://foobar.baz" actual_page = self.search.origin_search( - url_pattern=origin_url, min_last_eventful_visit_date=NOW_PLUS_5_HOURS, + url_pattern=origin_url, + min_last_eventful_visit_date=NOW_PLUS_5_HOURS, ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin_url] assert sorted(results) == sorted(expected_results) self.reset() def test_origin_last_eventful_visit_date_update_search(self): origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url}]) self.search.flush() def 
_update_last_eventful_visit_date(snapshot_id, last_eventful_visit_date): self.search.origin_update( [ { "url": origin_url, "snapshot_id": snapshot_id, "last_eventful_visit_date": last_eventful_visit_date, } ] ) self.search.flush() def _check_min_last_eventful_visit_date(min_last_eventful_visit_date): actual_page = self.search.origin_search( url_pattern=origin_url, min_last_eventful_visit_date=min_last_eventful_visit_date, ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin_url] assert sorted(results) == sorted(expected_results) now = datetime.now(tz=timezone.utc).isoformat() now_minus_5_hours = ( datetime.now(tz=timezone.utc) - timedelta(hours=5) ).isoformat() now_plus_5_hours = ( datetime.now(tz=timezone.utc) + timedelta(hours=5) ).isoformat() snapshot_1 = "SNAPSHOT_1" snapshot_2 = "SNAPSHOT_2" _update_last_eventful_visit_date(snapshot_1, now) _check_min_last_eventful_visit_date(now) # Works for = _check_min_last_eventful_visit_date(now_minus_5_hours) # Works for < with pytest.raises(AssertionError): _check_min_last_eventful_visit_date(now_plus_5_hours) # Fails for > _update_last_eventful_visit_date( snapshot_1, now_plus_5_hours ) # Revisit(not eventful) same origin _check_min_last_eventful_visit_date( now ) # Should remain the same because recent visit wasn't eventful with pytest.raises(AssertionError): _check_min_last_eventful_visit_date(now_plus_5_hours) _update_last_eventful_visit_date( snapshot_2, now_plus_5_hours ) # Revisit(eventful) same origin _check_min_last_eventful_visit_date(now_plus_5_hours) # Works for = _check_min_last_eventful_visit_date(now) # Works for < def _test_origin_last_revision_release_date_update_search(self, date_type): origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url}]) self.search.flush() def _update_last_revision_release_date(date): - self.search.origin_update([{"url": origin_url, date_type: date,}]) + self.search.origin_update( + [ + 
{ + "url": origin_url, + date_type: date, + } + ] + ) self.search.flush() def _check_min_last_revision_release_date(date): actual_page = self.search.origin_search( - url_pattern=origin_url, **{f"min_{date_type}": date}, + url_pattern=origin_url, + **{f"min_{date_type}": date}, ) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin_url] assert sorted(results) == sorted(expected_results) now = datetime.now(tz=timezone.utc).isoformat() now_minus_5_hours = ( datetime.now(tz=timezone.utc) - timedelta(hours=5) ).isoformat() now_plus_5_hours = ( datetime.now(tz=timezone.utc) + timedelta(hours=5) ).isoformat() _update_last_revision_release_date(now) _check_min_last_revision_release_date(now) _check_min_last_revision_release_date(now_minus_5_hours) with pytest.raises(AssertionError): _check_min_last_revision_release_date(now_plus_5_hours) _update_last_revision_release_date(now_plus_5_hours) _check_min_last_revision_release_date(now_plus_5_hours) _check_min_last_revision_release_date(now) def test_origin_last_revision_date_update_search(self): self._test_origin_last_revision_release_date_update_search( date_type="last_revision_date" ) def test_origin_last_release_date_update_search(self): self._test_origin_last_revision_release_date_update_search( date_type="last_revision_date" ) def test_origin_instrinsic_metadata_dates_filter_sorting_search(self): DATE_0 = "1999-06-28" DATE_1 = "2001-02-13" DATE_2 = "2005-10-02" ORIGINS = [ { "url": "http://foobar.0.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_0, "dateModified": DATE_1, "datePublished": DATE_2, }, }, { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_1, "dateModified": DATE_2, "datePublished": DATE_2, }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_2, "dateModified": DATE_2, "datePublished": DATE_2, }, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_results(origin_indices, sort_results=True, **kwargs): page = self.search.origin_search(url_pattern="foobar", **kwargs) results = [r["url"] for r in page.results] if sort_results: assert sorted(results) == sorted( [ORIGINS[index]["url"] for index in origin_indices] ) else: assert results == [ORIGINS[index]["url"] for index in origin_indices] _check_results(min_date_created=DATE_0, origin_indices=[0, 1, 2]) _check_results(min_date_created=DATE_1, origin_indices=[1, 2]) _check_results(min_date_created=DATE_2, origin_indices=[2]) _check_results(min_date_modified=DATE_0, origin_indices=[0, 1, 2]) _check_results(min_date_modified=DATE_1, origin_indices=[0, 1, 2]) _check_results(min_date_modified=DATE_2, origin_indices=[1, 2]) _check_results(min_date_published=DATE_0, origin_indices=[0, 1, 2]) _check_results(min_date_published=DATE_1, origin_indices=[0, 1, 2]) _check_results(min_date_published=DATE_2, origin_indices=[0, 1, 2]) # Sorting _check_results( sort_by=["-date_created"], origin_indices=[2, 1, 0], sort_results=False ) _check_results( sort_by=["date_created"], origin_indices=[0, 1, 2], sort_results=False ) def test_origin_instrinsic_metadata_dates_processing(self): DATE_0 = "foo" # will be discarded DATE_1 = "2001-2-13" # will be formatted to 2001-02-13 DATE_2 = "2005-10-2" # will be formatted to 2005-10-02 ORIGINS = [ { "url": "http://foobar.0.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_0, "dateModified": DATE_1, "datePublished": DATE_2, }, }, { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_1, "dateModified": DATE_2, "datePublished": DATE_2, }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_2, "dateModified": DATE_2, "datePublished": DATE_2, }, }, ] self.search.origin_update(ORIGINS) self.search.flush() # check origins have been successfully processed page = self.search.origin_search(url_pattern="foobar") assert {r["url"] for r in page.results} == { "http://foobar.0.com", "http://foobar.2.com", "http://foobar.1.com", } def test_origin_keywords_search(self): ORIGINS = [ { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "Django is a backend framework for applications", "keywords": "django,backend,server,web,framework", }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "Native Android applications are fast", "keywords": "android,mobile,ui", }, }, { "url": "http://foobar.3.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "React framework helps you build web applications", "keywords": "react,web,ui", }, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_results(keywords, origin_indices, sorting=False): page = self.search.origin_search(url_pattern="foobar", keywords=keywords) results = [r["url"] for r in page.results] if sorting: assert sorted(results) == sorted( [ORIGINS[index]["url"] for index in origin_indices] ) else: assert results == [ORIGINS[index]["url"] for index in origin_indices] _check_results(["build"], [2]) _check_results(["web"], [2, 0]) _check_results(["ui"], [1, 2]) # Following tests ensure that boosts work properly # Baseline: "applications" is common in all origin descriptions _check_results(["applications"], [1, 0, 2], True) # ORIGINS[0] has 'framework' in: keyword + description # ORIGINS[2] has 'framework' in: description # ORIGINS[1] has 'framework' in: None _check_results(["framework", "applications"], [0, 2, 1]) # ORIGINS[1] has 
'ui' in: keyword # ORIGINS[1] has 'ui' in: keyword # ORIGINS[0] has 'ui' in: None _check_results(["applications", "ui"], [1, 2, 0]) # ORIGINS[2] has 'web' in: keyword + description # ORIGINS[0] has 'web' in: keyword # ORIGINS[1] has 'web' in: None _check_results(["web", "applications"], [2, 0, 1]) def test_origin_sort_by_search(self): now = datetime.now(tz=timezone.utc).isoformat() now_minus_5_hours = ( datetime.now(tz=timezone.utc) - timedelta(hours=5) ).isoformat() now_plus_5_hours = ( datetime.now(tz=timezone.utc) + timedelta(hours=5) ).isoformat() ORIGINS = [ { "url": "http://foobar.1.com", "nb_visits": 1, "last_visit_date": now_minus_5_hours, }, - {"url": "http://foobar.2.com", "nb_visits": 2, "last_visit_date": now,}, + { + "url": "http://foobar.2.com", + "nb_visits": 2, + "last_visit_date": now, + }, { "url": "http://foobar.3.com", "nb_visits": 3, "last_visit_date": now_plus_5_hours, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_results(sort_by, origins): page = self.search.origin_search(url_pattern="foobar", sort_by=sort_by) results = [r["url"] for r in page.results] assert results == [origin["url"] for origin in origins] _check_results(["nb_visits"], ORIGINS) _check_results(["-nb_visits"], ORIGINS[::-1]) _check_results(["last_visit_date"], ORIGINS) _check_results(["-last_visit_date"], ORIGINS[::-1]) _check_results(["nb_visits", "-last_visit_date"], ORIGINS) _check_results(["-last_visit_date", "nb_visits"], ORIGINS[::-1]) def test_origin_instrinsic_metadata_license_search(self): ORIGINS = [ { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "license": "https://spdx.org/licenses/MIT", }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "license": "BSD-3-Clause", }, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_results(licenses, 
origin_indices): page = self.search.origin_search(url_pattern="foobar", licenses=licenses) results = [r["url"] for r in page.results] assert sorted(results) == sorted( [ORIGINS[i]["url"] for i in origin_indices] ) _check_results(["MIT"], [0]) _check_results(["bsd"], [1]) _check_results(["mit", "3-Clause"], [0, 1]) def test_origin_instrinsic_metadata_programming_language_search(self): ORIGINS = [ { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "javascript", }, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_results(programming_languages, origin_indices): page = self.search.origin_search( url_pattern="foobar", programming_languages=programming_languages ) results = [r["url"] for r in page.results] assert sorted(results) == sorted( [ORIGINS[i]["url"] for i in origin_indices] ) _check_results(["python"], [0]) _check_results(["javascript"], [1]) _check_results(["python", "javascript"], [0, 1]) def test_origin_instrinsic_metadata_multiple_field_search(self): ORIGINS = [ { "url": "http://foobar.1.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 1", "programmingLanguage": "python", "license": "https://spdx.org/licenses/MIT", }, }, { "url": "http://foobar.2.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 2", "programmingLanguage": ["javascript", "html", "css"], "license": [ "https://spdx.org/licenses/CC-BY-1.0", "https://spdx.org/licenses/Apache-1.0", ], }, }, { "url": "http://foobar.3.com", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 3", "programmingLanguage": ["Cpp", 
"c"], "license": "https://spdx.org/licenses/LGPL-2.0-only", }, }, ] self.search.origin_update(ORIGINS) self.search.flush() def _check_result(programming_languages, licenses, origin_indices): page = self.search.origin_search( url_pattern="foobar", programming_languages=programming_languages, licenses=licenses, ) results = [r["url"] for r in page.results] assert sorted(results) == sorted( [ORIGINS[i]["url"] for i in origin_indices] ) _check_result(["javascript"], ["CC"], [1]) _check_result(["css"], ["CC"], [1]) _check_result(["css"], ["CC", "apache"], [1]) _check_result(["python", "javascript"], ["MIT"], [0]) _check_result(["c", "python"], ["LGPL", "mit"], [2, 0]) def test_origin_update_with_no_visit_types(self): """ Update an origin with visit types first then with no visit types, check origin can still be searched with visit types afterwards. """ origin_url = "http://foobar.baz" self.search.origin_update([{"url": origin_url, "visit_types": ["git"]}]) self.search.flush() self.search.origin_update([{"url": origin_url}]) self.search.flush() actual_page = self.search.origin_search(url_pattern="http", visit_types=["git"]) assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [origin_url] assert results == expected_results def test_origin_intrinsic_metadata_description(self): origin1_nothin = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_barbaz = {"url": "http://origin3"} self.search.origin_update( [ - {**origin1_nothin, "intrinsic_metadata": {},}, + { + **origin1_nothin, + "intrinsic_metadata": {}, + }, { **origin2_foobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, }, { **origin3_barbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "bar baz", }, }, ] ) self.search.flush() actual_page = self.search.origin_search(metadata_pattern="foo") assert 
def test_origin_intrinsic_metadata_long_description(self):
    """Index an origin whose description holds 100000 distinct words and
    check that one of these words can still be searched for.

    This guards against the backend storing such a large value untokenized,
    which would be inefficient and make ElasticSearch fail with:

    Document contains at least one immense term in field="intrinsic_metadata.http://schema.org/description.@value" (whose UTF8 encoding is longer than the max length 32766), all of which were skipped.
    """  # noqa
    origin = {"url": "http://origin1"}

    # foo0 foo1 ... foo99999, separated by single spaces
    huge_description = " ".join(f"foo{i}" for i in range(100000))
    document = {
        **origin,
        "intrinsic_metadata": {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "description": huge_description,
        },
    }

    self.search.origin_update([document])
    self.search.flush()

    page = self.search.origin_search(metadata_pattern="foo42")
    assert page.next_page_token is None
    assert page.results == [origin]
def test_origin_intrinsic_metadata_inconsistent_type(self):
    """Checks that a single metadata field ("author") may hold an object in
    one document, a plain string in another and an array in a third one,
    and that all three documents remain searchable."""
    context = "https://doi.org/10.5063/schema/codemeta-2.0"
    origin1_foobar = {"url": "http://origin1"}
    origin2_barbaz = {"url": "http://origin2"}
    origin3_bazqux = {"url": "http://origin3"}

    def with_author(origin, author):
        # Document to index: the origin plus codemeta metadata naming `author`
        return {
            **origin,
            "intrinsic_metadata": {"@context": context, "author": author},
        }

    def single_page_results(pattern):
        # Every search in this test is expected to fit in one page
        page = self.search.origin_search(metadata_pattern=pattern)
        assert page.next_page_token is None
        return page.results

    # The object-valued author is indexed (and flushed) first, the string
    # and array values come in a second batch.
    self.search.origin_update(
        [with_author(origin1_foobar, {"familyName": "Foo", "givenName": "Bar"})]
    )
    self.search.flush()
    self.search.origin_update(
        [
            with_author(origin2_barbaz, "Bar Baz"),
            with_author(origin3_bazqux, ["Baz", "Qux"]),
        ]
    )
    self.search.flush()

    # For "bar" only the set of matching URLs is checked, not their order
    found_urls = sorted(hit["url"] for hit in single_page_results("bar"))
    assert found_urls == sorted([origin2_barbaz["url"], origin1_foobar["url"]])

    # For the remaining patterns the exact result list is checked
    exact_cases = (
        ("baz", [origin2_barbaz, origin3_bazqux]),
        ("foo", [origin1_foobar]),
        ("bar baz", [origin2_barbaz]),
        ("qux", [origin3_bazqux]),
        ("baz qux", [origin3_bazqux]),
        ("foo bar", [origin1_foobar]),
    )
    for pattern, expected in exact_cases:
        assert single_page_results(pattern) == expected
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
def test_origin_url_paging(self, limit):
    """Stream URL searches page by page, with a page size between 1 and 4,
    and check which origins come back first for each pattern."""
    # TODO: no hypothesis
    foo = {"url": "http://origin1/foo"}
    foo_bar = {"url": "http://origin2/foo/bar"}
    foo_bar_baz = {"url": "http://origin3/foo/bar/baz"}

    self.reset()
    self.search.origin_update([foo, foo_bar, foo_bar_baz])
    self.search.flush()

    cases = (
        ("foo bar baz", [foo_bar_baz]),
        ("foo bar", [foo_bar, foo_bar_baz]),
        ("foo", [foo, foo_bar, foo_bar_baz]),
    )
    for pattern, expected_origins in cases:
        found_urls = [
            hit["url"]
            for hit in stream_results(
                self.search.origin_search, url_pattern=pattern, limit=limit
            )
        ]
        expected_urls = sorted(origin["url"] for origin in expected_origins)
        # Only the first len(expected_urls) results are compared; results
        # returned after them are not checked.
        assert sorted(found_urls[0 : len(expected_urls)]) == expected_urls
def test_search_blocklisted_results(self):
    """Index one regular and one blocklisted origin whose URLs both match
    the pattern, and check only the regular one is returned."""
    visible = {"url": "http://origin1"}
    hidden = {"url": "http://origin2", "blocklisted": True}

    self.search.origin_update([visible, hidden])
    self.search.flush()

    page = self.search.origin_search(url_pattern="origin")
    assert page.next_page_token is None
    assert page.results == [visible]
def test_visit_types_count(self):
    """Check visit type counts: empty before anything is indexed, then one
    git, two hg and three svn origins, the blocklisted git origin not being
    part of the expected count."""
    assert self.search.visit_types_count() == Counter()

    blocklisted_git = {
        "url": "http://foobar.baz",
        "visit_types": ["git"],
        "blocklisted": True,
    }
    # 1 origin for "git", 2 for "hg", 3 for "svn"
    counted_origins = [
        {
            "url": f"http://{visit_type}.foobar.baz.{i}",
            "visit_types": [visit_type],
        }
        for nb_origins, visit_type in enumerate(["git", "hg", "svn"], start=1)
        for i in range(nb_origins)
    ]

    self.search.origin_update([blocklisted_git, *counted_origins])
    self.search.flush()

    assert self.search.visit_types_count() == Counter(git=1, hg=2, svn=3)
def prepare_config_file(tmpdir, config_dict: Dict, name: str = "config.yml") -> str:
    """Write `config_dict` as YAML into the file `$tmpdir/name`.

    Args:
        tmpdir (LocalPath): root directory; it must support the ``/``
            operator and the resulting path must provide ``write_text``
        config_dict: configuration to write; it is always serialized
            with ``yaml.dump``
        name: configuration filename

    Returns:
        path of the configuration file prepared, as a string.

    """
    config_path = tmpdir / name
    config_path.write_text(yaml.dump(config_dict), encoding="utf-8")
    # pytest on python3.5 does not support LocalPath manipulation, so
    # convert path to string
    return str(config_path)
def test_server_first_call_initialize_elasticsearch(
    swh_search_config_with_indexes, mocker
):
    """Check the backend's initialize method runs on the first request to the
    server, and is not run again on the following one."""
    initialize = mocker.patch("swh.search.elasticsearch.ElasticSearch.initialize")

    app = make_app_from_configfile()
    app.config["TESTING"] = True
    client = app.test_client()

    # The call count must be 1 after the first request and still 1 after
    # the second one.
    for _ in range(2):
        client.get("/")
        assert initialize.call_count == 1
def test_keyword_filter():
    """A keyword list is translated into a single nested multi_match whose
    query is the space-joined, unescaped keywords."""
    searched_fields = [
        # keywords are boosted ("^2") relative to descriptions
        get_expansion("keywords", ".") + "^2",
        get_expansion("descriptions", "."),
    ]
    multi_match = {
        "query": r"""word1 word2 " ' word3""",
        "fields": searched_fields,
    }
    expected = {
        "filters": {
            "nested": {
                "path": "intrinsic_metadata",
                "query": {"multi_match": multi_match},
            }
        }
    }

    _test_results(r"""keyword in [word1, "word2 \" \' word3"]""", expected)
"bool": { "must_not": [ { "range": { "last_visit_date": { "gte": "2020-01-01", "lte": "2020-01-01", } } } ] } } } _test_results(query, expected) def test_last_eventful_visit_less_than_to_filter(): query = "last_visit < 2020-01-01" expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}} _test_results(query, expected) def test_keyword_no_escape_inside_filter(): # any keyword (filter name/operator/value) inside a filter # must be considered a string. query = r'''origin : "language in [\'go lang\', python]"''' expected = { "filters": { "multi_match": { "query": r"""language in ['go lang', python]""", "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } } _test_results(query, expected) def test_escaped_punctuation_parsing(): query = r"""keyword in ["foo \'\" bar"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""foo '" bar""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_nonascii(): query = r"""keyword in ["café"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""café""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_nonascii_before_operator(): query = r"""keyword in ["🐍"] and visited = true""" expected = { "filters": { "bool": { "must": [ { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""🐍""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, }, }, - {"term": {"has_visits": True,},}, + { + "term": { + "has_visits": True, + }, + }, ], } } } _test_results(query, expected) diff --git a/swh/search/tests/test_utils.py b/swh/search/tests/test_utils.py index 8db4838..281a1b9 100644 --- 
class Translator:
    """Translate a query written in the swh-search query language into a dict
    of search clauses (``bool``, ``term``, ``terms``, ``range``,
    ``multi_match`` and ``nested``).

    The query is parsed with a tree-sitter grammar (``swh_search_ql``); the
    resulting syntax tree is then walked by :meth:`_traverse`.
    """

    # Maps the comparison operators of the query language to the key used
    # inside a "range" clause.
    RANGE_OPERATOR_MAP = {
        ">": "gt",
        "<": "lt",
        ">=": "gte",
        "<=": "lte",
    }

    def __init__(self):
        """Load the compiled grammar, building it in a temporary directory
        first when the packaged ``static/swh_ql.so`` file is missing."""
        ql_path = resource_filename("swh.search", "static/swh_ql.so")
        if not os.path.exists(ql_path):
            # Use the module-level logger rather than the root logger
            logger.info("%s does not exist, building in temporary directory", ql_path)
            # Kept as an attribute so the directory outlives this method
            self._build_dir = tempfile.TemporaryDirectory(prefix="swh.search-build")
            source_path = resource_filename("swh.search", "query_language")
            ql_path = os.path.join(self._build_dir.name, "swh_ql.so")
            Language.build_library(ql_path, [source_path])

        search_ql = Language(ql_path, "swh_search_ql")

        self.parser = Parser()
        self.parser.set_language(search_ql)
        self.query = ""

    def parse_query(self, query):
        """Parse ``query`` and return its translation.

        Args:
            query: the query string, in the swh-search query language

        Returns:
            a dict keyed by the type of each child of the root node
            (e.g. ``filters``, ``sortBy``, ``limit``)

        Raises:
            SearchQuerySyntaxError: if the parser reports an error in the
                syntax tree, or if a filter is unknown
        """
        # Kept as bytes: _get_value slices it with the node offsets
        self.query = query.encode()
        tree = self.parser.parse(self.query)
        self.query_node = tree.root_node

        if self.query_node.has_error:
            raise SearchQuerySyntaxError("Invalid query")

        return self._traverse(self.query_node)

    def _traverse(self, node):
        """Recursively translate ``node`` and its children.

        Raises:
            Exception: if the node has an unknown type or an unexpected
                number of children
        """
        if len(node.children) == 3 and node.children[1].type == "filters":
            # filters => ( filters )
            return self._traverse(node.children[1])  # Go past the () brackets

        if node.type == "query":
            result = {}
            for child in node.children:
                # query => filters sort_by limit
                result[child.type] = self._traverse(child)

            return result

        if node.type == "filters":
            if len(node.children) == 1:
                # query => filters
                # filters => filters
                # filters => filter
                # Current node is just a wrapper, so go one level deep
                return self._traverse(node.children[0])

            if len(node.children) == 3:
                # filters => filters conj_op filters
                filters1 = self._traverse(node.children[0])
                conj_op = self._get_value(node.children[1])
                filters2 = self._traverse(node.children[2])

                if conj_op == "and":
                    # "must" is equivalent to "AND"
                    return {"bool": {"must": [filters1, filters2]}}
                if conj_op == "or":
                    # "should" is equivalent to "OR"
                    return {"bool": {"should": [filters1, filters2]}}

        if node.type == "filter":
            filter_category = node.children[0]
            return self._parse_filter(filter_category)

        if node.type == "sortBy":
            return self._parse_filter(node)

        if node.type == "limit":
            return self._parse_filter(node)

        # Nothing matched: raise the error instead of returning the exception
        # object, which would otherwise end up inside the translated query.
        raise Exception(
            f"Unknown node type ({node.type}) "
            f"or unexpected number of children ({node.children})"
        )

    def _get_value(self, node):
        """Return the Python value of a leaf or array node.

        Arrays (nodes delimited by ``[`` and ``]``) give a list of the values
        of their named children, quoted strings are unquoted and unescaped,
        ``number``/``numberVal`` nodes are converted to ``int`` and anything
        else is returned as an unescaped string.
        """
        if (
            len(node.children) > 0
            and node.children[0].type == "["
            and node.children[-1].type == "]"
        ):
            # array
            return [self._get_value(child) for child in node.children if child.is_named]

        # NOTE(review): these are column offsets (second item of the points),
        # so this presumably relies on the query being a single line.
        start = node.start_point[1]
        end = node.end_point[1]

        value = self.query[start:end].decode()

        # A value is quoted only when it starts AND ends with the same quote
        # character.
        if len(value) > 1 and (
            (value[0] == "'" and value[-1] == "'")
            or (value[0] == '"' and value[-1] == '"')
        ):
            return unescape(value[1:-1])

        if node.type in ["number", "numberVal"]:
            return int(value)
        return unescape(value)

    def _parse_filter(self, filter):
        """Translate one ``name op value`` node (a filter, or the sortBy and
        limit nodes) into its search clause.

        Raises:
            SearchQuerySyntaxError: if the category/name pair is not handled
        """
        if filter.type == "boundedListFilter":
            filter = filter.children[0]

        children = filter.children
        assert len(children) == 3

        category = filter.type
        name, op, value = [self._get_value(child) for child in children]

        if category == "patternFilter":
            if name == "origin":
                return {
                    "multi_match": {
                        "query": value,
                        "type": "bool_prefix",
                        "operator": "and",
                        "fields": [
                            "url.as_you_type",
                            "url.as_you_type._2gram",
                            "url.as_you_type._3gram",
                        ],
                    }
                }
            elif name == "metadata":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": value,
                                # Makes it so that the "foo bar" query returns
                                # documents which contain "foo" in a field and "bar"
                                # in a different field
                                "type": "cross_fields",
                                # All keywords must be found in a document for it to
                                # be considered a match.
                                # TODO: allow missing keywords?
                                "operator": "and",
                                # Searches on all fields of the intrinsic_metadata dict,
                                # recursively.
                                "fields": ["intrinsic_metadata.*"],
                                # date{Created,Modified,Published} are of type date
                                "lenient": True,
                            }
                        },
                    }
                }

        if category == "booleanFilter":
            if name == "visited":
                if value == "true":
                    return {"term": {"has_visits": True}}
                else:
                    # non-visited origins will typically not have "has_visits" set
                    # at all
                    return {
                        "bool": {
                            "should": [
                                {"term": {"has_visits": False}},
                                {
                                    "bool": {
                                        "must_not": {"exists": {"field": "has_visits"}}
                                    }
                                },
                            ]
                        }
                    }

        if category == "numericFilter":
            if name == "visits":
                if op in ["=", "!="]:
                    # (in)equality is expressed as a one-value range
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {"range": {"nb_visits": {"gte": value, "lte": value}}}
                            ]
                        }
                    }
                else:
                    return {
                        "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}}
                    }

        if category == "visitTypeFilter":
            if name == "visit_type":
                return {"terms": {"visit_types": value}}

        if category == "unboundedListFilter":
            value_array = value

            if name == "keyword":
                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "multi_match": {
                                "query": " ".join(value_array),
                                "fields": [
                                    get_expansion("keywords", ".") + "^2",
                                    get_expansion("descriptions", "."),
                                    # "^2" boosts an origin's score by 2x
                                    # if it the queried keywords are
                                    # found in its intrinsic_metadata.keywords
                                ],
                            }
                        },
                    }
                }
            elif name in ["language", "license"]:
                name_mapping = {
                    "language": "programming_languages",
                    "license": "licenses",
                }
                name = name_mapping[name]

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "should": [
                                    {"match": {get_expansion(name, "."): val}}
                                    for val in value_array
                                ],
                            }
                        },
                    }
                }

        if category == "dateFilter":
            if name in ["created", "modified", "published"]:
                # Dates stored in the intrinsic metadata
                if op in ["=", "!="]:
                    return {
                        "nested": {
                            "path": "intrinsic_metadata",
                            "query": {
                                "bool": {
                                    ("must" if op == "=" else "must_not"): [
                                        {
                                            "range": {
                                                get_expansion(f"date_{name}", "."): {
                                                    "gte": value,
                                                    "lte": value,
                                                }
                                            }
                                        }
                                    ],
                                }
                            },
                        }
                    }

                return {
                    "nested": {
                        "path": "intrinsic_metadata",
                        "query": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            get_expansion(f"date_{name}", "."): {
                                                self.RANGE_OPERATOR_MAP[op]: value,
                                            }
                                        }
                                    }
                                ],
                            }
                        },
                    }
                }
            else:
                # Other dates are looked up in the top-level "<name>_date" field
                if op in ["=", "!="]:
                    return {
                        "bool": {
                            ("must" if op == "=" else "must_not"): [
                                {
                                    "range": {
                                        f"{name}_date": {
                                            "gte": value,
                                            "lte": value,
                                        }
                                    }
                                }
                            ],
                        }
                    }
                return {
                    "range": {
                        f"{name}_date": {
                            self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"),
                        }
                    }
                }

        if category == "sortBy":
            return value

        if category == "limit":
            return value

        raise SearchQuerySyntaxError(f"Unknown filter {category}.{name}")
def escape(obj):
    r"""Makes the object directly injectable into the query language
    by converting the escapable parts of the object into escape sequences.

    For strings, appends \ before special characters like ', ", and \

    For arrays, applies the same transformation on each element, joins the
    elements and returns a string-like representation of the list.

    Subclasses of ``list`` and ``str`` are accepted as well.

    Raises:
        TypeError: if ``obj`` (or one of its items) is neither a list nor a
            string

    >>> print(escape("foo ' bar"))
    "foo \' bar"

    >>> print(escape([r"foo ' bar", r"bar \\\' baz", r'foo " baz']))
    ["foo \' bar", "bar \\\\\\\' baz", "foo \" baz"]

    """
    if isinstance(obj, list):
        return "[" + ", ".join(escape(item) for item in obj) + "]"

    if isinstance(obj, str):
        # Single pass over the string: each special character is replaced by
        # its backslash-prefixed form.
        escaped = obj.translate(
            {
                ord("'"): r"\'",
                ord('"'): r"\"",
                ord("\\"): r"\\",
            }
        )
        return '"' + escaped + '"'

    # TypeError is an Exception, so callers catching Exception still work
    raise TypeError(f"Unexpected item type {type(obj)}")