diff --git a/Makefile.local b/Makefile.local deleted file mode 100644 index b363dbb..0000000 --- a/Makefile.local +++ /dev/null @@ -1,32 +0,0 @@ -YARN ?= yarn -PYTHON ?= python3 -ts-install: package.json - $(PYTHON) setup.py ts_install -ts-generate: ts-install query_language/grammar.js - $(PYTHON) setup.py ts_generate -ts-dev: ts-install -ifdef sanitize - $(YARN) dev | sed '5,$$s/[[0-9]\+, [0-9]\+]/ /g' | sed '5,$$s/ *- *//g'; -else - $(YARN) dev; -endif -ts-test: ts-install - $(YARN) test -ts-repl: ts-generate - $(YARN) repl -ts-build-so: ts-generate query_language/src/ - $(PYTHON) setup.py ts_build_so -ts-build-wasm: ts-generate query_language/src/ - $(PYTHON) setup.py ts_build_wasm -ts-build: ts-build-so ts-build-wasm - @echo 'Build completed' -test: ts-build-so diff --git a/PKG-INFO b/PKG-INFO index 122b124..38ebb20 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,90 +1,90 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.11.4 +Version: 0.11.5 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata). ## Dependencies - Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). - Debian-like host The elasticsearch package is required. As it is not part of debian-stable, [another Debian repository must be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) - Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. - Emscripten is required for generating the tree-sitter WASM module. The following commands need to be executed for the setup: ```bash cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ ./emsdk install latest && ./emsdk activate latest PATH="${PATH}:/opt/emsdk/upstream/emscripten" ``` **Note:** If emsdk isn't found in the PATH, the tree-sitter CLI automatically pulls the `emscripten/emsdk` image from Docker Hub when `make ts-build-wasm` or `make ts-build` is used.
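For a quick sense of how the service is used, here is a minimal usage sketch built around the `ElasticSearch` backend class that appears later in this diff. It assumes an ElasticSearch node listening on `localhost:9200`; the origin URL and field values are purely illustrative.

```python
from swh.search.elasticsearch import ElasticSearch

search = ElasticSearch(hosts=["localhost:9200"])
search.initialize()  # declare the index, the read/write aliases and the mappings

# Index (or update) one origin document; the extra fields are among those
# whitelisted by _sanitize_origin().
search.origin_update(
    [{"url": "https://github.com/example/repo", "has_visits": True, "nb_visits": 3}]
)
search.flush()  # refresh the write alias so the document becomes searchable

page = search.origin_search(url_pattern="github.com/example", limit=10)
print(page.results)  # e.g. [{'url': 'https://github.com/example/repo'}]
```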
## Make targets Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute swh-search under various configurations: * **ts-install**: Install node_modules and emscripten SDK required for TreeSitter * **ts-generate**: Generate parser files (C and JSON) from the grammar * **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way for developing the TreeSitter grammar. * **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression along with the start and end positions of all the nodes. * **ts-dev sanitize=1**: Same as **ts-dev** but without the start and end positions of the nodes. This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output of **ts-dev** using `sed` to achieve the desired format. * **ts-test**: Executes TreeSitter's native tests * **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter * **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten * **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** diff --git a/setup.py b/setup.py index e3b24f3..1e7e26e 100755 --- a/setup.py +++ b/setup.py @@ -1,188 +1,188 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.cmd import Command from distutils.command.build import build from io import open import os import shutil import subprocess from setuptools import find_packages, setup from setuptools.command.develop import develop from setuptools.command.sdist import sdist here = os.path.abspath(os.path.dirname(__file__)) # Get the long description from the README file with open(os.path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not os.path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements yarn = os.environ.get("YARN", "yarn") class TSCommand(Command): user_options = [] def initialize_options(self): pass def finalize_options(self): pass class TSInstallCommand(TSCommand): description = "Installs node_modules related to query language" def run(self): subprocess.run([yarn, "install"], check=True) class TSBuildSoCommand(TSCommand): description = "Builds swh_ql.so" def initialize_options(self): self.build_lib = None super().initialize_options() def finalize_options(self): self.set_undefined_options("build", ("build_lib", "build_lib")) super().finalize_options() def run(self): ql_dir = os.path.join(self.build_lib, "swh/search/query_language") copy_ql_tree(ql_dir) if not os.path.exists(os.path.join(ql_dir, "src/parser.c")): print("parser.c missing from build dir.") self.run_command("ts_install") generate_parser(ql_dir) static_dir = os.path.join(self.build_lib, "swh/search/static") os.makedirs(static_dir, exist_ok=True) # This import cannot be toplevel, as setuptools installs it after the script # starts running from tree_sitter import Language Language.build_library(os.path.join(static_dir, "swh_ql.so"), [ql_dir]) print("swh_ql.so file generated")
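For clarity, this is roughly what `ts_build_so` produces and how the rest of the package consumes it; a small sketch using the py-tree-sitter calls visible in this diff. The `build/` path is illustrative, `swh_search_ql` is the grammar name used by `swh.search.translator`, and it assumes the parser sources have already been generated (ts-generate).

```python
from tree_sitter import Language, Parser

# Compile the grammar directory into a shared object, as TSBuildSoCommand does.
Language.build_library("build/swh_ql.so", ["swh/search/query_language"])

# Load it back and parse a query, as Translator.__init__ does.
language = Language("build/swh_ql.so", "swh_search_ql")
parser = Parser()
parser.set_language(language)
tree = parser.parse(b"visited = true and visits > 2")
assert not tree.root_node.has_error
```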
file generated") class TSBuildCommand(TSCommand): - description = "Builds swh_ql.so and swh_ql.wasm" + description = "Builds swh_ql.so" def run(self): self.run_command("ts_build_so") class custom_build(build): def run(self): super().run() if not self.dry_run: self.run_command("ts_build") class custom_sdist(sdist): def make_release_tree(self, base_dir, files): super().make_release_tree(base_dir, files) dist_ql_path = os.path.join(base_dir, "swh/search/query_language") if not self.dry_run: self.run_command("ts_install") copy_ql_tree(dist_ql_path) generate_parser(dist_ql_path) class custom_develop(develop): def run(self): super().run() - if not self.dry_run: + self.run_command("ts_install") generate_parser("swh/search/query_language") def copy_ql_tree(dest_path): # FIXME: setuptools should copy this itself... print("Copying parser files") if os.path.exists(dest_path): shutil.rmtree(dest_path) shutil.copytree("swh/search/query_language", dest_path) def generate_parser(dest_path): print("Getting path") path = subprocess.check_output([yarn, "bin"]).decode().strip() env = {**os.environ, "PATH": os.pathsep.join([path, os.environ["PATH"]])} print("Generating") subprocess.run(["tree-sitter", "generate", "--no-bindings"], cwd=dest_path, env=env) setup( name="swh.search", description="Software Heritage search service", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DSEA", packages=find_packages(), # packages's modules install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), entry_points=""" [swh.cli.subcommands] search=swh.search.cli """, setup_requires=["setuptools-scm", "tree-sitter==0.19.0"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 3 - Alpha", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-search", "Documentation": "https://docs.softwareheritage.org/devel/swh-search/", }, cmdclass={ "build": custom_build, "sdist": custom_sdist, "develop": custom_develop, "ts_install": TSInstallCommand, "ts_build_so": TSBuildSoCommand, "ts_build": TSBuildCommand, }, zip_safe=False, ) diff --git a/swh.search.egg-info/PKG-INFO b/swh.search.egg-info/PKG-INFO index 122b124..38ebb20 100644 --- a/swh.search.egg-info/PKG-INFO +++ b/swh.search.egg-info/PKG-INFO @@ -1,90 +1,90 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.11.4 +Version: 0.11.5 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU 
General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata). ## Dependencies - Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). - Debian-like host The elasticsearch package is required. As it is not part of debian-stable, [another Debian repository must be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) - Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. - Emscripten is required for generating the tree-sitter WASM module. The following commands need to be executed for the setup: ```bash cd /opt && git clone https://github.com/emscripten-core/emsdk.git && cd emsdk && \ ./emsdk install latest && ./emsdk activate latest PATH="${PATH}:/opt/emsdk/upstream/emscripten" ``` **Note:** If emsdk isn't found in the PATH, the tree-sitter CLI automatically pulls the `emscripten/emsdk` image from Docker Hub when `make ts-build-wasm` or `make ts-build` is used. ## Make targets Below is the list of available make targets that can be executed from the root directory of swh-search in order to build and/or execute swh-search under various configurations: * **ts-install**: Install node_modules and emscripten SDK required for TreeSitter * **ts-generate**: Generate parser files (C and JSON) from the grammar * **ts-repl**: Starts a web-based playground for the TreeSitter grammar. It's the recommended way for developing the TreeSitter grammar. * **ts-dev**: Parse the `query_language/sample_query` and print the corresponding syntax expression along with the start and end positions of all the nodes. * **ts-dev sanitize=1**: Same as **ts-dev** but without the start and end positions of the nodes. This format is expected by TreeSitter's native test command. `sanitize=1` cleans the output of **ts-dev** using `sed` to achieve the desired format.
* **ts-test**: Executes TreeSitter's native tests * **ts-build-so**: Generates `swh_ql.so` file from the previously generated parser using py-tree-sitter * **ts-build-wasm**: Generates `swh_ql.wasm` file from the previously generated parser using emscripten * **ts-build**: Executes both **ts-build-so** and **ts-build-wasm** diff --git a/swh.search.egg-info/SOURCES.txt b/swh.search.egg-info/SOURCES.txt index 34e7969..156c662 100644 --- a/swh.search.egg-info/SOURCES.txt +++ b/swh.search.egg-info/SOURCES.txt @@ -1,70 +1,69 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile -Makefile.local README.md mypy.ini package.json pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini yarn.lock docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/query-language.rst docs/_static/.placeholder docs/_templates/.placeholder es_config/elasticsearch.keystore es_config/elasticsearch.yml es_config/jvm.options es_config/log4j2.properties swh/__init__.py swh.search.egg-info/PKG-INFO swh.search.egg-info/SOURCES.txt swh.search.egg-info/dependency_links.txt swh.search.egg-info/entry_points.txt swh.search.egg-info/not-zip-safe swh.search.egg-info/requires.txt swh.search.egg-info/top_level.txt swh/search/__init__.py swh/search/cli.py swh/search/elasticsearch.py swh/search/in_memory.py swh/search/interface.py swh/search/journal_client.py swh/search/metrics.py swh/search/py.typed swh/search/translator.py swh/search/utils.py swh/search/api/__init__.py swh/search/api/client.py swh/search/api/server.py swh/search/query_language/.gitignore swh/search/query_language/grammar.js swh/search/query_language/sample_query swh/search/query_language/tokens.js swh/search/query_language/test/corpus/combinations.txt swh/search/tests/__init__.py swh/search/tests/conftest.py swh/search/tests/test_api_client.py swh/search/tests/test_cli.py swh/search/tests/test_elasticsearch.py swh/search/tests/test_in_memory.py swh/search/tests/test_init.py swh/search/tests/test_journal_client.py swh/search/tests/test_search.py swh/search/tests/test_server.py swh/search/tests/test_translator.py \ No newline at end of file diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index 05ccf1f..dc04fa5 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,554 +1,549 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from collections import Counter import logging import pprint from textwrap import dedent -from typing import Any, Dict, Iterable, Iterator, List, Optional +from typing import Any, Dict, Iterable, List, Optional from elasticsearch import Elasticsearch, helpers import msgpack from swh.indexer import codemeta from swh.model import model -from swh.model.identifiers import origin_identifier +from swh.model.hashutil import hash_to_hex from swh.search.interface import ( SORT_BY_OPTIONS, MinimalOriginDict, OriginDict, PagedResult, ) from swh.search.metrics import send_metric, timed from swh.search.translator import Translator from swh.search.utils import escape, get_expansion, is_date_parsable logger = logging.getLogger(__name__) INDEX_NAME_PARAM = "index" READ_ALIAS_PARAM = "read_alias" WRITE_ALIAS_PARAM = "write_alias"
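Note the import swap above: document ids were previously computed with `origin_identifier()`, and are now derived from `swh.model`'s `Origin` object, exactly as in the `origin_update` hunk below. A sketch of the equivalent computation (the URL is illustrative):

```python
from swh.model import model
from swh.model.hashutil import hash_to_hex

# sha1 of the origin, hex-encoded; used as the Elasticsearch document _id.
doc_id = hash_to_hex(model.Origin(url="https://github.com/example/repo").id)
assert len(doc_id) == 40
```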
"origin", READ_ALIAS_PARAM: "origin-read", WRITE_ALIAS_PARAM: "origin-write", } def _sanitize_origin(origin): origin = origin.copy() # Whitelist fields to be saved in Elasticsearch res = {"url": origin.pop("url")} for field_name in ( "blocklisted", "has_visits", "intrinsic_metadata", "visit_types", "nb_visits", "snapshot_id", "last_visit_date", "last_eventful_visit_date", "last_revision_date", "last_release_date", ): if field_name in origin: res[field_name] = origin.pop(field_name) # Run the JSON-LD expansion algorithm # # to normalize the Codemeta metadata. # This is required as Elasticsearch will needs each field to have a consistent # type across documents to be searchable; and non-expanded JSON-LD documents # can have various types in the same field. For example, all these are # equivalent in JSON-LD: # * {"author": "Jane Doe"} # * {"author": ["Jane Doe"]} # * {"author": {"@value": "Jane Doe"}} # * {"author": [{"@value": "Jane Doe"}]} # and JSON-LD expansion will convert them all to the last one. if "intrinsic_metadata" in res: intrinsic_metadata = res["intrinsic_metadata"] for date_field in ["dateCreated", "dateModified", "datePublished"]: if date_field in intrinsic_metadata: date = intrinsic_metadata[date_field] # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) if not is_date_parsable(date): intrinsic_metadata.pop(date_field) res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) return res def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str: """Tokenize as string an index page result from a search""" page_token = base64.b64encode(msgpack.dumps(index_to_tokenize)) return page_token.decode() def token_decode(page_token: str) -> Dict[bytes, Any]: """Read the page_token""" return msgpack.loads(base64.b64decode(page_token.encode()), raw=True) class ElasticSearch: def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): self._backend = Elasticsearch(hosts=hosts) self._translator = Translator() # Merge current configuration with default values origin_config = indexes.get("origin", {}) self.origin_config = {**ORIGIN_DEFAULT_CONFIG, **origin_config} def _get_origin_index(self) -> str: return self.origin_config[INDEX_NAME_PARAM] def _get_origin_read_alias(self) -> str: return self.origin_config[READ_ALIAS_PARAM] def _get_origin_write_alias(self) -> str: return self.origin_config[WRITE_ALIAS_PARAM] @timed def check(self): return self._backend.ping() def deinitialize(self) -> None: """Removes all indices from the Elasticsearch backend""" self._backend.indices.delete(index="*") def initialize(self) -> None: """Declare Elasticsearch indices, aliases and mappings""" if not self._backend.indices.exists(index=self._get_origin_index()): self._backend.indices.create(index=self._get_origin_index()) - if not self._backend.indices.exists_alias(self._get_origin_read_alias()): + if not self._backend.indices.exists_alias(name=self._get_origin_read_alias()): self._backend.indices.put_alias( index=self._get_origin_index(), name=self._get_origin_read_alias() ) - if not self._backend.indices.exists_alias(self._get_origin_write_alias()): + if not self._backend.indices.exists_alias(name=self._get_origin_write_alias()): self._backend.indices.put_alias( index=self._get_origin_index(), name=self._get_origin_write_alias() ) self._backend.indices.put_mapping( index=self._get_origin_index(), body={ "dynamic_templates": [ { "booleans_as_string": { # All fields stored as string in the metadata # even the booleans 
"match_mapping_type": "boolean", "path_match": "intrinsic_metadata.*", "mapping": {"type": "keyword"}, } } ], "date_detection": False, "properties": { # sha1 of the URL; used as the document id "sha1": {"type": "keyword", "doc_values": True,}, # Used both to search URLs, and as the result to return # as a response to queries "url": { "type": "text", # To split URLs into token on any character # that is not alphanumerical "analyzer": "simple", # 2-gram and partial-3-gram search (ie. with the end of the # third word potentially missing) "fields": { "as_you_type": { "type": "search_as_you_type", "analyzer": "simple", } }, }, "visit_types": {"type": "keyword"}, # used to filter out origins that were never visited "has_visits": {"type": "boolean",}, "nb_visits": {"type": "integer"}, "snapshot_id": {"type": "keyword"}, "last_visit_date": {"type": "date"}, "last_eventful_visit_date": {"type": "date"}, "last_release_date": {"type": "date"}, "last_revision_date": {"type": "date"}, "intrinsic_metadata": { "type": "nested", "properties": { "@context": { # don't bother indexing tokens in these URIs, as the # are used as namespaces "type": "keyword", }, "http://schema": { "properties": { "org/dateCreated": { "properties": {"@value": {"type": "date",}} }, "org/dateModified": { "properties": {"@value": {"type": "date",}} }, "org/datePublished": { "properties": {"@value": {"type": "date",}} }, } }, }, }, # Has this origin been taken down? "blocklisted": {"type": "boolean",}, }, }, ) @timed def flush(self) -> None: self._backend.indices.refresh(index=self._get_origin_write_alias()) @timed def origin_update(self, documents: Iterable[OriginDict]) -> None: write_index = self._get_origin_write_alias() documents = map(_sanitize_origin, documents) documents_with_sha1 = ( - (origin_identifier(document), document) for document in documents + (hash_to_hex(model.Origin(url=document["url"]).id), document) + for document in documents ) # painless script that will be executed when updating an origin document update_script = dedent( """ // utility function to get and parse date ZonedDateTime getDate(def ctx, String date_field) { String default_date = "0001-01-01T00:00:00Z"; String date = ctx._source.getOrDefault(date_field, default_date); return ZonedDateTime.parse(date); } // backup current visit_types field value List visit_types = ctx._source.getOrDefault("visit_types", []); int nb_visits = ctx._source.getOrDefault("nb_visits", 0); ZonedDateTime last_visit_date = getDate(ctx, "last_visit_date"); String snapshot_id = ctx._source.getOrDefault("snapshot_id", ""); ZonedDateTime last_eventful_visit_date = getDate(ctx, "last_eventful_visit_date"); ZonedDateTime last_revision_date = getDate(ctx, "last_revision_date"); ZonedDateTime last_release_date = getDate(ctx, "last_release_date"); // update origin document with new field values ctx._source.putAll(params); // restore previous visit types after visit_types field overriding if (ctx._source.containsKey("visit_types")) { for (int i = 0; i < visit_types.length; ++i) { if (!ctx._source.visit_types.contains(visit_types[i])) { ctx._source.visit_types.add(visit_types[i]); } } } // Undo overwrite if incoming nb_visits is smaller if (ctx._source.containsKey("nb_visits")) { int incoming_nb_visits = ctx._source.getOrDefault("nb_visits", 0); if(incoming_nb_visits < nb_visits){ ctx._source.nb_visits = nb_visits; } } // Undo overwrite if incoming last_visit_date is older if (ctx._source.containsKey("last_visit_date")) { ZonedDateTime incoming_last_visit_date = getDate(ctx, 
"last_visit_date"); int difference = // returns -1, 0 or 1 incoming_last_visit_date.compareTo(last_visit_date); if(difference < 0){ ctx._source.last_visit_date = last_visit_date; } } // Undo update of last_eventful_date and snapshot_id if // snapshot_id hasn't changed OR incoming_last_eventful_visit_date is older if (ctx._source.containsKey("snapshot_id")) { String incoming_snapshot_id = ctx._source.getOrDefault("snapshot_id", ""); ZonedDateTime incoming_last_eventful_visit_date = getDate(ctx, "last_eventful_visit_date"); int difference = // returns -1, 0 or 1 incoming_last_eventful_visit_date.compareTo(last_eventful_visit_date); if(snapshot_id == incoming_snapshot_id || difference < 0){ ctx._source.snapshot_id = snapshot_id; ctx._source.last_eventful_visit_date = last_eventful_visit_date; } } // Undo overwrite if incoming last_revision_date is older if (ctx._source.containsKey("last_revision_date")) { ZonedDateTime incoming_last_revision_date = getDate(ctx, "last_revision_date"); int difference = // returns -1, 0 or 1 incoming_last_revision_date.compareTo(last_revision_date); if(difference < 0){ ctx._source.last_revision_date = last_revision_date; } } // Undo overwrite if incoming last_release_date is older if (ctx._source.containsKey("last_release_date")) { ZonedDateTime incoming_last_release_date = getDate(ctx, "last_release_date"); // returns -1, 0 or 1 int difference = incoming_last_release_date.compareTo(last_release_date); if(difference < 0){ ctx._source.last_release_date = last_release_date; } } """ # noqa ) actions = [ { "_op_type": "update", "_id": sha1, "_index": write_index, "scripted_upsert": True, "upsert": {**document, "sha1": sha1,}, + "retry_on_conflict": 10, "script": { "source": update_script, "lang": "painless", "params": document, }, } for (sha1, document) in documents_with_sha1 ] indexed_count, errors = helpers.bulk(self._backend, actions, index=write_index) assert isinstance(errors, List) # Make mypy happy send_metric("document:index", count=indexed_count, method_name="origin_update") send_metric( "document:index_error", count=len(errors), method_name="origin_update" ) - def origin_dump(self) -> Iterator[model.Origin]: - results = helpers.scan(self._backend, index=self._get_origin_read_alias()) - for hit in results: - yield self._backend.termvectors( - index=self._get_origin_read_alias(), id=hit["_id"], fields=["*"] - ) - @timed def origin_search( self, *, query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, visit_types: Optional[List[str]] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", min_date_created: str = "", min_date_modified: str = "", min_date_published: str = "", programming_languages: Optional[List[str]] = None, licenses: Optional[List[str]] = None, keywords: Optional[List[str]] = None, sort_by: Optional[List[str]] = None, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: query_clauses: List[Dict[str, Any]] = [] query_filters = [] if url_pattern: query_filters.append(f"origin = {escape(url_pattern)}") if metadata_pattern: query_filters.append(f"metadata = {escape(metadata_pattern)}") # if not query_clauses: # raise ValueError( # "At least one of url_pattern and metadata_pattern must be provided." 
# ) if with_visit: query_filters.append(f"visited = {'true' if with_visit else 'false'}") if min_nb_visits: query_filters.append(f"visits >= {min_nb_visits}") if min_last_visit_date: query_filters.append( f"last_visit >= {min_last_visit_date.replace('Z', '+00:00')}" ) if min_last_eventful_visit_date: query_filters.append( "last_eventful_visit >= " f"{min_last_eventful_visit_date.replace('Z', '+00:00')}" ) if min_last_revision_date: query_filters.append( f"last_revision >= {min_last_revision_date.replace('Z', '+00:00')}" ) if min_last_release_date: query_filters.append( f"last_release >= {min_last_release_date.replace('Z', '+00:00')}" ) if keywords: query_filters.append(f"keyword in {escape(keywords)}") if licenses: query_filters.append(f"license in {escape(licenses)}") if programming_languages: query_filters.append(f"language in {escape(programming_languages)}") if min_date_created: query_filters.append( f"created >= {min_date_created.replace('Z', '+00:00')}" ) if min_date_modified: query_filters.append( f"modified >= {min_date_modified.replace('Z', '+00:00')}" ) if min_date_published: query_filters.append( f"published >= {min_date_published.replace('Z', '+00:00')}" ) if visit_types is not None: query_filters.append(f"visit_type = {escape(visit_types)}") combined_filters = f"({' and '.join(query_filters)})" query = f"{combined_filters}{' and ' if query != '' else ' '}{query}" parsed_query = self._translator.parse_query(query) query_clauses.append(parsed_query["filters"]) field_map = { "visits": "nb_visits", "last_visit": "last_visit_date", "last_eventful_visit": "last_eventful_visit_date", "last_revision": "last_revision_date", "last_release": "last_release_date", "created": "date_created", "modified": "date_modified", "published": "date_published", } if "sortBy" in parsed_query: if sort_by is None: sort_by = [] for sort_by_option in parsed_query["sortBy"]: if sort_by_option[0] == "-": sort_by.append("-" + field_map[sort_by_option[1:]]) else: sort_by.append(field_map[sort_by_option]) if parsed_query.get("limit", 0): limit = parsed_query["limit"] sorting_params: List[Dict[str, Any]] = [] if sort_by: for field in sort_by: order = "asc" if field and field[0] == "-": field = field[1:] order = "desc" if field in ["date_created", "date_modified", "date_published"]: sorting_params.append( { get_expansion(field, "."): { "nested_path": "intrinsic_metadata", "order": order, } } ) elif field in SORT_BY_OPTIONS: sorting_params.append({field: order}) sorting_params.extend( [{"_score": "desc"}, {"sha1": "asc"},] ) body = { "query": { "bool": { "must": query_clauses, "must_not": [{"term": {"blocklisted": True}}], } }, "sort": sorting_params, } if page_token: # TODO: use ElasticSearch's scroll API? 
page_token_content = token_decode(page_token) body["search_after"] = [ page_token_content[b"score"], page_token_content[b"sha1"].decode("ascii"), ] if logger.isEnabledFor(logging.DEBUG): formatted_body = pprint.pformat(body) logger.debug("Search query body: %s", formatted_body) res = self._backend.search( index=self._get_origin_read_alias(), body=body, size=limit ) hits = res["hits"]["hits"] next_page_token: Optional[str] = None if len(hits) == limit: # There are more results after this page; return a pagination token # to get them in a future query last_hit = hits[-1] next_page_token_content = { b"score": last_hit["_score"], b"sha1": last_hit["_source"]["sha1"], } next_page_token = token_encode(next_page_token_content) assert len(hits) <= limit return PagedResult( results=[{"url": hit["_source"]["url"]} for hit in hits], next_page_token=next_page_token, ) def visit_types_count(self) -> Counter: body = { "aggs": { "not_blocklisted": { "filter": {"bool": {"must_not": [{"term": {"blocklisted": True}}]}}, "aggs": { "visit_types": {"terms": {"field": "visit_types", "size": 1000}} }, } } } res = self._backend.search( index=self._get_origin_read_alias(), body=body, size=0 ) buckets = ( res.get("aggregations", {}) .get("not_blocklisted", {}) .get("visit_types", {}) .get("buckets", []) ) return Counter({bucket["key"]: bucket["doc_count"] for bucket in buckets}) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py index dfc0e4e..b185636 100644 --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -1,516 +1,517 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter, defaultdict from datetime import datetime, timezone from itertools import chain import re from typing import Any, Dict, Iterable, Iterator, List, Optional from swh.indexer import codemeta -from swh.model.identifiers import origin_identifier +from swh.model import model +from swh.model.hashutil import hash_to_hex from swh.search.interface import ( SORT_BY_OPTIONS, MinimalOriginDict, OriginDict, PagedResult, ) from swh.search.utils import get_expansion, is_date_parsable _words_regexp = re.compile(r"\w+") def _dict_words_set(d): """Recursively extract set of words from dict content.""" values = set() def extract(obj, words): if isinstance(obj, dict): for k, v in obj.items(): extract(v, words) elif isinstance(obj, list): for item in obj: extract(item, words) else: words.update(_words_regexp.findall(str(obj).lower())) return words return extract(d, values) def _nested_get(nested_dict, nested_keys, default=""): """Extracts values from deeply nested dictionary nested_dict using the nested_keys and returns a list of all of the values discovered in the process. >>> nested_dict = [ ... {"name": [{"@value": {"first": "f1", "last": "l1"}}], "address": "XYZ"}, ... {"name": [{"@value": {"first": "f2", "last": "l2"}}], "address": "ABC"}, ... 
] >>> _nested_get(nested_dict, ["name", "@value", "last"]) ['l1', 'l2'] >>> _nested_get(nested_dict, ["address"]) ['XYZ', 'ABC'] It doesn't allow fetching intermediate values and returns "" for such cases >>> _nested_get(nested_dict, ["name", "@value"]) ['', ''] """ def _nested_get_recursive(nested_dict, nested_keys): try: curr_obj = nested_dict type_curr_obj = type(curr_obj) for i, key in enumerate(nested_keys): if key in curr_obj: curr_obj = curr_obj[key] type_curr_obj = type(curr_obj) else: if type_curr_obj == list: curr_obj = [ _nested_get_recursive(obj, nested_keys[i:]) for obj in curr_obj ] # If value isn't a list or string or integer elif type_curr_obj != str and type_curr_obj != int: return default # If only one element is present in the list, take it out # This ensures a flat array every time if type_curr_obj == list and len(curr_obj) == 1: curr_obj = curr_obj[0] return curr_obj except Exception: return default res = _nested_get_recursive(nested_dict, nested_keys) if type(res) != list: return [res] return res def _tokenize(x): return x.lower().replace(",", " ").split() def _get_sorting_key(origin, field): """Get value of the field from an origin for sorting origins. Here field should be a member of SORT_BY_OPTIONS. If "-" is present at the start of field then invert the value in a way that it reverses the sorting order. """ reversed = False if field[0] == "-": field = field[1:] reversed = True DATETIME_OBJ_MAX = datetime.max.replace(tzinfo=timezone.utc) DATETIME_MIN = "0001-01-01T00:00:00Z" DATE_OBJ_MAX = datetime.max DATE_MIN = "0001-01-01" if field == "score": if reversed: return -origin.get(field, 0) else: return origin.get(field, 0) if field in ["date_created", "date_modified", "date_published"]: date = datetime.strptime( _nested_get(origin, get_expansion(field), DATE_MIN)[0], "%Y-%m-%d" ) if reversed: return DATE_OBJ_MAX - date else: return date elif field in ["nb_visits"]: # unlike other options, nb_visits is of type integer if reversed: return -origin.get(field, 0) else: return origin.get(field, 0) elif field in SORT_BY_OPTIONS: date = datetime.fromisoformat( origin.get(field, DATETIME_MIN).replace("Z", "+00:00") ) if reversed: return DATETIME_OBJ_MAX - date else: return date class InMemorySearch: def __init__(self): pass def check(self): return True def deinitialize(self) -> None: if hasattr(self, "_origins"): del self._origins del self._origin_ids def initialize(self) -> None: self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict) self._origin_ids: List[str] = [] def flush(self) -> None: pass _url_splitter = re.compile(r"\W") def origin_update(self, documents: Iterable[OriginDict]) -> None: for source_document in documents: document: Dict[str, Any] = dict(source_document) - id_ = origin_identifier(document) + id_ = hash_to_hex(model.Origin(url=document["url"]).id) if "url" in document: document["_url_tokens"] = set( self._url_splitter.split(source_document["url"]) ) if "visit_types" in document: document["visit_types"] = set(source_document["visit_types"]) if "visit_types" in self._origins[id_]: document["visit_types"].update(self._origins[id_]["visit_types"]) if "nb_visits" in document: document["nb_visits"] = max( document["nb_visits"], self._origins[id_].get("nb_visits", 0) ) if "last_visit_date" in document: document["last_visit_date"] = max( datetime.fromisoformat(document["last_visit_date"]), datetime.fromisoformat( self._origins[id_] .get("last_visit_date", "0001-01-01T00:00:00.000000Z",) .replace("Z", "+00:00") ), ).isoformat() if "snapshot_id" in document 
and "last_eventful_visit_date" in document: incoming_date = datetime.fromisoformat( document["last_eventful_visit_date"] ) current_date = datetime.fromisoformat( self._origins[id_] .get("last_eventful_visit_date", "0001-01-01T00:00:00Z",) .replace("Z", "+00:00") ) incoming_snapshot_id = document["snapshot_id"] current_snapshot_id = self._origins[id_].get("snapshot_id", "") if ( incoming_snapshot_id == current_snapshot_id or incoming_date < current_date ): # update not required so override the incoming_values document["snapshot_id"] = current_snapshot_id document["last_eventful_visit_date"] = current_date.isoformat() if "last_revision_date" in document: document["last_revision_date"] = max( datetime.fromisoformat(document["last_revision_date"]), datetime.fromisoformat( self._origins[id_] .get("last_revision_date", "0001-01-01T00:00:00Z",) .replace("Z", "+00:00") ), ).isoformat() if "last_release_date" in document: document["last_release_date"] = max( datetime.fromisoformat(document["last_release_date"]), datetime.fromisoformat( self._origins[id_] .get("last_release_date", "0001-01-01T00:00:00Z",) .replace("Z", "+00:00") ), ).isoformat() if "intrinsic_metadata" in document: intrinsic_metadata = document["intrinsic_metadata"] for date_field in ["dateCreated", "dateModified", "datePublished"]: if date_field in intrinsic_metadata: date = intrinsic_metadata[date_field] # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) if not is_date_parsable(date): intrinsic_metadata.pop(date_field) document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) if len(document["intrinsic_metadata"]) != 1: continue metadata = document["intrinsic_metadata"][0] if "http://schema.org/license" in metadata: metadata["http://schema.org/license"] = [ {"@id": license["@id"].lower()} for license in metadata["http://schema.org/license"] ] if "http://schema.org/programmingLanguage" in metadata: metadata["http://schema.org/programmingLanguage"] = [ {"@value": license["@value"].lower()} for license in metadata["http://schema.org/programmingLanguage"] ] self._origins[id_].update(document) if id_ not in self._origin_ids: self._origin_ids.append(id_) def origin_search( self, *, query: str = "", url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, visit_types: Optional[List[str]] = None, min_nb_visits: int = 0, min_last_visit_date: str = "", min_last_eventful_visit_date: str = "", min_last_revision_date: str = "", min_last_release_date: str = "", min_date_created: str = "", min_date_modified: str = "", min_date_published: str = "", programming_languages: Optional[List[str]] = None, licenses: Optional[List[str]] = None, keywords: Optional[List[str]] = None, sort_by: Optional[List[str]] = None, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: hits = self._get_hits() if url_pattern: tokens = set(self._url_splitter.split(url_pattern)) def predicate(match): missing_tokens = tokens - match["_url_tokens"] if len(missing_tokens) == 0: return True elif len(missing_tokens) > 1: return False else: # There is one missing token, look up by prefix. 
(missing_token,) = missing_tokens return any( token.startswith(missing_token) for token in match["_url_tokens"] ) hits = filter(predicate, hits) if metadata_pattern: metadata_pattern_words = set( _words_regexp.findall(metadata_pattern.lower()) ) def predicate(match): if "intrinsic_metadata" not in match: return False return metadata_pattern_words.issubset( _dict_words_set(match["intrinsic_metadata"]) ) hits = filter(predicate, hits) if not url_pattern and not metadata_pattern: raise ValueError( "At least one of url_pattern and metadata_pattern must be provided." ) next_page_token: Optional[str] = None if with_visit: hits = filter(lambda o: o.get("has_visits"), hits) if min_nb_visits: hits = filter(lambda o: o.get("nb_visits", 0) >= min_nb_visits, hits) if min_last_visit_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_visit_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_visit_date), hits, ) if min_last_eventful_visit_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_eventful_visit_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_eventful_visit_date), hits, ) if min_last_revision_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_revision_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_revision_date), hits, ) if min_last_release_date: hits = filter( lambda o: datetime.fromisoformat( o.get("last_release_date", "0001-01-01T00:00:00Z").replace( "Z", "+00:00" ) ) >= datetime.fromisoformat(min_last_release_date), hits, ) if min_date_created: min_date_created_obj = datetime.strptime(min_date_created, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( _nested_get(o, get_expansion("date_created"))[0], "%Y-%m-%d" ) >= min_date_created_obj, hits, ) if min_date_modified: min_date_modified_obj = datetime.strptime(min_date_modified, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( _nested_get(o, get_expansion("date_modified"))[0], "%Y-%m-%d" ) >= min_date_modified_obj, hits, ) if min_date_published: min_date_published_obj = datetime.strptime(min_date_published, "%Y-%m-%d") hits = filter( lambda o: datetime.strptime( _nested_get(o, get_expansion("date_published"))[0], "%Y-%m-%d" ) >= min_date_published_obj, hits, ) if licenses: queried_licenses = [license_keyword.lower() for license_keyword in licenses] hits = filter( lambda o: any( # If any of the queried licenses are found, include the origin any( # returns True if queried_license_keyword is found # in any of the licenses of the origin queried_license_keyword in origin_license for origin_license in _nested_get(o, get_expansion("licenses")) ) for queried_license_keyword in queried_licenses ), hits, ) if programming_languages: queried_programming_languages = [ lang_keyword.lower() for lang_keyword in programming_languages ] hits = filter( lambda o: any( # If any of the queried languages are found, include the origin any( # returns True if queried_lang_keyword is found # in any of the langs of the origin queried_lang_keyword in origin_lang for origin_lang in _nested_get( o, get_expansion("programming_languages") ) ) for queried_lang_keyword in queried_programming_languages ), hits, ) if keywords: if sort_by: sort_by.append("-score") else: sort_by = ["-score"] from copy import deepcopy hits_list = deepcopy(list(hits)) for origin in hits_list: origin_keywords = [ _tokenize(keyword) for keyword in _nested_get(origin, get_expansion("keywords")) ] 
origin_descriptions = [ _tokenize(description) for description in _nested_get( origin, get_expansion("descriptions") ) ] for q_keyword in keywords: for origin_keyword_tokens in origin_keywords: if q_keyword in origin_keyword_tokens: origin["score"] = origin.get("score", 0) + 2 for origin_description_token in origin_descriptions: if q_keyword in origin_description_token: origin["score"] = origin.get("score", 0) + 1 hits = (origin for origin in hits_list if origin.get("score", 0) > 0) if visit_types is not None: visit_types_set = set(visit_types) hits = filter( lambda o: visit_types_set.intersection(o.get("visit_types", set())), hits, ) hits_list = list(hits) if sort_by: sort_by_list = list(sort_by) hits_list.sort( key=lambda o: tuple( _get_sorting_key(o, field) for field in sort_by_list ) ) start_at_index = int(page_token) if page_token else 0 origins = [ {"url": hit["url"]} for hit in hits_list[start_at_index : start_at_index + limit] ] if len(origins) == limit: next_page_token = str(start_at_index + limit) assert len(origins) <= limit return PagedResult(results=origins, next_page_token=next_page_token,) def visit_types_count(self) -> Counter: hits = self._get_hits() return Counter(chain(*[hit.get("visit_types", []) for hit in hits])) def _get_hits(self) -> Iterator[Dict[str, Any]]: return ( self._origins[id_] for id_ in self._origin_ids if not self._origins[id_].get("blocklisted") ) diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py index c225c55..b7a66a4 100644 --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -1,300 +1,300 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import functools from unittest.mock import MagicMock import pytest +from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, - hash_to_bytes, ) from swh.search.journal_client import ( fetch_last_revision_release_date, process_journal_objects, ) from swh.storage import get_storage DATES = [ TimestampWithTimezone( timestamp=Timestamp(seconds=1234567891, microseconds=0,), offset=120, negative_utc=False, ), TimestampWithTimezone( timestamp=Timestamp(seconds=1234567892, microseconds=0,), offset=120, negative_utc=False, ), TimestampWithTimezone( timestamp=Timestamp(seconds=1234567893, microseconds=0,), offset=120, negative_utc=False, ), TimestampWithTimezone( timestamp=Timestamp(seconds=1234567894, microseconds=0,), offset=120, negative_utc=False, ), ] COMMITTERS = [ Person(fullname=b"foo", name=b"foo", email=b""), Person(fullname=b"bar", name=b"bar", email=b""), ] REVISIONS = [ Revision( message=b"revision_1_message", date=DATES[0], committer=COMMITTERS[0], author=COMMITTERS[0], committer_date=DATES[0], type=RevisionType.GIT, directory=b"\x01" * 20, synthetic=False, metadata=None, parents=( hash_to_bytes("9b918dd063cec85c2bc63cc7f167e29f5894dcbc"), hash_to_bytes("757f38bdcd8473aaa12df55357f5e2f1a318e672"), ), ), Revision( message=b"revision_2_message", date=DATES[1], committer=COMMITTERS[1], author=COMMITTERS[1], committer_date=DATES[1], type=RevisionType.MERCURIAL, directory=b"\x02" * 20, synthetic=False, metadata=None, parents=(), extra_headers=((b"foo", 
b"bar"),), ), Revision( message=b"revision_3_message", date=DATES[2], committer=COMMITTERS[0], author=COMMITTERS[0], committer_date=DATES[2], type=RevisionType.GIT, directory=b"\x03" * 20, synthetic=False, metadata=None, parents=(), ), ] RELEASES = [ Release( name=b"v0.0.1", date=DATES[1], author=COMMITTERS[0], target_type=ObjectType.REVISION, target=b"\x04" * 20, message=b"foo", synthetic=False, ), Release( name=b"v0.0.2", date=DATES[2], author=COMMITTERS[1], target_type=ObjectType.REVISION, target=b"\x05" * 20, message=b"bar", synthetic=False, ), Release( name=b"v0.0.3", date=DATES[3], author=COMMITTERS[1], target_type=ObjectType.REVISION, target=b"\x05" * 20, message=b"foobar", synthetic=False, ), ] SNAPSHOTS = [ Snapshot( branches={ b"target/revision1": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[0].id, ), b"target/revision2": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[1].id, ), b"target/revision3": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[2].id, ), b"target/release1": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[0].id ), b"target/release2": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[1].id ), b"target/release3": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[2].id ), b"target/alias": SnapshotBranch( target_type=TargetType.ALIAS, target=b"target/revision1" ), }, ), Snapshot( branches={ b"target/revision1": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[0].id, ) }, ), Snapshot( branches={ b"target/release1": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[0].id ) }, ), Snapshot(branches={}), ] @pytest.fixture def storage(): storage = get_storage("memory") storage.revision_add(REVISIONS) storage.release_add(RELEASES) storage.snapshot_add(SNAPSHOTS) return storage def test_journal_client_origin_from_journal(): search_mock = MagicMock() worker_fn = functools.partial(process_journal_objects, search=search_mock,) worker_fn({"origin": [{"url": "http://foobar.baz"},]}) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz"},] ) search_mock.reset_mock() worker_fn({"origin": [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]}) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},] ) def test_journal_client_origin_visit_status_from_journal(storage): search_mock = MagicMock() worker_fn = functools.partial( process_journal_objects, search=search_mock, storage=storage ) current_datetime = datetime.now(tz=timezone.utc) worker_fn( { "origin_visit_status": [ { "origin": "http://foobar.baz", "status": "full", "type": "git", "visit": 5, "date": current_datetime, "snapshot": SNAPSHOTS[0].id, } # full visits ok ] } ) search_mock.origin_update.assert_called_once_with( [ { "url": "http://foobar.baz", "visit_types": ["git"], "has_visits": True, "nb_visits": 5, "snapshot_id": SNAPSHOTS[0].id.hex(), "last_visit_date": current_datetime.isoformat(), "last_eventful_visit_date": current_datetime.isoformat(), "last_revision_date": "2009-02-14T01:31:33+02:00", "last_release_date": "2009-02-14T01:31:34+02:00", }, ] ) search_mock.reset_mock() # non-full visits only set the visit_types attribute worker_fn( { "origin_visit_status": [ { "origin": "http://foobar.baz", "type": "git", "status": "partial", "visit": 5, "date": current_datetime, } ] } ) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz", "visit_types": ["git"]}] ) def 
test_journal_client_origin_metadata_from_journal(): search_mock = MagicMock() worker_fn = functools.partial(process_journal_objects, search=search_mock,) worker_fn( { "origin_intrinsic_metadata": [ { "id": "http://foobar.baz", "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", "license": "MIT", }, }, ] } ) search_mock.origin_update.assert_called_once_with( [ { "url": "http://foobar.baz", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", "license": "MIT", }, }, ] ) def test_fetch_last_revision_release_date(storage): for snapshot in SNAPSHOTS: assert fetch_last_revision_release_date(snapshot.id, storage) is not None diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py index 9dcf018..af0b675 100644 --- a/swh/search/tests/test_translator.py +++ b/swh/search/tests/test_translator.py @@ -1,351 +1,400 @@ import pytest from swh.search.translator import Translator from swh.search.utils import get_expansion def _test_results(query, expected): output = Translator().parse_query(query) assert output == expected def test_empty_query(): query = "" with pytest.raises(Exception): _test_results(query, {}) def test_conjunction_operators(): query = "visited = true or visits > 2 and visits < 5" expected = { "filters": { "bool": { "should": [ {"term": {"has_visits": True}}, { "bool": { "must": [ {"range": {"nb_visits": {"gt": 2}}}, {"range": {"nb_visits": {"lt": 5}}}, ] } }, ] } } } _test_results(query, expected) def test_conjunction_op_precedence_override(): query = "(visited = false or visits > 2) and visits < 5" expected = { "filters": { "bool": { "must": [ { "bool": { "should": [ {"term": {"has_visits": False}}, {"range": {"nb_visits": {"gt": 2}}}, ] } }, {"range": {"nb_visits": {"lt": 5}}}, ] } } } _test_results(query, expected) def test_limit_and_sortby(): query = "visited = true sort_by = [-visits,last_visit] limit = 15" expected = { "filters": {"term": {"has_visits": True}}, "sortBy": ["-visits", "last_visit"], "limit": 15, } _test_results(query, expected) def test_deeply_nested_filters(): query = "(((visited = true and visits > 0)))" expected = { "filters": { "bool": { "must": [ {"term": {"has_visits": True},}, {"range": {"nb_visits": {"gt": 0}}}, ] } }, } _test_results(query, expected) def test_origin_and_metadata_filters(): query = 'origin = django or metadata = "framework and web"' expected = { "filters": { "bool": { "should": [ { "multi_match": { "query": "django", "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } }, { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": "framework and web", "type": "cross_fields", "operator": "and", "fields": ["intrinsic_metadata.*"], "lenient": True, } }, } }, ] } } } _test_results(query, expected) def test_visits_not_equal_to_filter(): query = "visits != 5" expected = { "filters": { "bool": {"must_not": [{"range": {"nb_visits": {"gte": 5, "lte": 5}}},]} }, } _test_results(query, expected) def test_visit_type_filter(): query = 'visit_type = [git,"pypi"]' expected = {"filters": {"terms": {"visit_types": ["git", "pypi"]}}} _test_results(query, expected) def test_keyword_filter(): query = r"""keyword in [word1, "word2 \" \' word3"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""word1 word2 " ' 
word3""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) def test_language_filter(): query = 'language in [python, "go lang", cpp]' expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ { "match": { get_expansion( "programming_languages", "." ): "python" } }, { "match": { get_expansion( "programming_languages", "." ): "go lang" } }, { "match": { get_expansion("programming_languages", "."): "cpp" } }, ] } }, } } } _test_results(query, expected) def test_license_filter(): query = 'license in ["GPL 3", Apache, MIT]' expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion("licenses", "."): "GPL 3"}}, {"match": {get_expansion("licenses", "."): "Apache"}}, {"match": {get_expansion("licenses", "."): "MIT"}}, ] } }, } } } _test_results(query, expected) def test_date_created_not_equal_to_filter(): query = "created != 2020-01-01" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must_not": [ { "range": { get_expansion("date_created", "."): { "gte": "2020-01-01", "lte": "2020-01-01", } } } ] } }, } } } _test_results(query, expected) def test_date_created_greater_than_filter(): query = "created >= 2020-01-01" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion("date_created", "."): { "gte": "2020-01-01", } } } ] } }, } } } _test_results(query, expected) def test_last_eventful_visit_not_equal_to_filter(): query = "last_visit != 2020-01-01" expected = { "filters": { "bool": { "must_not": [ { "range": { "last_visit_date": { "gte": "2020-01-01", "lte": "2020-01-01", } } } ] } } } _test_results(query, expected) def test_last_eventful_visit_less_than_to_filter(): query = "last_visit < 2020-01-01" expected = {"filters": {"range": {"last_visit_date": {"lt": "2020-01-01"}}}} _test_results(query, expected) def test_keyword_no_escape_inside_filter(): # any keyword (filter name/operator/value) inside a filter # must be considered a string. 
query = r'''origin = "language in [\'go lang\', python]"''' expected = { "filters": { "multi_match": { "query": r"""language in ['go lang', python]""", "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } } _test_results(query, expected) -def test_escaped_punctutation_parsing(): +def test_escaped_punctuation_parsing(): query = r"""keyword in ["foo \'\" bar"]""" expected = { "filters": { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": r"""foo '" bar""", "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), ], } }, } } } _test_results(query, expected) + + +def test_nonascii(): + query = r"""keyword in ["café"]""" + expected = { + "filters": { + "nested": { + "path": "intrinsic_metadata", + "query": { + "multi_match": { + "query": r"""café""", + "fields": [ + get_expansion("keywords", ".") + "^2", + get_expansion("descriptions", "."), + ], + } + }, + } + } + } + _test_results(query, expected) + + +def test_nonascii_before_operator(): + query = r"""keyword in ["🐍"] and visited = true""" + expected = { + "filters": { + "bool": { + "must": [ + { + "nested": { + "path": "intrinsic_metadata", + "query": { + "multi_match": { + "query": r"""🐍""", + "fields": [ + get_expansion("keywords", ".") + "^2", + get_expansion("descriptions", "."), + ], + } + }, + }, + }, + {"term": {"has_visits": True,},}, + ], + } + } + } + _test_results(query, expected) diff --git a/swh/search/translator.py b/swh/search/translator.py index 4229bde..af3b16d 100644 --- a/swh/search/translator.py +++ b/swh/search/translator.py @@ -1,307 +1,307 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import os import tempfile from pkg_resources import resource_filename from tree_sitter import Language, Parser from swh.search.utils import get_expansion, unescape logger = logging.getLogger(__name__) class Translator: RANGE_OPERATOR_MAP = { ">": "gt", "<": "lt", ">=": "gte", "<=": "lte", } def __init__(self): ql_path = resource_filename("swh.search", "static/swh_ql.so") if not os.path.exists(ql_path): logging.info("%s does not exist, building in temporary directory", ql_path) self._build_dir = tempfile.TemporaryDirectory(prefix="swh.search-build") source_path = resource_filename("swh.search", "query_language") ql_path = os.path.join(self._build_dir.name, "swh_ql.so") Language.build_library(ql_path, [source_path]) search_ql = Language(ql_path, "swh_search_ql") self.parser = Parser() self.parser.set_language(search_ql) self.query = "" def parse_query(self, query): - self.query = query - tree = self.parser.parse(query.encode("utf8")) + self.query = query.encode() + tree = self.parser.parse(self.query) self.query_node = tree.root_node if self.query_node.has_error: raise Exception("Invalid query") return self._traverse(self.query_node) def _traverse(self, node): if len(node.children) == 3 and node.children[1].type == "filters": # filters => ( filters ) return self._traverse(node.children[1]) # Go past the () brackets if node.type == "query": result = {} for child in node.children: # query => filters sort_by limit result[child.type] = self._traverse(child) return result if node.type == "filters": if len(node.children) == 1: # query => filters # filters => filters # filters => filter # Current 
node is just a wrapper, so go one level deep return self._traverse(node.children[0]) if len(node.children) == 3: # filters => filters conj_op filters filters1 = self._traverse(node.children[0]) conj_op = self._get_value(node.children[1]) filters2 = self._traverse(node.children[2]) if conj_op == "and": # "must" is equivalent to "AND" return {"bool": {"must": [filters1, filters2]}} if conj_op == "or": # "should" is equivalent to "OR" return {"bool": {"should": [filters1, filters2]}} if node.type == "filter": filter_category = node.children[0] return self._parse_filter(filter_category) if node.type == "sortBy": return self._parse_filter(node) if node.type == "limit": return self._parse_filter(node) raise Exception( f"Unknown node type ({node.type}) " f"or unexpected number of children ({node.children})" ) def _get_value(self, node): if ( len(node.children) > 0 and node.children[0].type == "[" and node.children[-1].type == "]" ): # array return [self._get_value(child) for child in node.children if child.is_named] start = node.start_point[1] end = node.end_point[1] - value = self.query[start:end] + value = self.query[start:end].decode() if len(value) > 1 and ( (value[0] == "'" and value[-1] == "'") or (value[0] == '"' and value[-1] == '"') ): return unescape(value[1:-1]) if node.type in ["number", "numberVal"]: return int(value) return unescape(value) def _parse_filter(self, filter): if filter.type == "boundedListFilter": filter = filter.children[0] children = filter.children assert len(children) == 3 category = filter.type name, op, value = [self._get_value(child) for child in children] if category == "patternFilter": if name == "origin": return { "multi_match": { "query": value, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } elif name == "metadata": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": value, # Makes it so that the "foo bar" query returns # documents which contain "foo" in a field and "bar" # in a different field "type": "cross_fields", # All keywords must be found in a document for it to # be considered a match. # TODO: allow missing keywords? "operator": "and", # Searches on all fields of the intrinsic_metadata dict, # recursively.
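# (After expansion these are dotted paths such as # "intrinsic_metadata.http://schema.org/description.@value"; # cf. get_expansion() in swh/search/utils.py.)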
"fields": ["intrinsic_metadata.*"], # date{Created,Modified,Published} are of type date "lenient": True, } }, } } if category == "booleanFilter": if name == "visited": return {"term": {"has_visits": value == "true"}} if category == "numericFilter": if name == "visits": if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ {"range": {"nb_visits": {"gte": value, "lte": value}}} ] } } else: return { "range": {"nb_visits": {self.RANGE_OPERATOR_MAP[op]: value}} } if category == "visitTypeFilter": if name == "visit_type": return {"terms": {"visit_types": value}} if category == "unboundedListFilter": value_array = value if name == "keyword": return { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": " ".join(value_array), "fields": [ get_expansion("keywords", ".") + "^2", get_expansion("descriptions", "."), # "^2" boosts an origin's score by 2x # if it the queried keywords are # found in its intrinsic_metadata.keywords ], } }, } } elif name in ["language", "license"]: name_mapping = { "language": "programming_languages", "license": "licenses", } name = name_mapping[name] return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "should": [ {"match": {get_expansion(name, "."): val}} for val in value_array ], } }, } } if category == "dateFilter": if name in ["created", "modified", "published"]: if op in ["=", "!="]: return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { get_expansion(f"date_{name}", "."): { "gte": value, "lte": value, } } } ], } }, } } return { "nested": { "path": "intrinsic_metadata", "query": { "bool": { "must": [ { "range": { get_expansion(f"date_{name}", "."): { self.RANGE_OPERATOR_MAP[op]: value, } } } ], } }, } } else: if op in ["=", "!="]: return { "bool": { ("must" if op == "=" else "must_not"): [ { "range": { f"{name}_date": {"gte": value, "lte": value,} } } ], } } return { "range": { f"{name}_date": { self.RANGE_OPERATOR_MAP[op]: value.replace("Z", "+00:00"), } } } if category == "sortBy": return value if category == "limit": return value raise Exception(f"Unknown filter {category}.{name}") diff --git a/swh/search/utils.py b/swh/search/utils.py index aaa9ed8..e55b26a 100644 --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -1,105 +1,112 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import codecs from datetime import datetime import iso8601 # type: ignore def get_expansion(field, sep=None): METADATA_FIELDS = { "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"], "programming_languages": [ "intrinsic_metadata", "http://schema.org/programmingLanguage", "@value", ], "keywords": ["intrinsic_metadata", "http://schema.org/keywords", "@value",], "descriptions": [ "intrinsic_metadata", "http://schema.org/description", "@value", ], "date_created": [ "intrinsic_metadata", "http://schema.org/dateCreated", "@value", ], "date_modified": [ "intrinsic_metadata", "http://schema.org/dateModified", "@value", ], "date_published": [ "intrinsic_metadata", "http://schema.org/datePublished", "@value", ], } if sep: return sep.join(METADATA_FIELDS[field]) return METADATA_FIELDS[field] def is_date_parsable(date_str): """ Return True if date_str is in the format %Y-%m-%d or the standard ISO format. Otherwise return False. 
""" try: datetime.strptime(date_str, "%Y-%m-%d") return True except Exception: try: iso8601.parse_date(date_str) return True except Exception: return False def escape(obj): r"""Makes the object directly injectable into the query language by converting the escapable parts of the object into escape sequences. For strings, appends \ before special characters like ', ", and \ For arrays, applies the same transformation on each element, joins the elements and returns a string-like representation of the list. >>> print(escape("foo ' bar")) "foo \' bar" >>> print(escape([r"foo ' bar", r"bar \\\' baz", r'foo " baz'])) ["foo \' bar", "bar \\\\\\\' baz", "foo \" baz"] """ if type(obj) == list: items = [escape(item) for item in obj] return "[" + ", ".join(items) + "]" elif type(obj) == str: return ( '"' + obj.translate({ord("'"): r"\'", ord('"'): r"\"", ord("\\"): r"\\",}) + '"' ) else: raise Exception(f"Unexpected item type {type(obj)}") def unescape(string): r"""Processes the escaped special characters >>> unescape(r'''foo " bar''') == r'''foo " bar''' True >>> unescape(r'''foo \" bar''') == r'''foo " bar''' True >>> unescape(r'''foo \\" bar''') == r'''foo \" bar''' True >>> unescape(r'''foo \\\" bar''') == r'''foo \" bar''' True >>> unescape(r'''foo \\\\" bar''') == r'''foo \\" bar''' True + >>> unescape(r'''café \" foo''') == r'''café " foo''' + True """ - - return bytes(string, "utf-8").decode("unicode_escape") + return codecs.escape_decode(string.encode())[0].decode()