diff --git a/PKG-INFO b/PKG-INFO index 922684c..22a5020 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.7.0 +Version: 0.7.1 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Description: swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) # Dependencies Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). ## Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) ## Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.search.egg-info/PKG-INFO b/swh.search.egg-info/PKG-INFO index 922684c..22a5020 100644 --- a/swh.search.egg-info/PKG-INFO +++ b/swh.search.egg-info/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.7.0 +Version: 0.7.1 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Description: swh-search ========== Search service for the Software Heritage archive. It is similar to swh-storage in what it contains, but provides different ways to query it: while swh-storage is mostly a key-value store that returns an object from a primary key, swh-search is focused on reverse indices, to allow finding objects that match some criteria; for example full-text search. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) # Dependencies Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). ## Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) ## Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index b90885d..842c837 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,312 +1,312 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from typing import Any, Dict, Iterable, Iterator, List, Optional from elasticsearch import Elasticsearch, helpers import msgpack from swh.indexer import codemeta from swh.model import model from swh.model.identifiers import origin_identifier from swh.search.interface import MinimalOriginDict, OriginDict, PagedResult from swh.search.metrics import send_metric, timed INDEX_NAME_PARAM = "index" READ_ALIAS_PARAM = "read_alias" WRITE_ALIAS_PARAM = "write_alias" ORIGIN_DEFAULT_CONFIG = { INDEX_NAME_PARAM: "origin", READ_ALIAS_PARAM: "origin-read", WRITE_ALIAS_PARAM: "origin-write", } def _sanitize_origin(origin): origin = origin.copy() # Whitelist fields to be saved in Elasticsearch res = {"url": origin.pop("url")} for field_name in ("intrinsic_metadata", "has_visits", "visit_types"): if field_name in origin: res[field_name] = origin.pop(field_name) # Run the JSON-LD expansion algorithm # # to normalize the Codemeta metadata. # This is required as Elasticsearch will needs each field to have a consistent # type across documents to be searchable; and non-expanded JSON-LD documents # can have various types in the same field. For example, all these are # equivalent in JSON-LD: # * {"author": "Jane Doe"} # * {"author": ["Jane Doe"]} # * {"author": {"@value": "Jane Doe"}} # * {"author": [{"@value": "Jane Doe"}]} # and JSON-LD expansion will convert them all to the last one. if "intrinsic_metadata" in res: res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"]) return res def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str: """Tokenize as string an index page result from a search """ page_token = base64.b64encode(msgpack.dumps(index_to_tokenize)) return page_token.decode() def token_decode(page_token: str) -> Dict[bytes, Any]: """Read the page_token """ return msgpack.loads(base64.b64decode(page_token.encode()), raw=True) class ElasticSearch: - def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]]): + def __init__(self, hosts: List[str], indexes: Dict[str, Dict[str, str]] = {}): self._backend = Elasticsearch(hosts=hosts) # Merge current configuration with default values origin_config = indexes.get("origin", {}) self.origin_config = {**ORIGIN_DEFAULT_CONFIG, **origin_config} def _get_origin_index(self) -> str: return self.origin_config[INDEX_NAME_PARAM] def _get_origin_read_alias(self) -> str: return self.origin_config[READ_ALIAS_PARAM] def _get_origin_write_alias(self) -> str: return self.origin_config[WRITE_ALIAS_PARAM] @timed def check(self): return self._backend.ping() def deinitialize(self) -> None: """Removes all indices from the Elasticsearch backend""" self._backend.indices.delete(index="*") def initialize(self) -> None: """Declare Elasticsearch indices, aliases and mappings""" if not self._backend.indices.exists(index=self._get_origin_index()): self._backend.indices.create(index=self._get_origin_index()) if not self._backend.indices.exists_alias(self._get_origin_read_alias()): self._backend.indices.put_alias( index=self._get_origin_index(), name=self._get_origin_read_alias() ) if not self._backend.indices.exists_alias(self._get_origin_write_alias()): self._backend.indices.put_alias( index=self._get_origin_index(), name=self._get_origin_write_alias() ) self._backend.indices.put_mapping( index=self._get_origin_index(), body={ "date_detection": False, "properties": { # sha1 of the URL; used as the document id "sha1": {"type": "keyword", "doc_values": True,}, # Used both to search URLs, and as the result to return # as a response to queries "url": { "type": "text", # To split URLs into token on any character # that is not alphanumerical "analyzer": "simple", # 2-gram and partial-3-gram search (ie. with the end of the # third word potentially missing) "fields": { "as_you_type": { "type": "search_as_you_type", "analyzer": "simple", } }, }, "visit_types": {"type": "keyword"}, # used to filter out origins that were never visited "has_visits": {"type": "boolean",}, "intrinsic_metadata": { "type": "nested", "properties": { "@context": { # don't bother indexing tokens in these URIs, as the # are used as namespaces "type": "keyword", } }, }, }, }, ) @timed def flush(self) -> None: self._backend.indices.refresh(index=self._get_origin_write_alias()) @timed def origin_update(self, documents: Iterable[OriginDict]) -> None: write_index = self._get_origin_write_alias() documents = map(_sanitize_origin, documents) documents_with_sha1 = ( (origin_identifier(document), document) for document in documents ) # painless script that will be executed when updating an origin document update_script = """ // backup current visit_types field value List visit_types = ctx._source.getOrDefault("visit_types", []); // update origin document with new field values ctx._source.putAll(params); // restore previous visit types after visit_types field overriding if (ctx._source.containsKey("visit_types")) { for (int i = 0; i < visit_types.length; ++i) { if (!ctx._source.visit_types.contains(visit_types[i])) { ctx._source.visit_types.add(visit_types[i]); } } } """ actions = [ { "_op_type": "update", "_id": sha1, "_index": write_index, "scripted_upsert": True, "upsert": {**document, "sha1": sha1,}, "script": { "source": update_script, "lang": "painless", "params": document, }, } for (sha1, document) in documents_with_sha1 ] indexed_count, errors = helpers.bulk(self._backend, actions, index=write_index) assert isinstance(errors, List) # Make mypy happy send_metric("document:index", count=indexed_count, method_name="origin_update") send_metric( "document:index_error", count=len(errors), method_name="origin_update" ) def origin_dump(self) -> Iterator[model.Origin]: results = helpers.scan(self._backend, index=self._get_origin_read_alias()) for hit in results: yield self._backend.termvectors( index=self._get_origin_read_alias(), id=hit["_id"], fields=["*"] ) @timed def origin_search( self, *, url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, visit_types: Optional[List[str]] = None, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[MinimalOriginDict]: query_clauses: List[Dict[str, Any]] = [] if url_pattern: query_clauses.append( { "multi_match": { "query": url_pattern, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } ) if metadata_pattern: query_clauses.append( { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": metadata_pattern, # Makes it so that the "foo bar" query returns # documents which contain "foo" in a field and "bar" # in a different field "type": "cross_fields", # All keywords must be found in a document for it to # be considered a match. # TODO: allow missing keywords? "operator": "and", # Searches on all fields of the intrinsic_metadata dict, # recursively. "fields": ["intrinsic_metadata.*"], } }, } } ) if not query_clauses: raise ValueError( "At least one of url_pattern and metadata_pattern must be provided." ) if with_visit: query_clauses.append({"term": {"has_visits": True,}}) if visit_types is not None: query_clauses.append({"terms": {"visit_types": visit_types}}) body = { "query": {"bool": {"must": query_clauses,}}, "sort": [{"_score": "desc"}, {"sha1": "asc"},], } if page_token: # TODO: use ElasticSearch's scroll API? page_token_content = token_decode(page_token) body["search_after"] = [ page_token_content[b"score"], page_token_content[b"sha1"].decode("ascii"), ] res = self._backend.search( index=self._get_origin_read_alias(), body=body, size=limit ) hits = res["hits"]["hits"] next_page_token: Optional[str] = None if len(hits) == limit: # There are more results after this page; return a pagination token # to get them in a future query last_hit = hits[-1] next_page_token_content = { b"score": last_hit["_score"], b"sha1": last_hit["_source"]["sha1"], } next_page_token = token_encode(next_page_token_content) assert len(hits) <= limit return PagedResult( results=[{"url": hit["_source"]["url"]} for hit in hits], next_page_token=next_page_token, )