diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..1271b63
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include Makefile
+include requirements.txt
+include requirements-swh.txt
+include version.txt
+include README.md
diff --git a/es_config/elasticsearch.keystore b/es_config/elasticsearch.keystore
new file mode 100644
index 0000000..463724a
Binary files /dev/null and b/es_config/elasticsearch.keystore differ
diff --git a/es_config/elasticsearch.yml b/es_config/elasticsearch.yml
new file mode 100644
index 0000000..df04d7c
--- /dev/null
+++ b/es_config/elasticsearch.yml
@@ -0,0 +1,89 @@
+# ======================== Elasticsearch Configuration =========================
+#
+# NOTE: Elasticsearch comes with reasonable defaults for most settings.
+#       Before you set out to tweak and tune the configuration, make sure you
+#       understand what you are trying to accomplish and the consequences.
+#
+# The primary way of configuring a node is via this file. This template lists
+# the most important settings you may want to configure for a production cluster.
+#
+# Please consult the documentation for further information on configuration options:
+# https://www.elastic.co/guide/en/elasticsearch/reference/index.html
+#
+# ---------------------------------- Cluster -----------------------------------
+#
+# Use a descriptive name for your cluster:
+#
+#cluster.name: my-application
+#
+# ------------------------------------ Node ------------------------------------
+#
+# Use a descriptive name for the node:
+#
+#node.name: node-1
+node.name: node-1
+#
+# Add custom attributes to the node:
+#
+#node.attr.rack: r1
+#
+# ----------------------------------- Paths ------------------------------------
+#
+# Path to directory where to store the data (separate multiple locations by comma):
+#
+path.data: /tmp/elasticsearch
+#
+# Path to log files:
+#
+path.logs: /tmp/elasticsearch
+#
+# ----------------------------------- Memory -----------------------------------
+#
+# Lock the memory on startup:
+#
+#bootstrap.memory_lock: true
+#
+# Make sure that the heap size is set to about half the memory available
+# on the system and that the owner of the process is allowed to use this
+# limit.
+#
+# Elasticsearch performs poorly when the system is swapping the memory.
+#
+# ---------------------------------- Network -----------------------------------
+#
+# Set the bind address to a specific IP (IPv4 or IPv6):
+#
+#network.host: 192.168.0.1
+#
+# Set a custom port for HTTP:
+#
+#http.port: 9200
+#
+# For more information, consult the network module documentation.
+#
+# --------------------------------- Discovery ----------------------------------
+#
+# Pass an initial list of hosts to perform discovery when this node is started:
+# The default list of hosts is ["127.0.0.1", "[::1]"]
+#
+#discovery.seed_hosts: ["host1", "host2"]
+#
+# Bootstrap the cluster using an initial set of master-eligible nodes:
+#
+#cluster.initial_master_nodes: ["node-1", "node-2"]
+#
+# For more information, consult the discovery and cluster formation module documentation.
+#
+# ---------------------------------- Gateway -----------------------------------
+#
+# Block initial recovery after a full cluster restart until N nodes are started:
+#
+#gateway.recover_after_nodes: 3
+#
+# For more information, consult the gateway module documentation.
+#
+# ---------------------------------- Various -----------------------------------
+#
+# Require explicit names when deleting indices:
+#
+#action.destructive_requires_name: true
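Note: the template above only overrides node.name, path.data and path.logs; everything else, including the HTTP port, keeps its default. A minimal sanity check against a node started with this file, as a sketch assuming the default 127.0.0.1:9200 and using the elasticsearch client already pulled in by requirements.txt below:

    from elasticsearch import Elasticsearch

    client = Elasticsearch(['127.0.0.1:9200'])
    assert client.ping()                    # node is up and answering HTTP
    nodes = client.nodes.info()['nodes']    # per-node settings, paths, ports
    assert any(n['name'] == 'node-1' for n in nodes.values())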
diff --git a/es_config/jvm.options b/es_config/jvm.options
new file mode 100644
index 0000000..d16eba3
--- /dev/null
+++ b/es_config/jvm.options
@@ -0,0 +1,106 @@
+## JVM configuration
+
+################################################################
+## IMPORTANT: JVM heap size
+################################################################
+##
+## You should always set the min and max JVM heap
+## size to the same value. For example, to set
+## the heap to 4 GB, set:
+##
+## -Xms4g
+## -Xmx4g
+##
+## See https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html
+## for more information
+##
+################################################################
+
+# Xms represents the initial size of total heap space
+# Xmx represents the maximum size of total heap space
+
+-Xms1g
+-Xmx1g
+
+################################################################
+## Expert settings
+################################################################
+##
+## All settings below this section are considered
+## expert settings. Don't tamper with them unless
+## you understand what you are doing
+##
+################################################################
+
+## GC configuration
+-XX:+UseConcMarkSweepGC
+-XX:CMSInitiatingOccupancyFraction=75
+-XX:+UseCMSInitiatingOccupancyOnly
+
+## G1GC Configuration
+# NOTE: G1GC is only supported on JDK version 10 or later.
+# To use G1GC uncomment the lines below.
+# 10-:-XX:-UseConcMarkSweepGC
+# 10-:-XX:-UseCMSInitiatingOccupancyOnly
+# 10-:-XX:+UseG1GC
+# 10-:-XX:InitiatingHeapOccupancyPercent=75
+
+## DNS cache policy
+# cache ttl in seconds for positive DNS lookups noting that this overrides the
+# JDK security property networkaddress.cache.ttl; set to -1 to cache forever
+-Des.networkaddress.cache.ttl=60
+# cache ttl in seconds for negative DNS lookups noting that this overrides the
+# JDK security property networkaddress.cache.negative.ttl; set to -1 to cache
+# forever
+-Des.networkaddress.cache.negative.ttl=10
+
+## optimizations
+
+# pre-touch memory pages used by the JVM during initialization
+-XX:+AlwaysPreTouch
+
+## basic
+
+# explicitly set the stack size
+-Xss1m
+
+# set to headless, just in case
+-Djava.awt.headless=true
+
+# ensure UTF-8 encoding by default (e.g. filenames)
+-Dfile.encoding=UTF-8
+
+# use our provided JNA always versus the system one
+-Djna.nosys=true
+
+# turn off a JDK optimization that throws away stack traces for common
+# exceptions because stack traces are important for debugging
+-XX:-OmitStackTraceInFastThrow
+
+# flags to configure Netty
+-Dio.netty.noUnsafe=true
+-Dio.netty.noKeySetOptimization=true
+-Dio.netty.recycler.maxCapacityPerThread=0
+
+# log4j 2
+-Dlog4j.shutdownHookEnabled=false
+-Dlog4j2.disable.jmx=true
+
+-Djava.io.tmpdir=${ES_TMPDIR}
+
+## heap dumps
+
+# generate a heap dump when an allocation from the Java heap fails
+# heap dumps are created in the working directory of the JVM
+-XX:+HeapDumpOnOutOfMemoryError
+
+# specify an alternative path for heap dumps; ensure the directory exists and
+# has sufficient space
+-XX:HeapDumpPath=/var/lib/elasticsearch
+
+# specify an alternative path for JVM fatal error logs
+-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log
+
+# due to internationalization enhancements in JDK 9 Elasticsearch needs to set the provider to COMPAT otherwise
+# time/date parsing will break in an incompatible way for some date patterns and locales
+9-:-Djava.locale.providers=COMPAT
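The heap guidance in the header (same -Xms/-Xmx, about half of physical memory) can be illustrated with a rough, Linux-only sketch; the 31 GB cap follows Elasticsearch's published advice to stay below the compressed-oops threshold and is not part of this patch:

    import os

    # total physical memory in bytes (POSIX sysconf, Linux-only)
    total = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')

    # half of RAM in whole GiB, clamped to [1, 31]
    heap_gb = max(1, min(31, total // (2 * 1024 ** 3)))
    print('-Xms{0}g\n-Xmx{0}g'.format(heap_gb))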
diff --git a/es_config/log4j2.properties b/es_config/log4j2.properties
new file mode 100644
index 0000000..511b66c
--- /dev/null
+++ b/es_config/log4j2.properties
@@ -0,0 +1,260 @@
+status = error
+
+# log action execution errors for easier debugging
+logger.action.name = org.elasticsearch.action
+logger.action.level = debug
+
+appender.console.type = Console
+appender.console.name = console
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
+
+######## Server JSON ############################
+appender.rolling.type = RollingFile
+appender.rolling.name = rolling
+appender.rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_server.json
+appender.rolling.layout.type = ESJsonLayout
+appender.rolling.layout.type_name = server
+
+appender.rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}-%d{yyyy-MM-dd}-%i.json.gz
+appender.rolling.policies.type = Policies
+appender.rolling.policies.time.type = TimeBasedTriggeringPolicy
+appender.rolling.policies.time.interval = 1
+appender.rolling.policies.time.modulate = true
+appender.rolling.policies.size.type = SizeBasedTriggeringPolicy
+appender.rolling.policies.size.size = 128MB
+appender.rolling.strategy.type = DefaultRolloverStrategy
+appender.rolling.strategy.fileIndex = nomax
+appender.rolling.strategy.action.type = Delete
+appender.rolling.strategy.action.basepath = ${sys:es.logs.base_path}
+appender.rolling.strategy.action.condition.type = IfFileName
+appender.rolling.strategy.action.condition.glob = ${sys:es.logs.cluster_name}-*
+appender.rolling.strategy.action.condition.nested_condition.type = IfAccumulatedFileSize
+appender.rolling.strategy.action.condition.nested_condition.exceeds = 2GB
+################################################
+######## Server - old style pattern ###########
+appender.rolling_old.type = RollingFile
+appender.rolling_old.name = rolling_old
+appender.rolling_old.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}.log
+appender.rolling_old.layout.type = PatternLayout
+appender.rolling_old.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
+
+appender.rolling_old.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}-%d{yyyy-MM-dd}-%i.log.gz
+appender.rolling_old.policies.type = Policies
+appender.rolling_old.policies.time.type = TimeBasedTriggeringPolicy
+appender.rolling_old.policies.time.interval = 1
+appender.rolling_old.policies.time.modulate = true
+appender.rolling_old.policies.size.type = SizeBasedTriggeringPolicy
+appender.rolling_old.policies.size.size = 128MB
+appender.rolling_old.strategy.type = DefaultRolloverStrategy
+appender.rolling_old.strategy.fileIndex = nomax
+appender.rolling_old.strategy.action.type = Delete
+appender.rolling_old.strategy.action.basepath = ${sys:es.logs.base_path}
+appender.rolling_old.strategy.action.condition.type = IfFileName
+appender.rolling_old.strategy.action.condition.glob = ${sys:es.logs.cluster_name}-*
+appender.rolling_old.strategy.action.condition.nested_condition.type = IfAccumulatedFileSize
+appender.rolling_old.strategy.action.condition.nested_condition.exceeds = 2GB
+################################################
+
+rootLogger.level = info
+rootLogger.appenderRef.console.ref = console
+rootLogger.appenderRef.rolling.ref = rolling
+rootLogger.appenderRef.rolling_old.ref = rolling_old
+
+######## Deprecation JSON #######################
+appender.deprecation_rolling.type = RollingFile
+appender.deprecation_rolling.name = deprecation_rolling
+appender.deprecation_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_deprecation.json
+appender.deprecation_rolling.layout.type = ESJsonLayout
+appender.deprecation_rolling.layout.type_name = deprecation
+
+appender.deprecation_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_deprecation-%i.json.gz
+appender.deprecation_rolling.policies.type = Policies
+appender.deprecation_rolling.policies.size.type = SizeBasedTriggeringPolicy
+appender.deprecation_rolling.policies.size.size = 1GB
+appender.deprecation_rolling.strategy.type = DefaultRolloverStrategy
+appender.deprecation_rolling.strategy.max = 4
+#################################################
+######## Deprecation - old style pattern #######
+appender.deprecation_rolling_old.type = RollingFile
+appender.deprecation_rolling_old.name = deprecation_rolling_old
+appender.deprecation_rolling_old.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_deprecation.log
+appender.deprecation_rolling_old.layout.type = PatternLayout
+appender.deprecation_rolling_old.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
+
+appender.deprecation_rolling_old.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _deprecation-%i.log.gz
+appender.deprecation_rolling_old.policies.type = Policies
+appender.deprecation_rolling_old.policies.size.type = SizeBasedTriggeringPolicy
+appender.deprecation_rolling_old.policies.size.size = 1GB
+appender.deprecation_rolling_old.strategy.type = DefaultRolloverStrategy
+appender.deprecation_rolling_old.strategy.max = 4
+#################################################
+logger.deprecation.name = org.elasticsearch.deprecation
+logger.deprecation.level = warn
+logger.deprecation.appenderRef.deprecation_rolling.ref = deprecation_rolling
+logger.deprecation.appenderRef.deprecation_rolling_old.ref = deprecation_rolling_old
+logger.deprecation.additivity = false
+
+######## Search slowlog JSON ####################
+appender.index_search_slowlog_rolling.type = RollingFile
+appender.index_search_slowlog_rolling.name = index_search_slowlog_rolling
+appender.index_search_slowlog_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs\
+  .cluster_name}_index_search_slowlog.json
+appender.index_search_slowlog_rolling.layout.type = ESJsonLayout
+appender.index_search_slowlog_rolling.layout.type_name = index_search_slowlog
+
+appender.index_search_slowlog_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs\
+  .cluster_name}_index_search_slowlog-%i.json.gz
+appender.index_search_slowlog_rolling.policies.type = Policies
+appender.index_search_slowlog_rolling.policies.size.type = SizeBasedTriggeringPolicy
+appender.index_search_slowlog_rolling.policies.size.size = 1GB
+appender.index_search_slowlog_rolling.strategy.type = DefaultRolloverStrategy
+appender.index_search_slowlog_rolling.strategy.max = 4
+#################################################
+######## Search slowlog - old style pattern ####
+appender.index_search_slowlog_rolling_old.type = RollingFile
+appender.index_search_slowlog_rolling_old.name = index_search_slowlog_rolling_old
+appender.index_search_slowlog_rolling_old.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_search_slowlog.log
+appender.index_search_slowlog_rolling_old.layout.type = PatternLayout
+appender.index_search_slowlog_rolling_old.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
+
+appender.index_search_slowlog_rolling_old.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_search_slowlog-%i.log.gz
+appender.index_search_slowlog_rolling_old.policies.type = Policies
+appender.index_search_slowlog_rolling_old.policies.size.type = SizeBasedTriggeringPolicy
+appender.index_search_slowlog_rolling_old.policies.size.size = 1GB
+appender.index_search_slowlog_rolling_old.strategy.type = DefaultRolloverStrategy
+appender.index_search_slowlog_rolling_old.strategy.max = 4
+#################################################
+logger.index_search_slowlog_rolling.name = index.search.slowlog
+logger.index_search_slowlog_rolling.level = trace
+logger.index_search_slowlog_rolling.appenderRef.index_search_slowlog_rolling.ref = index_search_slowlog_rolling
+logger.index_search_slowlog_rolling.appenderRef.index_search_slowlog_rolling_old.ref = index_search_slowlog_rolling_old
+logger.index_search_slowlog_rolling.additivity = false
+
+######## Indexing slowlog JSON ##################
+appender.index_indexing_slowlog_rolling.type = RollingFile
+appender.index_indexing_slowlog_rolling.name = index_indexing_slowlog_rolling
+appender.index_indexing_slowlog_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_indexing_slowlog.json
+appender.index_indexing_slowlog_rolling.layout.type = ESJsonLayout
+appender.index_indexing_slowlog_rolling.layout.type_name = index_indexing_slowlog
+
+appender.index_indexing_slowlog_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_indexing_slowlog-%i.json.gz
+appender.index_indexing_slowlog_rolling.policies.type = Policies
+appender.index_indexing_slowlog_rolling.policies.size.type = SizeBasedTriggeringPolicy
+appender.index_indexing_slowlog_rolling.policies.size.size = 1GB
+appender.index_indexing_slowlog_rolling.strategy.type = DefaultRolloverStrategy
+appender.index_indexing_slowlog_rolling.strategy.max = 4
+#################################################
+######## Indexing slowlog - old style pattern ##
+appender.index_indexing_slowlog_rolling_old.type = RollingFile
+appender.index_indexing_slowlog_rolling_old.name = index_indexing_slowlog_rolling_old
+appender.index_indexing_slowlog_rolling_old.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_indexing_slowlog.log
+appender.index_indexing_slowlog_rolling_old.layout.type = PatternLayout
+appender.index_indexing_slowlog_rolling_old.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
+
+appender.index_indexing_slowlog_rolling_old.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}\
+  _index_indexing_slowlog-%i.log.gz
+appender.index_indexing_slowlog_rolling_old.policies.type = Policies
+appender.index_indexing_slowlog_rolling_old.policies.size.type = SizeBasedTriggeringPolicy
+appender.index_indexing_slowlog_rolling_old.policies.size.size = 1GB
+appender.index_indexing_slowlog_rolling_old.strategy.type = DefaultRolloverStrategy
+appender.index_indexing_slowlog_rolling_old.strategy.max = 4
+#################################################
+
+logger.index_indexing_slowlog.name = index.indexing.slowlog.index
+logger.index_indexing_slowlog.level = trace
+logger.index_indexing_slowlog.appenderRef.index_indexing_slowlog_rolling.ref = index_indexing_slowlog_rolling
+logger.index_indexing_slowlog.appenderRef.index_indexing_slowlog_rolling_old.ref = index_indexing_slowlog_rolling_old
+logger.index_indexing_slowlog.additivity = false
+
+
+appender.audit_rolling.type = RollingFile
+appender.audit_rolling.name = audit_rolling
+appender.audit_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_audit.json
+appender.audit_rolling.layout.type = PatternLayout
+appender.audit_rolling.layout.pattern = {\
+                "type":"audit", \
+                "timestamp":"%d{yyyy-MM-dd'T'HH:mm:ss,SSSZ}"\
+                %varsNotEmpty{, "node.name":"%enc{%map{node.name}}{JSON}"}\
+                %varsNotEmpty{, "node.id":"%enc{%map{node.id}}{JSON}"}\
+                %varsNotEmpty{, "host.name":"%enc{%map{host.name}}{JSON}"}\
+                %varsNotEmpty{, "host.ip":"%enc{%map{host.ip}}{JSON}"}\
+                %varsNotEmpty{, "event.type":"%enc{%map{event.type}}{JSON}"}\
+                %varsNotEmpty{, "event.action":"%enc{%map{event.action}}{JSON}"}\
+                %varsNotEmpty{, "user.name":"%enc{%map{user.name}}{JSON}"}\
+                %varsNotEmpty{, "user.run_by.name":"%enc{%map{user.run_by.name}}{JSON}"}\
+                %varsNotEmpty{, "user.run_as.name":"%enc{%map{user.run_as.name}}{JSON}"}\
+                %varsNotEmpty{, "user.realm":"%enc{%map{user.realm}}{JSON}"}\
+                %varsNotEmpty{, "user.run_by.realm":"%enc{%map{user.run_by.realm}}{JSON}"}\
+                %varsNotEmpty{, "user.run_as.realm":"%enc{%map{user.run_as.realm}}{JSON}"}\
+                %varsNotEmpty{, "user.roles":%map{user.roles}}\
+                %varsNotEmpty{, "origin.type":"%enc{%map{origin.type}}{JSON}"}\
+                %varsNotEmpty{, "origin.address":"%enc{%map{origin.address}}{JSON}"}\
+                %varsNotEmpty{, "realm":"%enc{%map{realm}}{JSON}"}\
+                %varsNotEmpty{, "url.path":"%enc{%map{url.path}}{JSON}"}\
+                %varsNotEmpty{, "url.query":"%enc{%map{url.query}}{JSON}"}\
+                %varsNotEmpty{, "request.method":"%enc{%map{request.method}}{JSON}"}\
+                %varsNotEmpty{, "request.body":"%enc{%map{request.body}}{JSON}"}\
+                %varsNotEmpty{, "request.id":"%enc{%map{request.id}}{JSON}"}\
+                %varsNotEmpty{, "action":"%enc{%map{action}}{JSON}"}\
+                %varsNotEmpty{, "request.name":"%enc{%map{request.name}}{JSON}"}\
+                %varsNotEmpty{, "indices":%map{indices}}\
+                %varsNotEmpty{, "opaque_id":"%enc{%map{opaque_id}}{JSON}"}\
+                %varsNotEmpty{, "x_forwarded_for":"%enc{%map{x_forwarded_for}}{JSON}"}\
+                %varsNotEmpty{, "transport.profile":"%enc{%map{transport.profile}}{JSON}"}\
+                %varsNotEmpty{, "rule":"%enc{%map{rule}}{JSON}"}\
+                %varsNotEmpty{, "event.category":"%enc{%map{event.category}}{JSON}"}\
+                }%n
+# "node.name" node name from the `elasticsearch.yml` settings
+# "node.id" node id which should not change between cluster restarts
+# "host.name" unresolved hostname of the local node
+# "host.ip" the local bound ip (i.e. the ip listening for connections)
+# "event.type" a received REST request is translated into one or more transport requests. This indicates which processing layer generated the event "rest" or "transport" (internal)
+# "event.action" the name of the audited event, e.g. "authentication_failed", "access_granted", "run_as_granted", etc.
+# "user.name" the subject name as authenticated by a realm
+# "user.run_by.name" the original authenticated subject name that is impersonating another one.
+# "user.run_as.name" if this "event.action" is of a run_as type, this is the subject name to be impersonated as.
+# "user.realm" the name of the realm that authenticated "user.name"
+# "user.run_by.realm" the realm name of the impersonating subject ("user.run_by.name")
+# "user.run_as.realm" if this "event.action" is of a run_as type, this is the realm name the impersonated user is looked up from
+# "user.roles" the roles array of the user; these are the roles that are granting privileges
+# "origin.type" it is "rest" if the event is originating (is in relation to) a REST request; possible other values are "transport" and "ip_filter"
+# "origin.address" the remote address and port of the first network hop, i.e. a REST proxy or another cluster node
+# "realm" name of a realm that has generated an "authentication_failed" or an "authentication_successful"; the subject is not yet authenticated
+# "url.path" the URI component between the port and the query string; it is percent (URL) encoded
+# "url.query" the URI component after the path and before the fragment; it is percent (URL) encoded
+# "request.method" the method of the HTTP request, i.e. one of GET, POST, PUT, DELETE, OPTIONS, HEAD, PATCH, TRACE, CONNECT
+# "request.body" the content of the request body entity, JSON escaped
+# "request.id" a synthetic identifier for the incoming request, this is unique per incoming request, and consistent across all audit events generated by that request
+# "action" an action is the most granular operation that is authorized and this identifies it in a namespaced way (internal)
+# "request.name" if the event is in connection to a transport message this is the name of the request class, similar to how rest requests are identified by the url path (internal)
+# "indices" the array of indices that the "action" is acting upon
+# "opaque_id" opaque value conveyed by the "X-Opaque-Id" request header
+# "x_forwarded_for" the addresses from the "X-Forwarded-For" request header, as a verbatim string value (not an array)
+# "transport.profile" name of the transport profile in case this is a "connection_granted" or "connection_denied" event
+# "rule" name of the applied rule if the "origin.type" is "ip_filter"
+# "event.category" fixed value "elasticsearch-audit"
+
+appender.audit_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_audit-%d{yyyy-MM-dd}.json
+appender.audit_rolling.policies.type = Policies
+appender.audit_rolling.policies.time.type = TimeBasedTriggeringPolicy
+appender.audit_rolling.policies.time.interval = 1
+appender.audit_rolling.policies.time.modulate = true
+
+logger.xpack_security_audit_logfile.name = org.elasticsearch.xpack.security.audit.logfile.LoggingAuditTrail
+logger.xpack_security_audit_logfile.level = info
+logger.xpack_security_audit_logfile.appenderRef.audit_rolling.ref = audit_rolling
+logger.xpack_security_audit_logfile.additivity = false
+
+logger.xmlsig.name = org.apache.xml.security.signature.XMLSignature
+logger.xmlsig.level = error
+logger.samlxml_decrypt.name = org.opensaml.xmlsec.encryption.support.Decrypter
+logger.samlxml_decrypt.level = fatal
+logger.saml2_decrypt.name = org.opensaml.saml.saml2.encryption.Decrypter
+logger.saml2_decrypt.level = fatal
diff --git a/jvm.options b/jvm.options
new file mode 100644
index 0000000..e69de29
diff --git a/log4j2.properties b/log4j2.properties
new file mode 100644
index 0000000..e69de29
diff --git a/requirements-swh.txt b/requirements-swh.txt
new file mode 100644
index 0000000..ae53050
--- /dev/null
+++ b/requirements-swh.txt
@@ -0,0 +1,4 @@
+# Add here internal Software Heritage dependencies, one per line.
+swh.core
+swh.journal
+swh.model
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..f03976e
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest
+pytest-elasticsearch
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fff18ad
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+# Add here external Python modules dependencies, one per line. Module names
+# should match https://pypi.python.org/pypi names. For the full spec of
+# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
+vcversioner
+click
+elasticsearch>=7.0.0,<8.0.0
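The elasticsearch line above pins the client to the 7.x series. To double-check a candidate version against that range, the packaging library (not a dependency of this patch) evaluates the same specifier syntax pip does:

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet('>=7.0.0,<8.0.0')
    assert '7.1.0' in spec       # any 7.x release satisfies the pin
    assert '8.0.0' not in spec   # 8.x is excluded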
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..a66caa1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# Copyright (C) 2015-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from setuptools import setup, find_packages
+
+from os import path
+from io import open
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the README file
+with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+
+def parse_requirements(name=None):
+    if name:
+        reqf = 'requirements-%s.txt' % name
+    else:
+        reqf = 'requirements.txt'
+
+    requirements = []
+    if not path.exists(reqf):
+        return requirements
+
+    with open(reqf) as f:
+        for line in f.readlines():
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            requirements.append(line)
+    return requirements
+
+
+setup(
+    name='swh.search',
+    description='Software Heritage search service',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='Software Heritage developers',
+    author_email='swh-devel@inria.fr',
+    url='https://forge.softwareheritage.org/diffusion/DSEA',
+    packages=find_packages(),  # package's modules
+    install_requires=parse_requirements() + parse_requirements('swh'),
+    tests_require=parse_requirements('test'),
+    entry_points='''
+        [swh.cli.subcommands]
+        search=swh.search.cli:search
+    ''',
+    setup_requires=['vcversioner'],
+    extras_require={'testing': parse_requirements('test')},
+    vcversioner={},
+    include_package_data=True,
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Operating System :: OS Independent",
+        "Development Status :: 3 - Alpha",
+    ],
+    project_urls={
+        'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
+        'Funding': 'https://www.softwareheritage.org/donate',
+        'Source': 'https://forge.softwareheritage.org/source/swh-search',
+    },
+)
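parse_requirements() above simply returns the non-blank, non-comment lines of the matching requirements file, so with the files added in this diff it behaves roughly like this (illustrative session, run from the repository root):

    >>> parse_requirements()           # reads requirements.txt
    ['vcversioner', 'click', 'elasticsearch>=7.0.0,<8.0.0']
    >>> parse_requirements('swh')      # reads requirements-swh.txt
    ['swh.core', 'swh.journal', 'swh.model']
    >>> parse_requirements('missing')  # absent file: empty list, not an error
    []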
diff --git a/swh/__init__.py b/swh/__init__.py
new file mode 100644
index 0000000..69e3be5
--- /dev/null
+++ b/swh/__init__.py
@@ -0,0 +1 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/swh/search/__init__.py b/swh/search/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/search/cli.py b/swh/search/cli.py
new file mode 100644
index 0000000..69778b5
--- /dev/null
+++ b/swh/search/cli.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+
+from swh.core.cli import CONTEXT_SETTINGS
+
+
+@click.group(name='search', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def search(ctx):
+    '''Software Heritage Search tools.'''
+    pass
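The group above becomes the `swh search` subcommand through the swh.cli.subcommands entry point declared in setup.py. It can also be exercised directly with click's test runner; a sketch assuming swh.core is installed:

    from click.testing import CliRunner

    from swh.search.cli import search

    runner = CliRunner()
    result = runner.invoke(search, ['--help'])
    assert 'Software Heritage Search tools.' in result.output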
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
new file mode 100644
index 0000000..cf34c72
--- /dev/null
+++ b/swh/search/elasticsearch.py
@@ -0,0 +1,145 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import base64
+from typing import Iterable, Dict, List, Iterator
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk, scan
+import msgpack
+
+from swh.model import model
+from swh.model.identifiers import origin_identifier
+
+
+class ElasticSearch:
+    def __init__(self, hosts: List[str]):
+        self._backend = Elasticsearch(hosts=hosts)
+
+    def check(self):
+        return self._backend.ping()
+
+    def initialize(self) -> None:
+        self._backend.indices.create(
+            index='origin',
+            body={
+                'mappings': {
+                    'properties': {
+                        'url': {
+                            'type': 'text',
+                            # TODO: consider removing fielddata when
+                            # swh-storage allows querying by hash, so the
+                            # full URL does not have to be stored in ES'
+                            # memory. See:
+                            # https://www.elastic.co/guide/en/elasticsearch/reference/current/fielddata.html#before-enabling-fielddata
+                            'fielddata': True,
+                            'analyzer': 'simple',
+                            'fields': {
+                                'as_you_type': {
+                                    'type': 'search_as_you_type',
+                                    'analyzer': 'simple',
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        )
+
+    def origin_add(self, origins: Iterable[model.Origin]) -> None:
+        origins = (origin.to_dict() for origin in origins)
+        actions = [
+            {
+                '_id': origin_identifier(origin),
+                '_index': 'origin',
+                '_source': origin,
+            }
+            for origin in origins
+        ]
+        bulk(self._backend, actions, index='origin', refresh='wait_for')
+
+    def origin_dump(self) -> Iterator[model.Origin]:
+        results = list(scan(self._backend, index='*'))
+        for hit in results:
+            yield self._backend.termvectors(
+                index='origin', id=hit['_id'],
+                fields=['url', 'url.as_you_type', 'url.as_you_type._2gram',
+                        'url.as_you_type._3gram', 'url._2gram', 'url._3gram'])
+
+    def origin_search(
+            self, url_substring: str, cursor: str = None,
+            count: int = 50) -> Dict[str, object]:
+        """Searches for origins matching the `url_substring`.
+
+        Args:
+            url_substring (str): Part of the URL to search for
+            cursor (str): opaque value used for pagination
+            count (int): number of results to return
+
+        Returns:
+            a dictionary with keys:
+            * `cursor`:
+              opaque value used for fetching more results. `None` if there
+              are no more results.
+            * `results`:
+              list of dictionaries with key:
+              * `url`: URL of a matching origin
+        """
+        body = {
+            'query': {
+                'multi_match': {
+                    'query': url_substring,
+                    'type': 'bool_prefix',
+                    'fields': [
+                        'url.as_you_type',
+                        'url.as_you_type._2gram',
+                        'url.as_you_type._3gram',
+                    ]
+                }
+            },
+            'size': count,
+            'sort': [
+                {'_score': 'desc'},
+                {'url': 'asc'},
+            ]
+        }
+        if cursor:
+            cursor = msgpack.loads(base64.b64decode(cursor), raw=False)
+            body['search_after'] = [cursor['score'], cursor['url']]
+
+        res = self._backend.search(
+            index='origin',
+            body=body,
+            size=count,
+        )
+
+        hits = res['hits']['hits']
+
+        if len(hits) == count:
+            last_hit = hits[-1]
+            next_cursor = {
+                'score': last_hit['_score'],
+                'url': last_hit['_source']['url'],
+            }
+            next_cursor = base64.b64encode(msgpack.dumps(next_cursor))
+        else:
+            next_cursor = None
+
+        return {
+            'cursor': next_cursor,
+            'results': [
+                {'url': hit['_source']['url']} for hit in hits
+            ]
+        }
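End to end, the class above is used as follows; a sketch assuming a node reachable on 127.0.0.1:9200 and a purely illustrative origin URL:

    from swh.model.model import Origin
    from swh.search.elasticsearch import ElasticSearch

    search = ElasticSearch(['127.0.0.1:9200'])
    search.initialize()      # creates the 'origin' index and its mapping
    search.origin_add([Origin(url='https://example.com/repo.git', type=None)])

    page = search.origin_search('example')
    print(page['results'])   # [{'url': 'https://example.com/repo.git'}]
    print(page['cursor'])    # None, since there are fewer hits than `count`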
diff --git a/swh/search/tests/__init__.py b/swh/search/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/search/tests/conftest.py b/swh/search/tests/conftest.py
new file mode 100644
index 0000000..c37cef7
--- /dev/null
+++ b/swh/search/tests/conftest.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import socket
+import subprocess
+import time
+
+import elasticsearch
+import pytest
+
+
+def free_port():
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind(('127.0.0.1', 0))
+    port = sock.getsockname()[1]
+    sock.close()
+    return port
+
+
+def wait_for_peer(addr, port):
+    while True:
+        try:
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect((addr, port))
+        except ConnectionRefusedError:
+            time.sleep(0.1)
+        else:
+            sock.close()
+            break
+
+
+CONFIG_TEMPLATE = '''
+node.name: node-1
+path.data: {data}
+path.logs: {logs}
+network.host: 127.0.0.1
+http.port: {http_port}
+transport.port: {transport_port}
+'''
+
+
+def _run_elasticsearch(conf_dir, data_dir, logs_dir, http_port,
+                       transport_port):
+    es_home = '/usr/share/elasticsearch'
+
+    with open(conf_dir + '/elasticsearch.yml', 'w') as fd:
+        fd.write(CONFIG_TEMPLATE.format(
+            data=data_dir,
+            logs=logs_dir,
+            http_port=http_port,
+            transport_port=transport_port))
+
+    with open(conf_dir + '/log4j2.properties', 'w') as fd:
+        pass
+
+    cmd = [
+        '/usr/share/elasticsearch/jdk/bin/java',
+        '-Des.path.home={}'.format(es_home),
+        '-Des.path.conf={}'.format(conf_dir),
+        '-Des.bundled_jdk=true',
+        '-Dlog4j2.disable.jmx=true',
+        '-cp', '{}/lib/*'.format(es_home),
+        'org.elasticsearch.bootstrap.Elasticsearch',
+    ]
+
+    host = '127.0.0.1:{}'.format(http_port)
+
+    with open(logs_dir + '/output.txt', 'w') as fd:
+        p = subprocess.Popen(cmd, stdout=fd, stderr=fd)
+
+    wait_for_peer('127.0.0.1', http_port)
+
+    client = elasticsearch.Elasticsearch([host])
+    assert client.ping()
+
+    return p
+
+
+@pytest.fixture(scope='session')
+def elasticsearch_session(tmpdir_factory):
+    tmpdir = tmpdir_factory.mktemp('elasticsearch')
+    es_conf = tmpdir.mkdir('conf')
+
+    http_port = free_port()
+    transport_port = free_port()
+
+    p = _run_elasticsearch(
+        conf_dir=str(es_conf),
+        data_dir=str(tmpdir.mkdir('data')),
+        logs_dir=str(tmpdir.mkdir('logs')),
+        http_port=http_port,
+        transport_port=transport_port,
+    )
+
+    yield '127.0.0.1:{}'.format(http_port)
+
+    # Check ES didn't stop on its own during the test session
+    assert p.poll() is None
+
+    p.kill()
+    p.wait()
+
+
+@pytest.fixture(scope='function')
+def elasticsearch_host(elasticsearch_session):
+    client = elasticsearch.Elasticsearch([elasticsearch_session])
+    client.indices.delete(index='*')
+    yield elasticsearch_session
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
new file mode 100644
index 0000000..f23d78f
--- /dev/null
+++ b/swh/search/tests/test_search.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.model.model import Origin
+from swh.search.elasticsearch import ElasticSearch
+
+
+def test_origin_url_unique_substring(elasticsearch_host):
+    search = ElasticSearch([elasticsearch_host])
+    search.initialize()
+    search.origin_add([
+        Origin(url='http://foobar.baz', type=None),
+        Origin(url='http://barbaz.qux', type=None),
+    ])
+    search.origin_dump()
+
+    results = search.origin_search('foobar')
+    assert results == {'cursor': None,
+                       'results': [{'url': 'http://foobar.baz'}]}
+
+    results = search.origin_search('barb')
+    assert results == {'cursor': None,
+                       'results': [{'url': 'http://barbaz.qux'}]}
+
+    # 'bar' is part of 'foobar', but is not the beginning of it
+    results = search.origin_search('bar')
+    assert results == {'cursor': None,
+                       'results': [{'url': 'http://barbaz.qux'}]}
+
+    results = search.origin_search('barbaz')
+    assert results == {'cursor': None,
+                       'results': [{'url': 'http://barbaz.qux'}]}
+
+    results = search.origin_search('qux')
+    assert results == {'cursor': None,
+                       'results': [{'url': 'http://barbaz.qux'}]}
diff --git a/tox.ini b/tox.ini
index 335f4ed..5b56c67 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,23 +1,24 @@
 [tox]
 envlist=check-manifest,flake8,py3
 
 [testenv:py3]
 deps =
   .[testing]
   pytest-cov
+  https://github.com/ClearcodeHQ/pytest-elasticsearch/tarball/master#egg=pytest-elasticsearch
 commands =
   pytest --cov=swh --cov-branch {posargs}
 
 [testenv:flake8]
 skip_install = true
 deps =
   flake8
 commands =
   {envpython} -m flake8
 
 [testenv:check-manifest]
 skip_install = true
 deps =
   check-manifest
 commands =
   {envpython} -m check_manifest {toxinidir}
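A natural follow-up test, sketched here rather than part of this patch, would exercise the pagination cursor by forcing count=1 and walking pages until origin_search returns a None cursor:

    def test_origin_search_pagination(elasticsearch_host):
        search = ElasticSearch([elasticsearch_host])
        search.initialize()
        search.origin_add([Origin(url='http://example.com/%d' % i, type=None)
                           for i in range(3)])

        urls = []
        cursor = None
        while True:
            page = search.origin_search('example', cursor=cursor, count=1)
            urls.extend(r['url'] for r in page['results'])
            cursor = page['cursor']
            if cursor is None:
                break

        assert sorted(urls) == ['http://example.com/%d' % i for i in range(3)]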