Page Menu | Home | Software Heritage
Paste P871

ES 7.10 tokenization
Active · Public

Authored by vlorentz on Nov 20 2020, 12:24 PM.
$ curl -X PUT "localhost:9200/origin/"
{"acknowledged":true,"shards_acknowledged":true,"index":"origin"}
$ curl -X PUT "localhost:9200/origin/_mapping" -H 'Content-Type: application/json' -d'
{"properties": {
"url": {
"type": "text",
"analyzer": "simple",
"fields": {
"as_you_type": {
"type": "search_as_you_type",
"analyzer": "simple"
}
}
}
}}'
{"acknowledged":true}
$ curl -X POST "localhost:9200/origin/_doc/" -H 'Content-Type: application/json' -d'{"url": "http://barbaz.qux"}'
{"_index":"origin","_type":"_doc","_id":"ybBI5XUBJDHqP9p-BVs2","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
$ curl -X GET "localhost:9200/origin/_termvectors/ybBI5XUBJDHqP9p-BVs2?fields=*&pretty"
{
"_index" : "origin",
"_type" : "_doc",
"_id" : "ybBI5XUBJDHqP9p-BVs2",
"_version" : 1,
"found" : true,
"took" : 1,
"term_vectors" : {
"url.as_you_type._index_prefix" : {
"field_statistics" : {
"sum_doc_freq" : 28,
"doc_count" : 1,
"sum_ttf" : 28
},
"terms" : {
"b" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"ba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"bar" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barb" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz." : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.q" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qu" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux  " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"h" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"ht" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"htt" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http b" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http ba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http bar" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barb" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz." : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.q" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qu" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qux " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
}
}
},
"url.as_you_type._2gram" : {
"field_statistics" : {
"sum_doc_freq" : 1,
"doc_count" : 1,
"sum_ttf" : 1
},
"terms" : {
"http barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
}
}
},
"url.as_you_type" : {
"field_statistics" : {
"sum_doc_freq" : 2,
"doc_count" : 1,
"sum_ttf" : 2
},
"terms" : {
"barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 4
}
]
}
}
},
"url" : {
"field_statistics" : {
"sum_doc_freq" : 3,
"doc_count" : 1,
"sum_ttf" : 3
},
"terms" : {
"barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 13
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 4
}
]
},
"qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 14,
"end_offset" : 17
}
]
}
}
}
}
}
$ curl -X POST "localhost:9200/origin/_search?pretty" -H 'Content-Type: application/json' -d '{"query": {"bool": {"must": [{"multi_match": {"query": "qu", "type": "bool_prefix", "operator": "and", "fields": ["url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram"]}}]}}}'
{
"took" : 35,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}