问题
I use the option "output_unigrams_if_no_shingles": true in the "shingle_filter" filter so that the suggestion search shows only shingles in the results, but the suggestions display the ngrams instead:
"shingle_filter": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
"output_unigrams_if_no_shingles": true
My mapping is below:
{
"settings": {
"index": {
"number_of_shards": "5",
"number_of_replicas": "0",
"analysis": {
"filter": {
"stemmer_plural_portugues": {
"name": "minimal_portuguese",
"stopwords" : ["http", "https", "ftp", "www"],
"type": "stemmer"
},
"ngram_filter": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
},
"synonym_filter": {
"type": "synonym",
"lenient": true,
"synonyms_path": "analysis/synonym.txt",
"updateable" : false
},
"shingle_filter": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
"output_unigrams_if_no_shingles": true
}
},
"analyzer": {
"analyzer_customizado": {
"filter": [
"lowercase",
"stemmer_plural_portugues",
"asciifolding",
"synonym_filter",
"ngram_filter",
"shingle_filter"
],
"tokenizer": "lowercase"
}
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "long"
},
"data": {
"type": "date"
},
"quebrado": {
"type": "byte"
},
"pgrk": {
"type": "integer"
},
"url_length": {
"type": "integer"
},
"title": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"description": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"url": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
}
}
}
}
I index the document below:
{
"title": "shopping",
"description": "sex video",
"url": "www.ohcs.com"
}
In my suggestion query below, I deliberately misspell "video" as "vidio":
{
"suggest": {
"text": "vidio",
"simple_phrase": {
"phrase": {
"field": "description",
"size": 1,
"max_errors": 100,
"direct_generator": [
{
"field" : "description",
"suggest_mode" : "always",
"min_word_length" : 1
}
],
"collate": {
"query": {
"source" : {
"match": {
"{{field_name}}": {
"query": "{{suggestion}}",
"operator": "and"
}
}
}
},
"params": {"field_name" : "description"},
"prune": true
},
"highlight": {
"pre_tag": "<strong>",
"post_tag": "</strong>"
}
}
}
}
}
In the result below, the suggestion search finds the correct suggestion "video", but displays it as several ngram tokens instead of the entire word:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 0,
"relation": "eq"
},
"max_score": null,
"hits": [
]
},
"suggest": {
"simple_phrase": [
{
"text": "vidio",
"offset": 0,
"length": 5,
"options": [
{
"text": "vid ide deo",
"highlighted": "vid <strong>ide deo</strong>",
"score": 0.2648209,
"collate_match": true
}
]
}
]
}
}
How do I get the suggestion results to display the entire word "video" without it being split into several ngram tokens?
回答1:
I changed the mapping as you suggested:
{
"settings": {
"index": {
"number_of_shards": "5",
"number_of_replicas": "0",
"max_ngram_diff": 2,
"analysis": {
"filter": {
"stemmer_plural_portugues": {
"name": "minimal_portuguese",
"stopwords" : ["http", "https", "ftp", "www"],
"type": "stemmer"
},
"ngram_filter": {
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
},
"synonym_filter": {
"type": "synonym",
"lenient": true,
"synonyms_path": "analysis/synonym.txt",
"updateable" : false
},
"shingle_filter": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 3,
"output_unigrams" : false,
"output_unigrams_if_no_shingles" : true
}
},
"analyzer": {
"analyzer_customizado": {
"filter": [
"lowercase",
"stemmer_plural_portugues",
"asciifolding",
"synonym_filter",
"ngram_filter",
"shingle_filter"
],
"tokenizer": "lowercase"
}
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "long"
},
"data": {
"type": "date"
},
"quebrado": {
"type": "byte"
},
"pgrk": {
"type": "integer"
},
"url_length": {
"type": "integer"
},
"title": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"description": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"url": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
}
}
}
}
Then I executed the suggestion query below:
{
"suggest": {
"text": "vidio",
"simple_phrase": {
"phrase": {
"field": "description",
"size": 1,
"max_errors": 100,
"direct_generator": [
{
"field" : "description",
"suggest_mode" : "always",
"min_word_length" : 1
}
],
"collate": {
"query": {
"source" : {
"match": {
"{{field_name}}": {
"query": "{{suggestion}}",
"operator": "and"
}
}
}
},
"params": {"field_name" : "description"},
"prune": true
},
"highlight": {
"pre_tag": "<strong>",
"post_tag": "</strong>"
}
}
}
}
}
The suggestion query returns the error message below:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 4,
"skipped": 0,
"failed": 1,
"failures": [
{
"shard": 4,
"index": "teste",
"node": "IW_SN_-fSkSIySZ4CO9rGA",
"reason": {
"type": "illegal_state_exception",
"reason": "At least one unigram is required but all tokens were ngrams"
}
}
]
},
"hits": {
"total": {
"value": 0,
"relation": "eq"
},
"max_score": null,
"hits": [
]
}
}
回答2:
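The problem is your ngram filter. You set both min_gram and max_gram to 3.
Hence you are getting only 3-letter tokens. You can change max_gram to the value you want. In your example, if you set it to 5, you can get "video" in your output. Note that widening the range this way also requires raising the index setting "index.max_ngram_diff" (default 1) to at least max_gram - min_gram, i.e. 2 here.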
You currently have:
"ngram_filter": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
},
来源:https://stackoverflow.com/questions/62616417/phrase-suggester-with-ngrams