Използвам ElasticSearch за индексиране на база данни. Опитвам се да използвам токенизатора edgeNGram, за да разделям низовете на по-къси низове, с изискването „новият низ трябва да е по-дълъг от 4 знака“. Използвам следния код за създаване на индекса:
PUT test
POST /test/_close
PUT /test/_settings
{
"analysis": {
"analyzer": {
"index_edge_ngram" : {
"type": "custom",
"filter": ["custom_word_delimiter"],
"tokenizer" : "left_tokenizer"
}
},
"filter" : {
"custom_word_delimiter" : {
"type": "word_delimiter",
"generate_word_parts": "true",
"generate_number_parts": "true",
"catenate_words": "false",
"catenate_numbers": "false",
"catenate_all": "false",
"split_on_case_change": "false",
"preserve_original": "false",
"split_on_numerics": "true",
"ignore_case": "true"
}
},
"tokenizer" : {
"left_tokenizer" : {
"max_gram" : 30,
"min_gram" : 5,
"type" : "edgeNGram"
}
}
}
}
POST /test/_open
Сега провеждам тест, за да прегледам резултатите:
GET /test/_analyze?analyzer=index_edge_ngram&text=please pay for multiple wins with only one payment
и получавам следните резултати:
{
"tokens": [
{
"token": "pleas",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 1
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 2
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 3
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 4
},
{
"token": "p",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 5
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 6
},
{
"token": "pa",
"start_offset": 7,
"end_offset": 9,
"type": "word",
"position": 7
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 8
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 9
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 10
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 11
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 12
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 13
},
{
"token": "f",
"start_offset": 11,
"end_offset": 12,
"type": "word",
"position": 14
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 15
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 16
},
{
"token": "fo",
"start_offset": 11,
"end_offset": 13,
"type": "word",
"position": 17
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 18
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 19
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 20
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 21
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 22
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 23
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 24
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 25
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 26
},
{
"token": "m",
"start_offset": 15,
"end_offset": 16,
"type": "word",
"position": 27
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 28
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 29
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 30
},
{
"token": "mu",
"start_offset": 15,
"end_offset": 17,
"type": "word",
"position": 31
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 32
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 33
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 34
},
{
"token": "mul",
"start_offset": 15,
"end_offset": 18,
"type": "word",
"position": 35
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 36
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 37
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 38
},
{
"token": "mult",
"start_offset": 15,
"end_offset": 19,
"type": "word",
"position": 39
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 40
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 41
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 42
},
{
"token": "multi",
"start_offset": 15,
"end_offset": 20,
"type": "word",
"position": 43
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 44
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 45
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 46
},
{
"token": "multip",
"start_offset": 15,
"end_offset": 21,
"type": "word",
"position": 47
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 48
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 49
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 50
},
{
"token": "multipl",
"start_offset": 15,
"end_offset": 22,
"type": "word",
"position": 51
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 52
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 53
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 54
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 55
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 56
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 57
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 58
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 59
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 60
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 61
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 62
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 63
},
{
"token": "w",
"start_offset": 24,
"end_offset": 25,
"type": "word",
"position": 64
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 65
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 66
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 67
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 68
},
{
"token": "wi",
"start_offset": 24,
"end_offset": 26,
"type": "word",
"position": 69
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 70
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 71
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 72
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 73
},
{
"token": "win",
"start_offset": 24,
"end_offset": 27,
"type": "word",
"position": 74
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 75
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 76
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 77
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 78
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 79
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 80
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 81
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 82
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 83
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 84
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 85
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 86
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 87
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 88
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 89
},
{
"token": "w",
"start_offset": 29,
"end_offset": 30,
"type": "word",
"position": 90
}
]
}
Ето моите въпроси:
Защо има токени, по-къси от 5 знака?
Защо свойството "position" показва позицията на токена, но не и позицията на думата в текста? Изглежда, че другите токенизатори работят по този начин.
Защо в резултата не присъстват всички думи? Изглежда, че анализът спира на "wins".
Защо има толкова много повторения на един и същи токен?