Elasticsearchに、Analyze APIというものがあるらしいです。
Analyze | Elasticsearch Reference [2.2] | Elastic
テキストを、Analyze(単語分割)した時の結果が分かるAPIみたいです。
これ、Solrの管理UIでは同じような機能が、TokenizerやFilterが適用されていく様子がわかる形で提供されていて、かなり便利に使えていたのですが、Elasticsearchだと最終形しかわからなかったようで、こちらのようなプラグインが作られていたようです。
GitHub - johtani/elasticsearch-extended-analyze: Extend Analyze API Plugin for Elasticsearch
で、それがElasticsearch 2.2.0で入ったと。
※プラグインを使う前にElasticsearch 2.2.0が出てしまった…
https://www.elastic.co/guide/en/elasticsearch/reference/current/release-notes-2.2.0.html
こちらのPull Requestに出てきている方を見て、なるほど、と…。
では、早速使ってみましょう。
通常のAnalyze API
まずは、普通にAnalyze APIを使ってみます。ここで、「kuromoji_analyzer」はKuromoji Analysis Pluginが提供するTokenizer、各種Filterなどを適用したAnalyzerとします。
$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&analyzer=kuromoji_analyzer' -d 'Elasticsearchは、全文検索エンジンです。' { "tokens" : [ { "token" : "elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0 }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2 }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3 }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4 } ] }
こんな結果になります、と。
Analyze API(explain=true)
次に、2.2.0で追加されたexplain=trueを付けて試してみます。
結果は、このように。
$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d 'Elasticsearch は、全文検索エンジンです。' { "detail" : { "custom_analyzer" : true, "charfilters" : [ ], "tokenizer" : { "name" : "kuromoji_tokenizer_search", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "は", "start_offset" : 13, "end_offset" : 14, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 81 af]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", "reading (en)" : "ha" }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", 
"pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" }, { "token" : "です", "start_offset" : 23, "end_offset" : 25, "type" : "word", "position" : 5, "baseForm" : null, "bytes" : "[e3 81 a7 e3 81 99]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "partOfSpeech" : "助動詞", "partOfSpeech (en)" : "auxiliary-verb", "positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu" } ] }, "tokenfilters" : [ { "name" : "kuromoji_baseform", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "は", "start_offset" : 13, "end_offset" : 14, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 81 af]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", 
"reading (en)" : "ha" }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" }, { "token" : "です", "start_offset" : 23, "end_offset" : 25, "type" : "word", "position" : 5, "baseForm" : null, "bytes" : "[e3 81 a7 e3 81 99]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "keyword" : false, "partOfSpeech" : "助動詞", "partOfSpeech (en)" : "auxiliary-verb", "positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu" } ] }, { "name" : "kuromoji_part_of_speech", "tokens" : [ { "token" : "Elasticsearch", 
"start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] }, { "name" : "cjk_width", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, 
"type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] }, { "name" : "stop", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" 
: null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] }, { "name" : "ja_stop", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 
73 65 61 72 63 68]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] }, { "name" : "kuromoji_stemmer", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]", 
"inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] }, { "name" : "lowercase", "tokens" : [ { "token" : "elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[65 6c 61 73 74 69 63 73 65 61 72 63 68]", "inflectionForm" : null, 
"inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-固有名詞-組織", "partOfSpeech (en)" : "noun-proper-organization", "positionLength" : 1, "pronunciation" : null, "pronunciation (en)" : null, "reading" : null, "reading (en)" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e5 85 a8 e6 96 87]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ゼンブン", "pronunciation (en)" : "zembun", "reading" : "ゼンブン", "reading (en)" : "zembun" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e6 a4 9c e7 b4 a2]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-サ変接続", "partOfSpeech (en)" : "noun-verbal", "positionLength" : 1, "pronunciation" : "ケンサク", "pronunciation (en)" : "kensaku", "reading" : "ケンサク", "reading (en)" : "kensaku" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "baseForm" : null, "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "エンジン", "pronunciation (en)" : "enjin", "reading" : "エンジン", "reading (en)" : "enjin" } ] } ] } }
形態素解析時の各種属性なども見られるようになり、だいぶ詳細になりましたね。
ですが、ちょっと長い…。
しかも、JSONで縦に出るのでちょっと読み辛い…。
※みなさん、どうしてるんだろう…
というわけで、遊びを兼ねてこんなスクリプトを書いてみました。
analyze_detail.groovy
import groovy.json.JsonSlurper

// analyze_detail.groovy
//
// Reads the JSON response of the Elasticsearch Analyze API (explain=true) from
// standard input and prints, for every analysis stage, one row of tokens
// followed by one row per selected attribute, with the columns centered and
// separated by `delimiter`.
//
// Usage:
//   curl -XGET '.../_analyze?explain=true&analyzer=...' -d '...' | groovy analyze_detail.groovy

// Column separator. Deliberately assigned without `def` so that it lives in the
// script binding and is therefore visible inside printFormat().
delimiter = "|"

// Attributes printed under each token. Empty the lists to print tokens only.
def interestTokenizerAttrs = [ 'type', 'baseForm', 'partOfSpeech', 'reading' ]
def interestTokenfiltersAttrs = [ 'type', 'baseForm', 'partOfSpeech', 'reading' ]

def slurper = new JsonSlurper()
def explain = slurper.parse(System.in)

// Without a "detail" section (error response, or explain=true was not given)
// there is nothing to format; fail with a message instead of a null-pointer error.
def detail = (explain instanceof Map) ? explain['detail'] : null
if (!detail) {
    System.err.println('No "detail" section found in the input; call the Analyze API with explain=true.')
    System.exit(1)
}

def charfilters = detail['charfilters']
// NOTE(review): Elasticsearch is expected to return an "analyzer" section
// instead of "tokenizer"/"tokenfilters" for non-custom analyzers - confirm
// against the Explain Analyze documentation. It is only used when present.
def analyzer = detail['analyzer']
def tokenizer = detail['tokenizer']
def tokenfilters = detail['tokenfilters']

if (charfilters) println('========== charfilter ==========')
charfilters?.each { charfilter ->
    println("charfilter = ${charfilter['name']}")
    charfilter['filtered_text'].each { println(it) }
    println()
}

if (analyzer) {
    println('========== analyzer ==========')
    def (analyzerTokens, analyzerAttrs) = extractTokenAndAttr(analyzer['tokens'], interestTokenizerAttrs)
    printFormat('analyzer', analyzer['name'], analyzerTokens, analyzerAttrs)
    println()
}

if (tokenizer) {
    println('========== tokenizer ==========')
    def (tokenizerTokens, tokenizerAttrs) = extractTokenAndAttr(tokenizer['tokens'], interestTokenizerAttrs)
    printFormat('tokenizer', tokenizer['name'], tokenizerTokens, tokenizerAttrs)
    println()
}

if (tokenfilters) println("========== tokenfilters ==========")
tokenfilters?.each { tokenfilter ->
    def (filterTokens, filterAttrs) = extractTokenAndAttr(tokenfilter['tokens'], interestTokenfiltersAttrs)
    printFormat('tokenfilter', tokenfilter['name'], filterTokens, filterAttrs)
    println()
}

/**
 * Splits the token entries of one analysis stage into two parallel lists.
 *
 * @param tokenizerOrFilterTokens list of token maps taken from the response
 *                                (a missing list is treated as empty)
 * @param interestAttrs           attribute names to keep, in output order
 * @return a two-element list: [token texts, attribute maps]; the attribute
 *         map at index i belongs to the token at index i and contains only
 *         the requested attributes that the token entry actually has
 */
def extractTokenAndAttr(tokenizerOrFilterTokens, interestAttrs) {
    def entries = tokenizerOrFilterTokens ?: []
    def tokens = entries.collect { entry -> "${entry['token']}".toString() }
    def attrs = entries.collect { entry ->
        interestAttrs.findAll { entry.containsKey(it) }.collectEntries { [(it): entry[it]] }
    }
    [tokens, attrs]
}

/**
 * Prints one stage: a "label = name" header, the token row, then one row per
 * attribute. Each column is as wide as its widest cell (token or
 * "key: value" text) and every cell is centered within that width.
 *
 * Widths are counted in characters, so full-width characters are not
 * compensated for.
 *
 * @param label             stage kind shown in the header
 * @param tokenizerOrFilter stage name shown in the header
 * @param tokens            token texts, one per column
 * @param attrs             attribute maps, parallel to tokens
 */
def printFormat(label, tokenizerOrFilter, tokens, attrs) {
    println("$label = $tokenizerOrFilter")

    // "key: value" cell texts, one list per column.
    def attrTexts = attrs.collect { attr ->
        attr.collect { "${it.key}: ${it.value}".toString() }
    }

    // Column width = widest of the token and its attribute cells.
    // max() of an empty list is null, hence the ?: 0 fallback.
    def widths = [tokens, attrTexts].transpose().collect { pair ->
        Math.max(pair[0].size(), pair[1]*.size().max() ?: 0)
    }

    println([tokens, widths].transpose().collect { it[0].center(it[1]) }.join(delimiter))

    // A stage with no tokens at all (every token filtered out) has no
    // columns, so max() yields null; fall back to zero attribute rows
    // instead of building a range from null.
    def rowCount = attrTexts.collect { it.size() }.max() ?: 0
    (0 ..< rowCount).each { row ->
        def cells = []
        attrTexts.eachWithIndex { texts, column ->
            if (row < texts.size()) cells << texts[row].center(widths[column])
        }
        println(cells.join(delimiter))
    }
}
_analyze?explain=trueの結果を、パイプでこのスクリプトに渡して使います。
$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d 'Elasticsearchは、全文検索エンジンです。' | groovy analyze_detail.groovy % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 24356 100 24304 100 52 3066k 6719 --:--:-- --:--:-- --:--:-- 3390k
こういう表示になります。
========== tokenizer ========== tokenizer = kuromoji_tokenizer_search Elasticsearch | は | 全文 | 検索 | エンジン | です type: word | type: word | type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 助詞-係助詞|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般|partOfSpeech: 助動詞 reading: null | reading: ハ | reading: ゼンブン | reading: ケンサク | reading: エンジン | reading: デス ========== tokenfilters ========== tokenfilter = kuromoji_baseform Elasticsearch | は | 全文 | 検索 | エンジン | です type: word | type: word | type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 助詞-係助詞|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般|partOfSpeech: 助動詞 reading: null | reading: ハ | reading: ゼンブン | reading: ケンサク | reading: エンジン | reading: デス tokenfilter = kuromoji_part_of_speech Elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン tokenfilter = cjk_width Elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン tokenfilter = stop Elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン tokenfilter = ja_stop 
Elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン tokenfilter = kuromoji_stemmer Elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン tokenfilter = lowercase elasticsearch | 全文 | 検索 | エンジン type: word | type: word | type: word | type: word baseForm: null | baseForm: null | baseForm: null | baseForm: null partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般 reading: null | reading: ゼンブン | reading: ケンサク | reading: エンジン
一応Groovyでセンタリングしているのですが、全角文字の幅を考慮していないのでアンバランスになるのはご愛嬌…。
デリミタや、出力する属性は、最初の方で定義しています。トークンだけ欲しければ、属性を定義している2つのListを空にしましょう。
// Settings from the top of analyze_detail.groovy with both attribute lists
// emptied (the original values are kept in the trailing comments).
delimiter = "|"

def interestTokenizerAttrs = [] /* [ 'type', 'baseForm', 'partOfSpeech', 'reading' ] */
def interestTokenfiltersAttrs = [] /* [ 'type', 'baseForm', 'partOfSpeech', 'reading' ] */
こんな感じになります。
========== tokenizer ========== tokenizer = kuromoji_tokenizer_search Elasticsearch|は|全文|検索|エンジン|です ========== tokenfilters ========== tokenfilter = kuromoji_baseform Elasticsearch|は|全文|検索|エンジン|です tokenfilter = kuromoji_part_of_speech Elasticsearch|全文|検索|エンジン tokenfilter = cjk_width Elasticsearch|全文|検索|エンジン tokenfilter = stop Elasticsearch|全文|検索|エンジン tokenfilter = ja_stop Elasticsearch|全文|検索|エンジン tokenfilter = kuromoji_stemmer Elasticsearch|全文|検索|エンジン tokenfilter = lowercase elasticsearch|全文|検索|エンジン
こういうところは、Solrの管理UIが便利だなぁと思うのですが、違うでしょうか…?
追記)
と書いていたら、@johtaniさんからツッコミをいただきました。
@kazuhira_r attributesで出力項目絞ると良いかと思いますー
2016-02-06 18:16:00 via twicca to @kazuhira_r
Analyze APIに、attributesというものがあるそうです。これで属性を絞れるとか。
で、ドキュメントを見ていて
ドキュメントに載ってない…よね…?(^^;URL
とか言ってたら
@kazuhira_r 次のページ
2016-02-06 19:07:08 via twicca to @kazuhira_r
なんともいえないツッコミをいただきました…。
というわけで、載ってますね。
Explain Analyze | Elasticsearch Reference [2.2] | Elastic
というわけで、試してみます。
今回は、attributesにpartOfSpeechとreadingを指定しました。tokenizerとtokenfilterの結果に現れる属性が、主要なものとpartOfSpeech、readingのみになっています。ケースバイケースで、使い分けですかねー。
$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d '{ "text": "Elasticsearchは、全文検索エンジンです。", "attributes": ["partOfSpeech", "reading"] }' { "detail" : { "custom_analyzer" : true, "charfilters" : [ ], "tokenizer" : { "name" : "kuromoji_tokenizer_search", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "は", "start_offset" : 13, "end_offset" : 14, "type" : "word", "position" : 1, "partOfSpeech" : "助詞-係助詞", "reading" : "ハ" }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" }, { "token" : "です", "start_offset" : 23, "end_offset" : 25, "type" : "word", "position" : 5, "partOfSpeech" : "助動詞", "reading" : "デス" } ] }, "tokenfilters" : [ { "name" : "kuromoji_baseform", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "は", "start_offset" : 13, "end_offset" : 14, "type" : "word", "position" : 1, "partOfSpeech" : "助詞-係助詞", "reading" : "ハ" }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" }, { "token" : "です", "start_offset" : 23, "end_offset" : 25, 
"type" : "word", "position" : 5, "partOfSpeech" : "助動詞", "reading" : "デス" } ] }, { "name" : "kuromoji_part_of_speech", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] }, { "name" : "cjk_width", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] }, { "name" : "stop", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] }, { "name" : "ja_stop", "tokens" : [ { "token" : "Elasticsearch", 
"start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] }, { "name" : "kuromoji_stemmer", "tokens" : [ { "token" : "Elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] }, { "name" : "lowercase", "tokens" : [ { "token" : "elasticsearch", "start_offset" : 0, "end_offset" : 13, "type" : "word", "position" : 0, "partOfSpeech" : "名詞-固有名詞-組織", "reading" : null }, { "token" : "全文", "start_offset" : 15, "end_offset" : 17, "type" : "word", "position" : 2, "partOfSpeech" : "名詞-一般", "reading" : "ゼンブン" }, { "token" : "検索", "start_offset" : 17, "end_offset" : 19, "type" : "word", "position" : 3, "partOfSpeech" : "名詞-サ変接続", "reading" : "ケンサク" }, { "token" : "エンジン", "start_offset" : 19, "end_offset" : 23, "type" : "word", "position" : 4, "partOfSpeech" : "名詞-一般", "reading" : "エンジン" } ] } ] } }