Elasticsearch 2.2.0で追加されたAnalyze API（explain=true）を試す

Elasticsearchに、Analyze APIというものがあるらしいです。

Analyze | Elasticsearch Reference [2.2] | Elastic

テキストを、Analyze（単語分割）した時の結果が分かるAPIみたいです。

Solrの管理UIには同じような機能があり、TokenizerやFilterが適用されていく様子がわかる形でかなり便利に使えていたのですが、Elasticsearchだと最終形しかわからなかったようで、こちらのようなプラグインが作られていたようです。

GitHub - johtani/elasticsearch-extended-analyze: Extend Analyze API Plugin for Elasticsearch

で、それがElasticsearch 2.2.0で入ったと。
※プラグインを使う前にElasticsearch 2.2.0が出てしまった…

https://www.elastic.co/guide/en/elasticsearch/reference/current/release-notes-2.2.0.html

Add detail response support for _analyze API by johtani · Pull Request #11660 · elastic/elasticsearch · GitHub

こちらのPull Requestに出てきている方を見て、なるほど、と…。

では、早速使ってみましょう。

通常のAnalyze API

まずは、普通にAnalyze APIを使ってみます。ここで、「kuromoji_analyzer」はKuromoji Analysis Pluginが提供するTokenizer、各種Filterなどを適用したAnalyzerとします。

$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&analyzer=kuromoji_analyzer' -d 'Elasticsearchは、全文検索エンジンです。'
{
  "tokens" : [ {
    "token" : "elasticsearch",
    "start_offset" : 0,
    "end_offset" : 13,
    "type" : "word",
    "position" : 0
  }, {
    "token" : "全文",
    "start_offset" : 15,
    "end_offset" : 17,
    "type" : "word",
    "position" : 2
  }, {
    "token" : "検索",
    "start_offset" : 17,
    "end_offset" : 19,
    "type" : "word",
    "position" : 3
  }, {
    "token" : "エンジン",
    "start_offset" : 19,
    "end_offset" : 23,
    "type" : "word",
    "position" : 4
  } ]
}

こんな結果になります、と。

Analyze API（explain=true）

次に、2.2.0で追加されたexplain=trueを付けて試してみます。

結果は、このように。

$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d 'Elasticsearchは、全文検索エンジンです。'
{
  "detail" : {
    "custom_analyzer" : true,
    "charfilters" : [ ],
    "tokenizer" : {
      "name" : "kuromoji_tokenizer_search",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "は",
        "start_offset" : 13,
        "end_offset" : 14,
        "type" : "word",
        "position" : 1,
        "baseForm" : null,
        "bytes" : "[e3 81 af]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "partOfSpeech" : "助詞-係助詞",
        "partOfSpeech (en)" : "particle-dependency",
        "positionLength" : 1,
        "pronunciation" : "ワ",
        "pronunciation (en)" : "wa",
        "reading" : "ハ",
        "reading (en)" : "ha"
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      }, {
        "token" : "です",
        "start_offset" : 23,
        "end_offset" : 25,
        "type" : "word",
        "position" : 5,
        "baseForm" : null,
        "bytes" : "[e3 81 a7 e3 81 99]",
        "inflectionForm" : "基本形",
        "inflectionForm (en)" : "base",
        "inflectionType" : "特殊・デス",
        "inflectionType (en)" : "special-desu",
        "partOfSpeech" : "助動詞",
        "partOfSpeech (en)" : "auxiliary-verb",
        "positionLength" : 1,
        "pronunciation" : "デス",
        "pronunciation (en)" : "desu",
        "reading" : "デス",
        "reading (en)" : "desu"
      } ]
    },
    "tokenfilters" : [ {
      "name" : "kuromoji_baseform",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "は",
        "start_offset" : 13,
        "end_offset" : 14,
        "type" : "word",
        "position" : 1,
        "baseForm" : null,
        "bytes" : "[e3 81 af]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "助詞-係助詞",
        "partOfSpeech (en)" : "particle-dependency",
        "positionLength" : 1,
        "pronunciation" : "ワ",
        "pronunciation (en)" : "wa",
        "reading" : "ハ",
        "reading (en)" : "ha"
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      }, {
        "token" : "です",
        "start_offset" : 23,
        "end_offset" : 25,
        "type" : "word",
        "position" : 5,
        "baseForm" : null,
        "bytes" : "[e3 81 a7 e3 81 99]",
        "inflectionForm" : "基本形",
        "inflectionForm (en)" : "base",
        "inflectionType" : "特殊・デス",
        "inflectionType (en)" : "special-desu",
        "keyword" : false,
        "partOfSpeech" : "助動詞",
        "partOfSpeech (en)" : "auxiliary-verb",
        "positionLength" : 1,
        "pronunciation" : "デス",
        "pronunciation (en)" : "desu",
        "reading" : "デス",
        "reading (en)" : "desu"
      } ]
    }, {
      "name" : "kuromoji_part_of_speech",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    }, {
      "name" : "cjk_width",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    }, {
      "name" : "stop",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    }, {
      "name" : "ja_stop",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    }, {
      "name" : "kuromoji_stemmer",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[45 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    }, {
      "name" : "lowercase",
      "tokens" : [ {
        "token" : "elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "baseForm" : null,
        "bytes" : "[65 6c 61 73 74 69 63 73 65 61 72 63 68]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "partOfSpeech (en)" : "noun-proper-organization",
        "positionLength" : 1,
        "pronunciation" : null,
        "pronunciation (en)" : null,
        "reading" : null,
        "reading (en)" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "baseForm" : null,
        "bytes" : "[e5 85 a8 e6 96 87]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "ゼンブン",
        "pronunciation (en)" : "zembun",
        "reading" : "ゼンブン",
        "reading (en)" : "zembun"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "baseForm" : null,
        "bytes" : "[e6 a4 9c e7 b4 a2]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-サ変接続",
        "partOfSpeech (en)" : "noun-verbal",
        "positionLength" : 1,
        "pronunciation" : "ケンサク",
        "pronunciation (en)" : "kensaku",
        "reading" : "ケンサク",
        "reading (en)" : "kensaku"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "baseForm" : null,
        "bytes" : "[e3 82 a8 e3 83 b3 e3 82 b8 e3 83 b3]",
        "inflectionForm" : null,
        "inflectionForm (en)" : null,
        "inflectionType" : null,
        "inflectionType (en)" : null,
        "keyword" : false,
        "partOfSpeech" : "名詞-一般",
        "partOfSpeech (en)" : "noun-common",
        "positionLength" : 1,
        "pronunciation" : "エンジン",
        "pronunciation (en)" : "enjin",
        "reading" : "エンジン",
        "reading (en)" : "enjin"
      } ]
    } ]
  }
}

形態素解析時の各種属性なども見れるようになり、だいぶ詳細になりましたね。

ですが、ちょっと長い…。

しかも、JSONで縦に出るのでちょっと読み辛い…。
※みなさん、どうしてるんだろう…

というわけで、遊びを兼ねてこんなスクリプトを書いてみました。
analyze_detail.groovy

import groovy.json.JsonSlurper

// Column separator used when printing a row of tokens or attributes.
// Declared without `def` on purpose: that puts it in the script binding, so
// the methods defined later in this script (printFormat) can read it.
delimiter = "|"

// Token attributes printed under each token of the tokenizer stage.
// Use an empty list to print the token texts only.
def interestTokenizerAttrs =
  [
    'type',
    'baseForm',
    'partOfSpeech',
    'reading'
  ]

// Token attributes printed under each token of every token filter stage.
// Use an empty list to print the token texts only.
def interestTokenfiltersAttrs =
  [
    'type',
    'baseForm',
    'partOfSpeech',
    'reading'
  ]

// The response of `_analyze?explain=true` is expected on standard input.
def slurper = new JsonSlurper()
def explain = slurper.parse(System.in)
def detail = explain instanceof Map ? explain['detail'] : null

// Without explain=true (or when Elasticsearch answered with an error) the
// response carries no 'detail' object; stop with a readable message instead
// of failing later with a NullPointerException.
if (detail == null) {
  System.err.println("no 'detail' object in the input; was the request sent with explain=true?")
  System.exit(1)
}

def charfilters = detail['charfilters']
def tokenizer = detail['tokenizer']
def tokenfilters = detail['tokenfilters']

// The header is printed only when at least one char filter is present.
if (charfilters) println('========== charfilter ==========')

// Each char filter entry is printed as its name followed by the texts it produced.
charfilters.each { charfilter ->
  def charfilterName = charfilter['name']
  def filterTexts = charfilter['filtered_text']

  println("charfilter = $charfilterName")
  filterTexts.each { println(it) }
  println()
}

// For a built-in (non-custom) analyzer Elasticsearch reports a single
// 'analyzer' section instead of 'tokenizer' / 'tokenfilters'. Print it when
// it is there, using the tokenizer attribute selection.
def analyzer = detail['analyzer']

if (analyzer) {
  println('========== analyzer ==========')

  def analyzerTokensAndAttrs = extractTokenAndAttr(analyzer['tokens'], interestTokenizerAttrs)
  printFormat('analyzer', analyzer['name'], analyzerTokensAndAttrs[0], analyzerTokensAndAttrs[1])

  println()
}

// The tokenizer section is absent for built-in analyzers, so only
// dereference it when it was actually returned.
if (tokenizer) {
  def tokenizerName = tokenizer['name']
  def tokenizerTokens = tokenizer['tokens']

  println('========== tokenizer ==========')

  def tokenizerTokensAndAttrs = extractTokenAndAttr(tokenizerTokens, interestTokenizerAttrs)
  printFormat('tokenizer', tokenizerName, tokenizerTokensAndAttrs[0], tokenizerTokensAndAttrs[1])

  println()
}

// The header is printed only when at least one token filter is present.
if (tokenfilters) println("========== tokenfilters ==========")

// One table per token filter, in the order the filters were applied.
tokenfilters.each { tokenfilter ->
  def tokenfilterName = tokenfilter['name']
  def tokenfilterTokens = tokenfilter['tokens']
  def tokenfilterTokensAndAttrs = extractTokenAndAttr(tokenfilterTokens, interestTokenfiltersAttrs)
  printFormat('tokenfilter', tokenfilterName, tokenfilterTokensAndAttrs[0], tokenfilterTokensAndAttrs[1])
  println()
}

/**
 * Splits the token entries of one analysis stage into two parallel lists.
 *
 * @param tokenizerOrFilterTokens list of token maps (each holding a 'token'
 *        key plus arbitrary attribute keys)
 * @param interestAttrs attribute names to keep, in output order
 * @return a two-element list: [token texts, attribute maps]; the attribute
 *         map at index i belongs to the token at index i and contains only
 *         the requested attributes that the token entry actually has
 */
def extractTokenAndAttr(tokenizerOrFilterTokens, interestAttrs) {
  def surfaces = []
  def selectedAttrs = []

  tokenizerOrFilterTokens.each { entry ->
    // Value stored under 'token', or null when the entry has no such key.
    def surface = entry.find { it.key == 'token' }?.value

    // The token text is kept as an interpolated string, as before.
    surfaces << "$surface"

    // Keep the requested attributes in the order given by interestAttrs.
    selectedAttrs << interestAttrs
      .findAll { entry.containsKey(it) }
      .collectEntries { [(it): entry[it]] }
  }

  [surfaces, selectedAttrs]
}

/**
 * Prints one analysis stage as a small text table: a header line, one row
 * with the token texts, then one row per selected attribute. Every column
 * is centered to the widest cell of that column and the columns are joined
 * with the script-level `delimiter`.
 *
 * Widths are counted in characters, so full-width characters are not
 * compensated for.
 *
 * @param label stage kind shown in the header (e.g. 'tokenizer')
 * @param tokenizerOrFilter name of the tokenizer / filter shown in the header
 * @param tokens token texts, one per column
 * @param attrs attribute maps, parallel to tokens
 */
def printFormat(label, tokenizerOrFilter, tokens, attrs) {
  println("$label = $tokenizerOrFilter")

  // Column width: the longest of the token text and its "key: value" cells.
  // max() of an empty list is null, hence the `?: 0` for attribute-less tokens.
  def maxTokenSize = tokens.collect { it.size() }
  def maxAttrSize = attrs.collect { it.collect { "${it.key}: ${it.value}".size() }.max() }
  def maxSize = [maxTokenSize, maxAttrSize].transpose().collect { Math.max(it[0], it[1] ?: 0) }

  println([tokens, maxSize].transpose().collect { it[0].center(it[1]) }.join(delimiter))

  // attrCentered[column][row] = centered "key: value" cell
  def attrCentered = [attrs, maxSize].transpose().collect {
    def size = it[1]
    it[0].collect { "${it.key}: ${it.value}".center(size) }
  }

  // A stage may have no tokens at all (e.g. a stop filter removed them);
  // max() is then null and must not be used as a range bound.
  def rowCount = attrCentered.collect { it.size() }.max() ?: 0

  (0 ..< rowCount).each { row ->
    def cells = (0 ..< attrCentered.size()).collect { col ->
      def column = attrCentered[col]
      // Pad a missing cell so the following columns stay aligned.
      row < column.size() ? column[row] : ''.center(maxSize[col])
    }
    println(cells.join(delimiter))
  }
}

_analyze?explain=trueの結果に、パイプして使います。

$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d 'Elasticsearchは、全文検索エンジンです。' | groovy analyze_detail.groovy
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 24356  100 24304  100    52  3066k   6719 --:--:-- --:--:-- --:--:-- 3390k

こういう表示になります。

========== tokenizer ==========
tokenizer = kuromoji_tokenizer_search
     Elasticsearch      |         は          |        全文         |         検索          |       エンジン        |       です        
       type: word       |     type: word     |    type: word     |     type: word      |    type: word     |   type: word    
     baseForm: null     |   baseForm: null   |  baseForm: null   |   baseForm: null    |  baseForm: null   | baseForm: null  
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 助詞-係助詞|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般|partOfSpeech: 助動詞
     reading: null      |     reading: ハ     |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   |   reading: デス   

========== tokenfilters ==========
tokenfilter = kuromoji_baseform
     Elasticsearch      |         は          |        全文         |         検索          |       エンジン        |       です        
       type: word       |     type: word     |    type: word     |     type: word      |    type: word     |   type: word    
     baseForm: null     |   baseForm: null   |  baseForm: null   |   baseForm: null    |  baseForm: null   | baseForm: null  
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 助詞-係助詞|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般|partOfSpeech: 助動詞
     reading: null      |     reading: ハ     |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   |   reading: デス   

tokenfilter = kuromoji_part_of_speech
     Elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   

tokenfilter = cjk_width
     Elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   

tokenfilter = stop
     Elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   

tokenfilter = ja_stop
     Elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   

tokenfilter = kuromoji_stemmer
     Elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン   

tokenfilter = lowercase
     elasticsearch      |        全文         |         検索          |       エンジン        
       type: word       |    type: word     |     type: word      |    type: word     
     baseForm: null     |  baseForm: null   |   baseForm: null    |  baseForm: null   
partOfSpeech: 名詞-固有名詞-組織|partOfSpeech: 名詞-一般|partOfSpeech: 名詞-サ変接続|partOfSpeech: 名詞-一般
     reading: null      |   reading: ゼンブン   |    reading: ケンサク    |   reading: エンジン

一応Groovyでセンタリングしているのですが、全角文字の幅を考慮していないのでアンバランスになるのはご愛嬌…。

デリミタや、出力する属性は、スクリプトの最初の方で定義しています。トークンだけ欲しければ、冒頭で定義している属性のList（interestTokenizerAttrsとinterestTokenfiltersAttrs）を両方とも空にしましょう。

// Column separator used when printing a row of tokens.
delimiter = "|"

// Empty list: no attributes are selected for the tokenizer stage, so only
// the token texts are printed. The original selection is kept commented out.
def interestTokenizerAttrs =
  []
  /*
  [
    'type',
    'baseForm',
    'partOfSpeech',
    'reading'
  ]
  */

// Empty list: no attributes are selected for the token filter stages either.
def interestTokenfiltersAttrs =
  []
  /*
  [
    'type',
    'baseForm',
    'partOfSpeech',
    'reading'
  ]
  */

こんな感じになります。

========== tokenizer ==========
tokenizer = kuromoji_tokenizer_search
Elasticsearch|は|全文|検索|エンジン|です

========== tokenfilters ==========
tokenfilter = kuromoji_baseform
Elasticsearch|は|全文|検索|エンジン|です

tokenfilter = kuromoji_part_of_speech
Elasticsearch|全文|検索|エンジン

tokenfilter = cjk_width
Elasticsearch|全文|検索|エンジン

tokenfilter = stop
Elasticsearch|全文|検索|エンジン

tokenfilter = ja_stop
Elasticsearch|全文|検索|エンジン

tokenfilter = kuromoji_stemmer
Elasticsearch|全文|検索|エンジン

tokenfilter = lowercase
elasticsearch|全文|検索|エンジン

こういうところは、Solrの管理UIが便利だなぁと思うのですが、違うでしょうか…？

追記）
と書いていたら、@johtaniさんからツッコミをいただきました。

@kazuhira_r attributesで出力項目絞ると良いかと思いますー

2016-02-06 18:16:00 via twicca to @kazuhira_r

Analyze APIに、attributesというものがあるそうです。これで属性を絞れるとか。

で、ドキュメントを見ていて

ドキュメントに載ってない…よね…？(^^;URL

2016-02-06 18:25:27 via Twitter Web Client to @kazuhira_r

とか言ってたら

@kazuhira_r 次のページ

2016-02-06 19:07:08 via twicca to @kazuhira_r

なんともいえないツッコミをいただきました…。

というわけで、載ってますね。

Explain Analyze | Elasticsearch Reference [2.2] | Elastic

というわけで、試してみます。

今回は、attributesにpartOfSpeechとreadingを指定しました。tokenizerとtokenfilterの結果に現れる属性が、基本的な項目（token、start_offset、end_offset、type、position）と、指定したpartOfSpeech、readingのみになっています。ケースバイケースで、使い分けですかねー。

$ curl -XGET 'http://localhost:9200/myindex/_analyze?pretty&explain=true&analyzer=kuromoji_analyzer' -d '{ "text": "Elasticsearchは、全文検索エンジンです。", "attributes": ["partOfSpeech", "reading"] }'
{
  "detail" : {
    "custom_analyzer" : true,
    "charfilters" : [ ],
    "tokenizer" : {
      "name" : "kuromoji_tokenizer_search",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "は",
        "start_offset" : 13,
        "end_offset" : 14,
        "type" : "word",
        "position" : 1,
        "partOfSpeech" : "助詞-係助詞",
        "reading" : "ハ"
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      }, {
        "token" : "です",
        "start_offset" : 23,
        "end_offset" : 25,
        "type" : "word",
        "position" : 5,
        "partOfSpeech" : "助動詞",
        "reading" : "デス"
      } ]
    },
    "tokenfilters" : [ {
      "name" : "kuromoji_baseform",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "は",
        "start_offset" : 13,
        "end_offset" : 14,
        "type" : "word",
        "position" : 1,
        "partOfSpeech" : "助詞-係助詞",
        "reading" : "ハ"
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      }, {
        "token" : "です",
        "start_offset" : 23,
        "end_offset" : 25,
        "type" : "word",
        "position" : 5,
        "partOfSpeech" : "助動詞",
        "reading" : "デス"
      } ]
    }, {
      "name" : "kuromoji_part_of_speech",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    }, {
      "name" : "cjk_width",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    }, {
      "name" : "stop",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    }, {
      "name" : "ja_stop",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    }, {
      "name" : "kuromoji_stemmer",
      "tokens" : [ {
        "token" : "Elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    }, {
      "name" : "lowercase",
      "tokens" : [ {
        "token" : "elasticsearch",
        "start_offset" : 0,
        "end_offset" : 13,
        "type" : "word",
        "position" : 0,
        "partOfSpeech" : "名詞-固有名詞-組織",
        "reading" : null
      }, {
        "token" : "全文",
        "start_offset" : 15,
        "end_offset" : 17,
        "type" : "word",
        "position" : 2,
        "partOfSpeech" : "名詞-一般",
        "reading" : "ゼンブン"
      }, {
        "token" : "検索",
        "start_offset" : 17,
        "end_offset" : 19,
        "type" : "word",
        "position" : 3,
        "partOfSpeech" : "名詞-サ変接続",
        "reading" : "ケンサク"
      }, {
        "token" : "エンジン",
        "start_offset" : 19,
        "end_offset" : 23,
        "type" : "word",
        "position" : 4,
        "partOfSpeech" : "名詞-一般",
        "reading" : "エンジン"
      } ]
    } ]
  }
}

CLOVER🍀

That was when it all began.

Elasticsearch 2.2.0で追加されたAnalyze API（explain=true）を試す

通常のAnalyze API

Analyze API（explain=true）