JapaneseAnalyzer／Tokenizer（Kuromoji）とCJKAnalyzer／Tokenizerを簡単に試すスクリプト

以前に書いたエントリ、

LuceneのAnalyzerで遊んでみる
http://d.hatena.ne.jp/Kazuhira/20130601/1370103317

をちょっと簡単にしてGroovyに移植したものです。自分が、使いそうな気がしたので…。

使うのは、JapaneseAnalyzer（Kuromoji）とCJKAnalyzerだけです。

引数に、単語分割したい文字列を取るスクリプト。
lucene-analyzer.groovy

@Grab('org.apache.lucene:lucene-core:4.3.1')
@Grab('org.apache.lucene:lucene-analyzers-kuromoji:4.3.1')
import org.apache.lucene.analysis.cjk.CJKAnalyzer
import org.apache.lucene.analysis.ja.JapaneseAnalyzer
import org.apache.lucene.analysis.ja.tokenattributes.*
import org.apache.lucene.analysis.tokenattributes.*
import org.apache.lucene.util.Version

// Lucene compatibility version handed to every analyzer constructed below.
def luceneVersion = Version.LUCENE_43

// The script takes exactly one argument: the text to split into tokens.
assert args.length == 1, 'トークン化する文字列を、引数でひとつ与えてください'

def input = args.first()

// Analyzers to compare, kept in the order in which they are run.
def japaneseAnalyzer = new JapaneseAnalyzer(luceneVersion)
def cjkAnalyzer = new CJKAnalyzer(luceneVersion)
def analyzers = [japaneseAnalyzer, cjkAnalyzer]

// Prints a banner naming the analyzer's class and the text about to be
// analyzed, then passes both values through unchanged as [analyzer, text]
// so the closure can sit at the head of a `>>` composition chain.
def printAnalyzerName = { analyzer, text ->
    def separator = '================================================================================'
    def analyzerClassName = analyzer.getClass().name

    println(separator)
    println("Analyzer Class => ${analyzerClassName}")
    println("Input Text => ${text}")

    return [analyzer, text]
}

// Asks the analyzer for a token stream over the text (read through a
// StringReader, with an empty field name) and returns [tokenStream, text].
def toTokenStream = { analyzer, text ->
    def tokenStream = analyzer.tokenStream("", new StringReader(text))
    return [tokenStream, text]
}

// Consumes a Lucene TokenStream and prints, for every token, its term text,
// its type and its part of speech, framed by separator lines.
//
// tokenStream - the stream to consume; it is always closed before returning,
//               even when reading from it fails
// text        - not used in the body; accepted so the closure can receive the
//               [tokenStream, text] pair produced by toTokenStream
def printTokenAttributes = { tokenStream, text ->
    try {
        def charTermAttr = tokenStream.addAttribute(CharTermAttribute.class)
        def typeAttr = tokenStream.addAttribute(TypeAttribute.class)
        def partOfSpeechAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class)

        println('================================================================================')

        // TokenStream contract: reset() before the first incrementToken().
        tokenStream.reset()

        while (tokenStream.incrementToken()) {
            println("""\
token: ${charTermAttr.toString()},
  type: $typeAttr.type,
  partOfSpeech: $partOfSpeechAttr.partOfSpeech""")
        }

        println('================================================================================')

        // end() is only reached after the stream was fully consumed.
        tokenStream.end()
    } finally {
        // Release the stream even if reset()/incrementToken()/end() threw.
        tokenStream.close()
    }
}

// Pipeline applied to each [analyzer, input] pair:
// print the banner, open a token stream, then dump every token.
def analyzePipeline = printAnalyzerName >> toTokenStream >> printTokenAttributes

analyzers.each { analyzer ->
    try {
        analyzePipeline([analyzer, input])
    } finally {
        // Analyzer is Closeable; release it once its output has been printed.
        analyzer.close()
    }
}

PartOfSpeechは、JapaneseAnalyzerの時しか出力されません。

あと、Tokenizer版も作ってみました。
lucene-tokenizer.groovy

@Grab('org.apache.lucene:lucene-core:4.3.1')
@Grab('org.apache.lucene:lucene-analyzers-kuromoji:4.3.1')
import org.apache.lucene.analysis.cjk.CJKTokenizer
import org.apache.lucene.analysis.ja.JapaneseTokenizer
import org.apache.lucene.analysis.ja.tokenattributes.*
import org.apache.lucene.analysis.tokenattributes.*
import org.apache.lucene.util.Version

// Lucene compatibility version; note that the tokenizer constructors at the
// bottom of this script do not reference it.
def luceneVersion = Version.LUCENE_43

// The script takes exactly one argument: the text to split into tokens.
assert args.length == 1, 'トークン化する文字列を、引数でひとつ与えてください'

def input = args.first()

// Prints a banner naming the tokenizer's class and the text it was built
// over, then returns the tokenizer alone so the next closure in a `>>`
// chain receives just the stream.
def printTokenizerName = { tokenizer, text ->
    def separator = '================================================================================'
    def tokenizerClassName = tokenizer.getClass().name

    println(separator)
    println("Tokenizer Class => ${tokenizerClassName}")
    println("Input Text => ${text}")

    return tokenizer
}

// Consumes a Lucene TokenStream and prints, for every token, its term text,
// its type and its part of speech, framed by separator lines.
//
// tokenStream - the stream to consume; it is always closed before returning,
//               even when reading from it fails
def printTokenAttributes = { tokenStream ->
    try {
        def charTermAttr = tokenStream.addAttribute(CharTermAttribute.class)
        def typeAttr = tokenStream.addAttribute(TypeAttribute.class)
        def partOfSpeechAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class)

        println('================================================================================')

        // TokenStream contract: reset() before the first incrementToken().
        tokenStream.reset()

        while (tokenStream.incrementToken()) {
            println("""\
token: ${charTermAttr.toString()},
  type: $typeAttr.type,
  partOfSpeech: $partOfSpeechAttr.partOfSpeech""")
        }

        println('================================================================================')

        // end() is only reached after the stream was fully consumed.
        tokenStream.end()
    } finally {
        // Release the stream even if reset()/incrementToken()/end() threw.
        tokenStream.close()
    }
}

// Build both tokenizers up front, each reading the input through its own
// StringReader. JapaneseTokenizer arguments after the reader, per the
// Lucene 4.3 constructor: user dictionary (none), discardPunctuation, mode.
def japaneseTokenizer = new JapaneseTokenizer(
    new StringReader(input),
    null,
    true,
    JapaneseTokenizer.Mode.SEARCH)
def cjkTokenizer = new CJKTokenizer(new StringReader(input))

// Print the banner for each tokenizer, then dump its tokens.
def describeAndDump = printTokenizerName >> printTokenAttributes

[[japaneseTokenizer, input], [cjkTokenizer, input]].each(describeAndDump)

こんな使い方。

まずは、Analyzer版。

$ groovy lucene-analyzer.groovy すもももももももものうち
================================================================================
Analyzer Class => org.apache.lucene.analysis.ja.JapaneseAnalyzer
Input Text => すもももももももものうち
================================================================================
token: すもも,
  type: word,
  partOfSpeech: 名詞-一般
token: もも,
  type: word,
  partOfSpeech: 名詞-一般
token: もも,
  type: word,
  partOfSpeech: 名詞-一般
================================================================================
================================================================================
Analyzer Class => org.apache.lucene.analysis.cjk.CJKAnalyzer
Input Text => すもももももももものうち
================================================================================
token: すも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もも,
  type: <DOUBLE>,
  partOfSpeech: null
token: もの,
  type: <DOUBLE>,
  partOfSpeech: null
token: のう,
  type: <DOUBLE>,
  partOfSpeech: null
token: うち,
  type: <DOUBLE>,
  partOfSpeech: null
================================================================================

$ groovy lucene-analyzer.groovy 今日もいい天気です
================================================================================
Analyzer Class => org.apache.lucene.analysis.ja.JapaneseAnalyzer
Input Text => 今日もいい天気です
================================================================================
token: 今日,
  type: word,
  partOfSpeech: 名詞-副詞可能
token: いい,
  type: word,
  partOfSpeech: 形容詞-自立
token: 天気,
  type: word,
  partOfSpeech: 名詞-一般
================================================================================
================================================================================
Analyzer Class => org.apache.lucene.analysis.cjk.CJKAnalyzer
Input Text => 今日もいい天気です
================================================================================
token: 今日,
  type: <DOUBLE>,
  partOfSpeech: null
token: 日も,
  type: <DOUBLE>,
  partOfSpeech: null
token: もい,
  type: <DOUBLE>,
  partOfSpeech: null
token: いい,
  type: <DOUBLE>,
  partOfSpeech: null
token: い天,
  type: <DOUBLE>,
  partOfSpeech: null
token: 天気,
  type: <DOUBLE>,
  partOfSpeech: null
token: 気で,
  type: <DOUBLE>,
  partOfSpeech: null
token: です,
  type: <DOUBLE>,
  partOfSpeech: null
================================================================================

続いて、Tokenizer版。

$ groovy lucene-tokenizer.groovy すもももももももものうち
================================================================================
Tokenizer Class => org.apache.lucene.analysis.ja.JapaneseTokenizer
Input Text => すもももももももものうち
================================================================================
token: すもも,
  type: word,
  partOfSpeech: 名詞-一般
token: も,
  type: word,
  partOfSpeech: 助詞-係助詞
token: もも,
  type: word,
  partOfSpeech: 名詞-一般
token: も,
  type: word,
  partOfSpeech: 助詞-係助詞
token: もも,
  type: word,
  partOfSpeech: 名詞-一般
token: の,
  type: word,
  partOfSpeech: 助詞-連体化
token: うち,
  type: word,
  partOfSpeech: 名詞-非自立-副詞可能
================================================================================
================================================================================
Tokenizer Class => org.apache.lucene.analysis.cjk.CJKTokenizer
Input Text => すもももももももものうち
================================================================================
token: すも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もも,
  type: double,
  partOfSpeech: null
token: もの,
  type: double,
  partOfSpeech: null
token: のう,
  type: double,
  partOfSpeech: null
token: うち,
  type: double,
  partOfSpeech: null
================================================================================

$ groovy lucene-tokenizer.groovy 今日もいい天気です
================================================================================
Tokenizer Class => org.apache.lucene.analysis.ja.JapaneseTokenizer
Input Text => 今日もいい天気です
================================================================================
token: 今日,
  type: word,
  partOfSpeech: 名詞-副詞可能
token: も,
  type: word,
  partOfSpeech: 助詞-係助詞
token: いい,
  type: word,
  partOfSpeech: 形容詞-自立
token: 天気,
  type: word,
  partOfSpeech: 名詞-一般
token: です,
  type: word,
  partOfSpeech: 助動詞
================================================================================
================================================================================
Tokenizer Class => org.apache.lucene.analysis.cjk.CJKTokenizer
Input Text => 今日もいい天気です
================================================================================
token: 今日,
  type: double,
  partOfSpeech: null
token: 日も,
  type: double,
  partOfSpeech: null
token: もい,
  type: double,
  partOfSpeech: null
token: いい,
  type: double,
  partOfSpeech: null
token: い天,
  type: double,
  partOfSpeech: null
token: 天気,
  type: double,
  partOfSpeech: null
token: 気で,
  type: double,
  partOfSpeech: null
token: です,
  type: double,
  partOfSpeech: null
================================================================================

CLOVER🍀

That was when it all began.

JapaneseAnalyzer／Tokenizer（Kuromoji）とCJKAnalyzer／Tokenizerを簡単に試すスクリプト