Join API
http://lucene.apache.org/core/4_4_0/join/index.html
まあ、RDBMS的なJOINを期待すると痛い目をみそうな感じですが、知っててもいいのかなぁ?
今回参考にしたのは、
Join API自体のパッケージの説明
http://lucene.apache.org/core/4_4_0/join/org/apache/lucene/search/join/package-summary.html
Grouping & Joining
http://www.slideshare.net/lucenerevolution/grouping-and-joining-in-lucenesolr
Query time joining in Lucene
http://blog.trifork.com/2012/01/22/query-time-joining-in-lucene/
です。Slideshareの資料は、Grouping APIを使う時にも参考にしましたね。
LuceneでのJOINの実現方法には、
- インデックスの作成時に、あらかじめJOINを見込んだDocumentを作成する
- Queryを投げる時にJOINする
の2通りの方法があります。
インデックス作成時にJOINするためには、Documentをちょっと変わった単位というか、ブロック区切りで作る必要があります。子のDocumentを登録して、ある単位で親のDocumentを登録してブロック化するみたい。Slideshareの資料の14、15ページあたりを見るとよいでしょう。
Queryを投げる時にJOINする場合は、インデックス作成時よりはまあRDBMSでやるJOINに近いイメージですね。まあ、使ってみればやっぱり感覚はだいぶ違いますが…。
では、Query実行時のJOINを使ってみましょう。
依存関係の定義。
build.sbt
// Project metadata for the Lucene Join API sample.
name := "lucene-join"

version := "0.0.1-SNAPSHOT"

scalaVersion := "2.10.2"

organization := "littlewings"

// lucene-analyzers-kuromoji is pulled in for the Japanese analyzer used by the sample;
// lucene-join is pulled in for the Join API itself.
libraryDependencies ++= Seq(
  "org.apache.lucene" % "lucene-analyzers-kuromoji" % "4.4.0",
  "org.apache.lucene" % "lucene-join" % "4.4.0"
)
今回は、こんな3種類の構造を持つDocumentを考えます。
Book(書籍)

| isbn13 | title | category-ref | price |
|---|---|---|---|

Book Detail(詳細)

| isbn13-ref | title | year | month |
|---|---|---|---|

Category(カテゴリ)

| category | explain |
|---|---|
BookとBook Detailをisbn13カラムとisbn13-refカラムで、CategoryとBookをcategoryカラムとcategory-refカラムでJOINするような感じの例を作成していきます。
import文。
import scala.collection.JavaConverters._ import org.apache.lucene.analysis.Analyzer import org.apache.lucene.analysis.ja.JapaneseAnalyzer import org.apache.lucene.document.{Document, Field, StringField, TextField} import org.apache.lucene.index.{DirectoryReader, IndexWriter, IndexWriterConfig, Term} import org.apache.lucene.search.{IndexSearcher, Query, Sort, SortField, TermQuery, TopFieldCollector} import org.apache.lucene.search.{BooleanClause, BooleanQuery, MatchAllDocsQuery, TermQuery} import org.apache.lucene.store.{Directory, RAMDirectory} import org.apache.lucene.util.Version import org.apache.lucene.search.join.{JoinUtil, ScoreMode}
Document作成、その他もろもろのコードです。JOINをする部分は、あとで書きます。
/**
 * Skeleton of the sample program: indexes Book, Book Detail and Category documents into
 * three separate in-memory directories. The parts that perform the join are placeholders.
 */
object LuceneJoin {
  def main(args: Array[String]): Unit = {
    val luceneVersion = Version.LUCENE_44
    val analyzer = new JapaneseAnalyzer(luceneVersion)

    // One RAMDirectory per document kind. The implicit wrapper at the bottom of this object
    // closes each directory once the body has run.
    for {
      bookDirectory <- new RAMDirectory
      detailDirectory <- new RAMDirectory
      categoryDirectory <- new RAMDirectory
    } {
      registryBooks(bookDirectory, luceneVersion, analyzer)
      registryBookDetails(detailDirectory, luceneVersion, analyzer)
      registryCategories(categoryDirectory, luceneVersion, analyzer)

      /** later **/
    }
  }

  // Placeholder: the body of the join is filled in further down.
  private def join(label: String, fromDirectory: Directory, toDirectory: Directory, fromQuery: Query, fromField: String, toField: String, luceneVersion: Version, analyzer: Analyzer): Unit =
    /** later **/

  /** Adds the six sample Book documents to `directory`. */
  private def registryBooks(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      indexWriter.addDocument(book("978-4894714991", "Effective Java 第2版", "java", 3780))
      indexWriter.addDocument(book("978-4774139906", "パーフェクトJava", "java", 3780))
      indexWriter.addDocument(book("978-4844330844", "Scalaスケーラブルプログラミング第2版", "scala", 4830))
      indexWriter.addDocument(book("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "scala", 3360))
      indexWriter.addDocument(book("978-4274069130", "プログラミングClojure 第2版", "clojure", 3570))
      indexWriter.addDocument(book("978-4774159911", "おいしいClojure入門", "clojure", 2919))
    }

  /** Builds a Book document; every field is stored, and only `title` is a TextField. */
  private def book(isbn13: String, title: String, category: String, price: Int): Document = {
    val document = new Document
    document.add(new StringField("isbn13", isbn13, Field.Store.YES))
    document.add(new TextField("title", title, Field.Store.YES))
    document.add(new StringField("category-ref", category, Field.Store.YES))
    // The price is indexed as its decimal string form.
    document.add(new StringField("price", price.toString, Field.Store.YES))
    document
  }

  /** Adds the six sample Book Detail documents to `directory`. */
  private def registryBookDetails(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      indexWriter.addDocument(bookDetail("978-4894714991", "Effective Java 第2版", "2008", "11"))
      indexWriter.addDocument(bookDetail("978-4774139906", "パーフェクトJava", "2009", "09"))
      indexWriter.addDocument(bookDetail("978-4844330844", "Scalaスケーラブルプログラミング第2版", "2011", "09"))
      indexWriter.addDocument(bookDetail("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "2012", "07"))
      indexWriter.addDocument(bookDetail("978-4274069130", "プログラミングClojure 第2版", "2013", "04"))
      indexWriter.addDocument(bookDetail("978-4774159911", "おいしいClojure入門", "2013", "09"))
    }

  /** Builds a Book Detail document; `isbn13-ref` holds the ISBN of the Book it describes. */
  private def bookDetail(isbn13: String, title: String, year: String, month: String): Document = {
    val document = new Document
    document.add(new StringField("isbn13-ref", isbn13, Field.Store.YES))
    document.add(new TextField("title", title, Field.Store.YES))
    document.add(new StringField("year", year, Field.Store.YES))
    document.add(new StringField("month", month, Field.Store.YES))
    document
  }

  /** Adds the three sample Category documents to `directory`. */
  private def registryCategories(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      indexWriter.addDocument(category("java", "Object Oriented Programming Language"))
      indexWriter.addDocument(category("scala", "OOP and Functionaly Programming Language"))
      indexWriter.addDocument(category("clojure", "Functionaly Programming Language"))
    }

  /** Builds a Category document with a `category` key and a free-text `explain` field. */
  private def category(category: String, explain: String): Document = {
    val document = new Document
    document.add(new StringField("category", category, Field.Store.YES))
    document.add(new TextField("explain", explain, Field.Store.YES))
    document
  }

  /**
   * Gives any AutoCloseable a `foreach`, so it can be used as a generator in a `for` loop
   * without `yield`: the body runs once with the resource, which is then closed even if
   * the body throws.
   */
  implicit class AutoCloseableWrapper[A <: AutoCloseable](val underlying: A) extends AnyVal {
    def foreach(fun: A => Unit): Unit =
      try {
        fun(underlying)
      } finally {
        underlying.close()
      }
  }
}
最初に、Directoryを作成してDocumentを登録してしまいます。
for { bookDirectory <- new RAMDirectory detailDirectory <- new RAMDirectory categoryDirectory <- new RAMDirectory } { registryBooks(bookDirectory, luceneVersion, analyzer) registryBookDetails(detailDirectory, luceneVersion, analyzer) registryCategories(categoryDirectory, luceneVersion, analyzer)
なお、今回はDocumentを格納するDirectoryをそれぞれ分けていますが、別に分けていなくても動作します。
それぞれDocumentを登録。
// Book
/**
 * Indexes the six sample Book documents into `directory`.
 *
 * Each row below is (isbn13, title, category-ref, price); rows are added in the order listed.
 */
private def registryBooks(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
  for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
    Seq(
      ("978-4894714991", "Effective Java 第2版", "java", 3780),
      ("978-4774139906", "パーフェクトJava", "java", 3780),
      ("978-4844330844", "Scalaスケーラブルプログラミング第2版", "scala", 4830),
      ("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "scala", 3360),
      ("978-4274069130", "プログラミングClojure 第2版", "clojure", 3570),
      ("978-4774159911", "おいしいClojure入門", "clojure", 2919)
    ).foreach { case (isbn13, title, categoryRef, price) =>
      indexWriter.addDocument(book(isbn13, title, categoryRef, price))
    }
  }

// Book Detail
/**
 * Indexes the six sample Book Detail documents into `directory`.
 *
 * Each row below is (isbn13-ref, title, year, month); rows are added in the order listed.
 */
private def registryBookDetails(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
  for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
    Seq(
      ("978-4894714991", "Effective Java 第2版", "2008", "11"),
      ("978-4774139906", "パーフェクトJava", "2009", "09"),
      ("978-4844330844", "Scalaスケーラブルプログラミング第2版", "2011", "09"),
      ("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "2012", "07"),
      ("978-4274069130", "プログラミングClojure 第2版", "2013", "04"),
      ("978-4774159911", "おいしいClojure入門", "2013", "09")
    ).foreach { case (isbn13, title, year, month) =>
      indexWriter.addDocument(bookDetail(isbn13, title, year, month))
    }
  }

// Category
/**
 * Indexes the three sample Category documents into `directory`.
 *
 * Each row below is (category, explain); rows are added in the order listed.
 */
private def registryCategories(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
  for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
    Seq(
      ("java", "Object Oriented Programming Language"),
      ("scala", "OOP and Functionaly Programming Language"),
      ("clojure", "Functionaly Programming Language")
    ).foreach { case (name, explain) =>
      indexWriter.addDocument(category(name, explain))
    }
  }
では、JOINを行うコードを。
/**
 * Runs `fromQuery` against the index in `fromDirectory`, joins the matches to the index in
 * `toDirectory` by matching `fromField` values against `toField` values, and prints `label`
 * followed by one line per hit (score, doc id and the stored field values of the hit).
 *
 * `luceneVersion` and `analyzer` are accepted but not used by this method.
 */
private def join(label: String,
                 fromDirectory: Directory,
                 toDirectory: Directory,
                 fromQuery: Query,
                 fromField: String,
                 toField: String,
                 luceneVersion: Version,
                 analyzer: Analyzer): Unit =
  for {
    fromReader <- DirectoryReader.open(fromDirectory)
    toReader <- DirectoryReader.open(toDirectory)
  } {
    val fromSearcher = new IndexSearcher(fromReader)
    val toSearcher = new IndexSearcher(toReader)

    // The join query is built by running fromQuery on the "from" side.
    val multipleValuesPerDoc = false
    val joinQuery =
      JoinUtil.createJoinQuery(fromField, multipleValuesPerDoc, toField, fromQuery, fromSearcher, ScoreMode.Max)

    // Collector flags, named after the corresponding TopFieldCollector.create parameters.
    val maxHits = 100
    val fillFields = true
    val trackDocScores = false
    val trackMaxScore = false
    val docsScoredInOrder = false
    val docCollector =
      TopFieldCollector.create(Sort.RELEVANCE, maxHits, fillFields, trackDocScores, trackMaxScore, docsScoredInOrder)

    // The join query itself is executed on the "to" side.
    toSearcher.search(joinQuery, docCollector)

    println(label)
    docCollector.topDocs.scoreDocs.foreach { hit =>
      val storedValues = toSearcher.doc(hit.doc).getFields.asScala.map(_.stringValue)
      println(s"Score,N[${hit.score}:${hit.doc}] : Doc => " + storedValues.mkString(" ", " | ", ""))
    }
  }
IndexReaderとIndexSearcherを作成します。
for { fromReader <- DirectoryReader.open(fromDirectory) toReader <- DirectoryReader.open(toDirectory) } { val fromSearcher = new IndexSearcher(fromReader) val toSearcher = new IndexSearcher(toReader)
DocumentごとにDirectoryは分けなくてもよいと書きましたが、ここで書いているfrom/toのReader/Searcherは、同一でもよいみたいです。
次に、JoinUtil#createJoinQueryメソッドを使って、Queryを作成します。
val joinQuery = JoinUtil.createJoinQuery(fromField,
multipleValuesPerDoc,
toField,
fromQuery,
fromSearcher,
ScoreMode.Max)
ここで、各パラメータの意味は
- fromField … JOINのFrom(親側)となるフィールドの名前
- multipleValuesPerDocument … fromFieldが、multivalued fieldかどうか
- toField … JOIN先(To)となるフィールドの名前
- fromQuery … Fromとなるインデックスを検索するためのQuery
- fromSearcher … fromQueryを実行するIndexSearcher
- scoreMode … fromQueryが戻したスコアを、JOIN先(To)のDocumentのスコアにどのようにマップするかを指示する(ScoreMode.Noneならスコアを使わない、Max/Total/Avgなら、同じ値でJOINされるFrom側のスコアの最大値/合計/平均を使う、という指定のようです)
となります。
あとは、toSearcherに対して、JoinUtil#createJoinQueryで作成したQueryを実行してもらいます。
val docCollector = TopFieldCollector.create(Sort.RELEVANCE, 100, true, false, false, false) toSearcher.search(joinQuery, docCollector)
結果は、TopDocsから普通に取りましょう。なお、TopFieldCollector.createの第4引数(trackDocScores)にfalseを渡しているため、後述の実行結果ではスコアがNaNと表示されます。
で、呼び出し元はこうなります。
BookとBook DetailをJOINする方。
// Select two Books by ISBN (either clause may match), then fetch the Book Detail
// documents whose isbn13-ref equals the isbn13 of a matched Book.
val bookAndDetailFromQuery = new BooleanQuery
for (isbn13 <- Seq("978-4844330844", "978-4274069130"))
  bookAndDetailFromQuery.add(new TermQuery(new Term("isbn13", isbn13)), BooleanClause.Occur.SHOULD)

join(
  label = "===== Join Book And Detail =====",
  fromDirectory = bookDirectory,
  toDirectory = detailDirectory,
  fromQuery = bookAndDetailFromQuery,
  fromField = "isbn13",
  toField = "isbn13-ref",
  luceneVersion = luceneVersion,
  analyzer = analyzer
)
まあ、fromQueryはなんでもいいのですが…。
実行結果。
===== Join Book And Detail =====
Score,N[NaN:2] : Doc => 978-4844330844 | Scalaスケーラブルプログラミング第2版 | 2011 | 09
Score,N[NaN:4] : Doc => 978-4274069130 | プログラミングClojure 第2版 | 2013 | 04
CategoryとBookをJOIN。
// Select two Categories by name (either clause may match), then fetch the Book
// documents whose category-ref equals the category of a matched Category.
val categoryAndBookFromQuery = new BooleanQuery
for (categoryName <- Seq("scala", "clojure"))
  categoryAndBookFromQuery.add(new TermQuery(new Term("category", categoryName)), BooleanClause.Occur.SHOULD)

join(
  label = "===== Join Category And Book =====",
  fromDirectory = categoryDirectory,
  toDirectory = bookDirectory,
  fromQuery = categoryAndBookFromQuery,
  fromField = "category",
  toField = "category-ref",
  luceneVersion = luceneVersion,
  analyzer = analyzer
)
実行結果。
===== Join Category And Book =====
Score,N[NaN:2] : Doc => 978-4844330844 | Scalaスケーラブルプログラミング第2版 | scala | 4830
Score,N[NaN:3] : Doc => 978-4798125411 | Scala逆引きレシピ (PROGRAMMER’S RECiPE) | scala | 3360
Score,N[NaN:4] : Doc => 978-4274069130 | プログラミングClojure 第2版 | clojure | 3570
Score,N[NaN:5] : Doc => 978-4774159911 | おいしいClojure入門 | clojure | 2919
どちらも、From(それぞれ、Book、Category)を検索するQueryを投げて、対応するToのDocumentを引っ張ってきています。
…お気付きかもしれませんが、From側のDocumentの内容は、toSearcher#searchの結果には現れません。toSearcher#docでDocumentを取得しているので、そりゃそうですよね。
Fromの検索結果を利用して、ToのDocumentを取得したい、という使い方になりそうですね。
最後は、今回のソース。
src/main/scala/LuceneJoin.scala
import scala.collection.JavaConverters._

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.ja.JapaneseAnalyzer
import org.apache.lucene.document.{Document, Field, StringField, TextField}
import org.apache.lucene.index.{DirectoryReader, IndexWriter, IndexWriterConfig, Term}
import org.apache.lucene.search.{IndexSearcher, Query, Sort, SortField, TermQuery, TopFieldCollector}
import org.apache.lucene.search.{BooleanClause, BooleanQuery, MatchAllDocsQuery, TermQuery}
import org.apache.lucene.store.{Directory, RAMDirectory}
import org.apache.lucene.util.Version
import org.apache.lucene.search.join.{JoinUtil, ScoreMode}

/**
 * Sample program for Lucene's query-time Join API.
 *
 * Indexes Book, Book Detail and Category documents into three separate in-memory
 * directories, then runs two joins (Book -> Book Detail, Category -> Book) and prints the
 * documents found on the "to" side of each join.
 */
object LuceneJoin {
  def main(args: Array[String]): Unit = {
    val luceneVersion = Version.LUCENE_44

    // The analyzer is a closeable resource like the directories, so it goes through the
    // same auto-closing wrapper instead of being left open.
    for {
      analyzer <- new JapaneseAnalyzer(luceneVersion)
      bookDirectory <- new RAMDirectory
      detailDirectory <- new RAMDirectory
      categoryDirectory <- new RAMDirectory
    } {
      registryBooks(bookDirectory, luceneVersion, analyzer)
      registryBookDetails(detailDirectory, luceneVersion, analyzer)
      registryCategories(categoryDirectory, luceneVersion, analyzer)

      // Select two Books by ISBN (either clause may match), then fetch the Book Detail
      // documents whose isbn13-ref equals the isbn13 of a matched Book.
      val bookAndDetailFromQuery = new BooleanQuery
      for (isbn13 <- Seq("978-4844330844", "978-4274069130"))
        bookAndDetailFromQuery.add(new TermQuery(new Term("isbn13", isbn13)), BooleanClause.Occur.SHOULD)

      join(
        label = "===== Join Book And Detail =====",
        fromDirectory = bookDirectory,
        toDirectory = detailDirectory,
        fromQuery = bookAndDetailFromQuery,
        fromField = "isbn13",
        toField = "isbn13-ref",
        luceneVersion = luceneVersion,
        analyzer = analyzer
      )

      // Select two Categories by name, then fetch the Book documents whose category-ref
      // equals the category of a matched Category.
      val categoryAndBookFromQuery = new BooleanQuery
      for (categoryName <- Seq("scala", "clojure"))
        categoryAndBookFromQuery.add(new TermQuery(new Term("category", categoryName)), BooleanClause.Occur.SHOULD)

      join(
        label = "===== Join Category And Book =====",
        fromDirectory = categoryDirectory,
        toDirectory = bookDirectory,
        fromQuery = categoryAndBookFromQuery,
        fromField = "category",
        toField = "category-ref",
        luceneVersion = luceneVersion,
        analyzer = analyzer
      )
    }
  }

  /**
   * Runs `fromQuery` against the index in `fromDirectory`, joins the matches to the index in
   * `toDirectory` by matching `fromField` values against `toField` values, and prints `label`
   * followed by one line per hit (score, doc id and the stored field values of the hit).
   *
   * `luceneVersion` and `analyzer` are accepted but not used by this method.
   */
  private def join(label: String,
                   fromDirectory: Directory,
                   toDirectory: Directory,
                   fromQuery: Query,
                   fromField: String,
                   toField: String,
                   luceneVersion: Version,
                   analyzer: Analyzer): Unit =
    for {
      fromReader <- DirectoryReader.open(fromDirectory)
      toReader <- DirectoryReader.open(toDirectory)
    } {
      val fromSearcher = new IndexSearcher(fromReader)
      val toSearcher = new IndexSearcher(toReader)

      // The join query is built by running fromQuery on the "from" side.
      val multipleValuesPerDoc = false
      val joinQuery =
        JoinUtil.createJoinQuery(fromField, multipleValuesPerDoc, toField, fromQuery, fromSearcher, ScoreMode.Max)

      // Collector flags, named after the corresponding TopFieldCollector.create parameters.
      // trackDocScores stays false, which is why the printed score is NaN.
      val maxHits = 100
      val fillFields = true
      val trackDocScores = false
      val trackMaxScore = false
      val docsScoredInOrder = false
      val docCollector =
        TopFieldCollector.create(Sort.RELEVANCE, maxHits, fillFields, trackDocScores, trackMaxScore, docsScoredInOrder)

      // The join query itself is executed on the "to" side.
      toSearcher.search(joinQuery, docCollector)

      println(label)
      docCollector.topDocs.scoreDocs.foreach { hit =>
        val storedValues = toSearcher.doc(hit.doc).getFields.asScala.map(_.stringValue)
        println(s"Score,N[${hit.score}:${hit.doc}] : Doc => " + storedValues.mkString(" ", " | ", ""))
      }
    }

  /**
   * Indexes the six sample Book documents into `directory`.
   *
   * Each row below is (isbn13, title, category-ref, price); rows are added in the order listed.
   */
  private def registryBooks(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      Seq(
        ("978-4894714991", "Effective Java 第2版", "java", 3780),
        ("978-4774139906", "パーフェクトJava", "java", 3780),
        ("978-4844330844", "Scalaスケーラブルプログラミング第2版", "scala", 4830),
        ("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "scala", 3360),
        ("978-4274069130", "プログラミングClojure 第2版", "clojure", 3570),
        ("978-4774159911", "おいしいClojure入門", "clojure", 2919)
      ).foreach { case (isbn13, title, categoryRef, price) =>
        indexWriter.addDocument(book(isbn13, title, categoryRef, price))
      }
    }

  /** Builds a Book document; every field is stored, and only `title` is a TextField. */
  private def book(isbn13: String, title: String, category: String, price: Int): Document = {
    val document = new Document
    Seq(
      new StringField("isbn13", isbn13, Field.Store.YES),
      new TextField("title", title, Field.Store.YES),
      new StringField("category-ref", category, Field.Store.YES),
      // The price is indexed as its decimal string form.
      new StringField("price", price.toString, Field.Store.YES)
    ).foreach(field => document.add(field))
    document
  }

  /**
   * Indexes the six sample Book Detail documents into `directory`.
   *
   * Each row below is (isbn13-ref, title, year, month); rows are added in the order listed.
   */
  private def registryBookDetails(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      Seq(
        ("978-4894714991", "Effective Java 第2版", "2008", "11"),
        ("978-4774139906", "パーフェクトJava", "2009", "09"),
        ("978-4844330844", "Scalaスケーラブルプログラミング第2版", "2011", "09"),
        ("978-4798125411", "Scala逆引きレシピ (PROGRAMMER’S RECiPE)", "2012", "07"),
        ("978-4274069130", "プログラミングClojure 第2版", "2013", "04"),
        ("978-4774159911", "おいしいClojure入門", "2013", "09")
      ).foreach { case (isbn13, title, year, month) =>
        indexWriter.addDocument(bookDetail(isbn13, title, year, month))
      }
    }

  /** Builds a Book Detail document; `isbn13-ref` holds the ISBN of the Book it describes. */
  private def bookDetail(isbn13: String, title: String, year: String, month: String): Document = {
    val document = new Document
    Seq(
      new StringField("isbn13-ref", isbn13, Field.Store.YES),
      new TextField("title", title, Field.Store.YES),
      new StringField("year", year, Field.Store.YES),
      new StringField("month", month, Field.Store.YES)
    ).foreach(field => document.add(field))
    document
  }

  /**
   * Indexes the three sample Category documents into `directory`.
   *
   * Each row below is (category, explain); rows are added in the order listed.
   */
  private def registryCategories(directory: Directory, luceneVersion: Version, analyzer: Analyzer): Unit =
    for (indexWriter <- new IndexWriter(directory, new IndexWriterConfig(luceneVersion, analyzer))) {
      Seq(
        ("java", "Object Oriented Programming Language"),
        ("scala", "OOP and Functionaly Programming Language"),
        ("clojure", "Functionaly Programming Language")
      ).foreach { case (name, explain) =>
        indexWriter.addDocument(category(name, explain))
      }
    }

  /** Builds a Category document with a `category` key and a free-text `explain` field. */
  private def category(category: String, explain: String): Document = {
    val document = new Document
    Seq(
      new StringField("category", category, Field.Store.YES),
      new TextField("explain", explain, Field.Store.YES)
    ).foreach(field => document.add(field))
    document
  }

  /**
   * Gives any AutoCloseable a `foreach`, so it can be used as a generator in a `for` loop
   * without `yield`: the body runs once with the resource, which is then closed even if
   * the body throws.
   */
  implicit class AutoCloseableWrapper[A <: AutoCloseable](val underlying: A) extends AnyVal {
    def foreach(fun: A => Unit): Unit =
      try {
        fun(underlying)
      } finally {
        underlying.close()
      }
  }
}