I have built an index in Lucene. Without specifying a query, I want to get a score (cosine similarity or some other distance measure) between two documents in the index.
If you don't need to store the documents in Lucene and just want to calculate the similarity between two documents, here is a faster approach (Scala, from my blog: http://chepurnoy.org/blog/2014/03/faster-cosine-similarity-between-two-dicuments-with-scala-and-lucene/ ):
/**
 * Tokenizes `content` with Lucene and counts how often each resulting term occurs.
 *
 * The text is run through a `StopAnalyzer` and the resulting token stream is wrapped
 * in an `EnglishMinimalStemFilter`; every token the stream emits is counted.
 *
 * @param content raw text to analyze
 * @return immutable map from term to its number of occurrences in `content`
 */
def extractTerms(content: String): Map[String, Int] = {
  val analyzer = new StopAnalyzer(Version.LUCENE_46)
  try {
    // "c" is only the field name handed to the analyzer; nothing is indexed here.
    val ts = new EnglishMinimalStemFilter(analyzer.tokenStream("c", content))
    try {
      val charTermAttribute = ts.addAttribute(classOf[CharTermAttribute])
      val counts = scala.collection.mutable.Map.empty[String, Int].withDefaultValue(0)
      ts.reset()
      while (ts.incrementToken()) {
        counts(charTermAttribute.toString) += 1
      }
      // Complete the TokenStream consumer workflow: end() after the last token,
      // close() (in finally) to release the underlying reader.
      ts.end()
      counts.toMap
    } finally {
      ts.close()
    }
  } finally {
    analyzer.close()
  }
}
/**
 * Cosine similarity between two term-frequency vectors.
 *
 * Each map is treated as a sparse vector indexed by term; a term missing from a map
 * has frequency 0. Cosine similarity is invariant under scaling of either vector, so
 * the raw counts are used directly without normalising by document length.
 *
 * @param t1 term -> frequency for the first document
 * @param t2 term -> frequency for the second document
 * @return dot(t1, t2) / (|t1| * |t2|); `0.0` when either vector has zero length
 *         (e.g. an empty map), instead of the `NaN` a 0/0 division would produce
 */
def similarity(t1: Map[String, Int], t2: Map[String, Int]): Double = {
  // Squared Euclidean norm, accumulated in Double so large counts cannot overflow Int.
  def squaredNorm(freqs: Map[String, Int]): Double =
    freqs.valuesIterator.foldLeft(0d) { (acc, f) => acc + f.toDouble * f }

  // Only terms present in both maps contribute to the dot product,
  // so it is enough to walk the smaller map and look up in the larger one.
  val (smaller, larger) = if (t1.size <= t2.size) (t1, t2) else (t2, t1)
  val dot = smaller.foldLeft(0d) { case (acc, (word, freq)) =>
    acc + freq.toDouble * larger.getOrElse(word, 0)
  }

  val norms = math.sqrt(squaredNorm(t1)) * math.sqrt(squaredNorm(t2))
  if (norms == 0d) 0d else dot / norms
}
So, to calculate the similarity between text1 and text2, just call similarity(extractTerms(text1), extractTerms(text2)).