How to do query auto-completion/suggestions in Lucene?

后端 未结 5 1883
刺人心
刺人心 2020-11-27 09:35

I\'m looking for a way to do query auto-completion/suggestions in Lucene. I\'ve Googled around a bit and played around a bit, but all of the examples I\'ve seen seem to be s

5条回答
  •  暗喜
    暗喜 (楼主)
    2020-11-27 10:24

    In addition to the above (much appreciated) post re: c# conversion, should you be using .NET 3.5 you'll need to include the code for the EdgeNGramTokenFilter - or at least I did - using Lucene 2.9.2 - this filter is missing from the .NET version as far as I could tell. I had to go and find the .NET 4 version online in 2.9.3 and port back - hope this makes the procedure less painful for someone...

    Edit : Please also note that the array returned by the SuggestTermsFor() function is sorted by count ascending, you'll probably want to reverse it to get the most popular terms first in your list

    using System.IO;
    using System.Collections;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Tokenattributes;
    using Lucene.Net.Util;
    
    namespace Lucene.Net.Analysis.NGram
    {
    
    /**
     * Tokenizes the given token into n-grams of given size(s).
     * 

    * This {@link TokenFilter} create n-grams from the beginning edge or ending edge of a input token. *

    */ public class EdgeNGramTokenFilter : TokenFilter { public static Side DEFAULT_SIDE = Side.FRONT; public static int DEFAULT_MAX_GRAM_SIZE = 1; public static int DEFAULT_MIN_GRAM_SIZE = 1; // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified /** Specifies which side of the input the n-gram should be generated from */ public class Side { private string label; /** Get the n-gram from the front of the input */ public static Side FRONT = new Side("front"); /** Get the n-gram from the end of the input */ public static Side BACK = new Side("back"); // Private ctor private Side(string label) { this.label = label; } public string getLabel() { return label; } // Get the appropriate Side from a string public static Side getSide(string sideName) { if (FRONT.getLabel().Equals(sideName)) { return FRONT; } else if (BACK.getLabel().Equals(sideName)) { return BACK; } return null; } } private int minGram; private int maxGram; private Side side; private char[] curTermBuffer; private int curTermLength; private int curGramSize; private int tokStart; private TermAttribute termAtt; private OffsetAttribute offsetAtt; protected EdgeNGramTokenFilter(TokenStream input) : base(input) { this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute)); this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute)); } /** * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range * * @param input {@link TokenStream} holding the input to be tokenized * @param side the {@link Side} from which to chop off an n-gram * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) : base(input) { if (side == null) { throw new System.ArgumentException("sideLabel must be either front or back"); } if (minGram < 1) { throw new System.ArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new System.ArgumentException("minGram must not be greater than maxGram"); } this.minGram = minGram; this.maxGram = maxGram; this.side = side; this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute)); this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute)); } /** * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range * * @param input {@link TokenStream} holding the input to be tokenized * @param sideLabel the name of the {@link Side} from which to chop off an n-gram * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram) : this(input, Side.getSide(sideLabel), minGram, maxGram) { } public override bool IncrementToken() { while (true) { if (curTermBuffer == null) { if (!input.IncrementToken()) { return false; } else { curTermBuffer = (char[])termAtt.TermBuffer().Clone(); curTermLength = termAtt.TermLength(); curGramSize = minGram; tokStart = offsetAtt.StartOffset(); } } if (curGramSize <= maxGram) { if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit // grab gramSize chars from front or back int start = side == Side.FRONT ? 0 : curTermLength - curGramSize; int end = start + curGramSize; ClearAttributes(); offsetAtt.SetOffset(tokStart + start, tokStart + end); termAtt.SetTermBuffer(curTermBuffer, start, curGramSize); curGramSize++; return true; } } curTermBuffer = null; } } public override Token Next(Token reusableToken) { return base.Next(reusableToken); } public override Token Next() { return base.Next(); } public override void Reset() { base.Reset(); curTermBuffer = null; } } }

提交回复
热议问题