I want to use WikipediaTokenizer in lucene project - http://lucene.apache.org/java/3_0_2/api/contrib-wikipedia/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.html B
public class WikipediaTokenizerTest { static Logger logger = Logger.getLogger(WikipediaTokenizerTest.class); protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
public WikipediaTokenizer testSimple() throws Exception {
String text = "This is a [[Category:foo]]";
return new WikipediaTokenizer(new StringReader(text));
}
public static void main(String[] args){
WikipediaTokenizerTest wtt = new WikipediaTokenizerTest();
try {
WikipediaTokenizer x = wtt.testSimple();
logger.info(x.hasAttributes());
Token token = new Token();
int count = 0;
int numItalics = 0;
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
while (x.incrementToken() == true) {
logger.info("seen something");
}
} catch(Exception e){
logger.error("Exception while tokenizing Wiki Text: " + e.getMessage());
}
}