本篇文章使用的是Lucene4.2版本
首先创建索引分析器,建立IndexWriter对象
// Source directory (or single file) whose contents will be indexed.
File docDir = new File(filePath);
Directory dir = null;
try {
    // Open (or create) the on-disk index directory.
    dir = FSDirectory.open(new File(indexPath));
} catch (Exception e) {
    // Without an index directory nothing can be written; report the reason instead of failing silently.
    System.err.println("Cannot open index directory " + indexPath + ": " + e);
    return;
}
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // create a new index or append to an existing one
IndexWriter writer = new IndexWriter(dir, iwc);
try {
    indexDocs(writer, docDir);
} finally {
    // Closing the writer commits pending documents and releases the index write lock,
    // even when indexing aborts with an exception.
    writer.close();
}
对象docDir为你要建立索引的目录或文件,参数indexPath为你要保存索引文件的目录
/**
 * Recursively indexes {@code file}.
 *
 * <p>Directories are walked depth-first. For a regular file with a supported extension
 * (doc, docx, wps, xlsx, xls, ppt, pdf, txt, html, xml, java) a Lucene document is built with
 * the fields {@code path}, {@code filename}, {@code size}, {@code type}, {@code modify} and
 * {@code contents}, and is added to (or updated in) the index. Unreadable files and files with
 * any other extension are skipped silently.
 *
 * @param writer index writer that receives the documents
 * @param file   file or directory to index
 * @throws Exception if text extraction or writing to the index fails
 */
private void indexDocs(IndexWriter writer, File file) throws Exception {
    if (!file.canRead()) {
        return;
    }

    if (file.isDirectory()) {
        String[] files = file.list();
        // File.list() returns null when the directory cannot be listed; check before any use.
        if (files == null) {
            return;
        }
        for (int j = 0; j < files.length; j++) {
            System.out.println(j + "文件名为:" + files[j]);
        }
        for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
        }
        return;
    }

    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        // The file vanished or cannot be opened: skip it and continue with the others.
        return;
    }

    try {
        String contents = extractContents(file.getName(), fis);
        if (contents == null) {
            // Unsupported extension: nothing to index.
            return;
        }

        Document doc = new Document();
        doc.add(new StringField("path", file.getPath(), Field.Store.YES));
        doc.add(new TextField("filename", file.getName(), Field.Store.YES));
        doc.add(new TextField("size", file.length() + "", Field.Store.YES));
        doc.add(new TextField("type", "file", Field.Store.YES));
        doc.add(new LongField("modify", file.lastModified(), Field.Store.YES));
        doc.add(new TextField("contents", contents, Field.Store.YES));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // The path is the unique key: replace any document previously indexed for this file.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    } finally {
        fis.close();
    }
}

/**
 * Extracts the plain text of a file, choosing the extractor by file extension (case-insensitive).
 *
 * @param fileName name of the file, used only to determine the extension
 * @param fis      open stream positioned at the start of the file; not closed here
 * @return the extracted text, or {@code null} when the extension is not supported
 * @throws Exception if the underlying parser fails
 */
private String extractContents(String fileName, FileInputStream fis) throws Exception {
    String extention = "";
    int index = fileName.lastIndexOf('.');
    if (index > -1) {
        extention = fileName.substring(index + 1);
    }

    if (extention.equalsIgnoreCase("doc")) {
        WordExtractor wordExtractor = new WordExtractor(fis);
        return wordExtractor.getText();
    } else if (extention.equalsIgnoreCase("docx")) {
        XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(new XWPFDocument(fis));
        return xwpfWordExtractor.getText();
    } else if (extention.equalsIgnoreCase("wps")) {
        HWPFDocument hwpfDocument = new HWPFDocument(fis);
        Range range = hwpfDocument.getRange();
        return range.text();
    } else if (extention.equalsIgnoreCase("xlsx")) {
        return extractXlsxText(fis);
    } else if (extention.equalsIgnoreCase("xls")) {
        return extractXlsText(fis);
    } else if (extention.equalsIgnoreCase("ppt")) {
        return extractPptText(fis);
    } else if (extention.equalsIgnoreCase("pdf")) {
        return extractPdfText(fis);
    } else if (extention.equalsIgnoreCase("txt") || extention.equalsIgnoreCase("html")
            || extention.equalsIgnoreCase("xml") || extention.equalsIgnoreCase("java")) {
        return extractPlainText(fis);
    }
    return null;
}

/** Returns the cells of every sheet of an .xlsx workbook, tab-separated, one line per row. */
private String extractXlsxText(FileInputStream fis) throws Exception {
    XSSFWorkbook wb = new XSSFWorkbook(fis);
    StringBuilder sb = new StringBuilder();
    for (int sheetNum = 0; sheetNum < wb.getNumberOfSheets(); sheetNum++) {
        XSSFSheet sheet = wb.getSheetAt(sheetNum);
        if (sheet == null) {
            continue;
        }
        // getLastRowNum() is the 0-based index of the last row, so the bound is inclusive.
        for (int sheetRow = 0; sheetRow <= sheet.getLastRowNum(); sheetRow++) {
            XSSFRow row = sheet.getRow(sheetRow);
            if (row == null) {
                continue;
            }
            // getLastCellNum() is the last cell index plus one (-1 for a row without cells).
            int lastCellNum = row.getLastCellNum();
            for (int sheetCol = 0; sheetCol < lastCellNum; sheetCol++) {
                XSSFCell aCell = row.getCell(sheetCol);
                if (aCell == null) {
                    continue;
                }
                if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                    sb.append(aCell.getNumericCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
                    sb.append(aCell.getBooleanCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                    sb.append(aCell.getStringCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA) {
                    sb.append(aCell.getCellFormula()).append("\t");
                }
            }
            if (lastCellNum > 0) {
                sb.append("\n");
            }
        }
    }
    return sb.toString();
}

/** Returns the cells of every sheet of an .xls workbook, tab-separated, one line per row. */
private String extractXlsText(FileInputStream fis) throws Exception {
    POIFSFileSystem poifsFileSystem = new POIFSFileSystem(fis);
    HSSFWorkbook wb = new HSSFWorkbook(poifsFileSystem);
    StringBuilder sb = new StringBuilder();
    for (int sheetNum = 0; sheetNum < wb.getNumberOfSheets(); sheetNum++) {
        HSSFSheet sheet = wb.getSheetAt(sheetNum);
        if (sheet == null) {
            continue;
        }
        // getLastRowNum() is the 0-based index of the last row, so the bound is inclusive.
        for (int sheetRow = 0; sheetRow <= sheet.getLastRowNum(); sheetRow++) {
            HSSFRow row = sheet.getRow(sheetRow);
            if (row == null) {
                continue;
            }
            // getLastCellNum() is the last cell index plus one (-1 for a row without cells).
            int lastCellNum = row.getLastCellNum();
            for (int sheetCol = 0; sheetCol < lastCellNum; sheetCol++) {
                HSSFCell aCell = row.getCell(sheetCol);
                if (aCell == null) {
                    continue;
                }
                if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                    sb.append(aCell.getNumericCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
                    sb.append(aCell.getBooleanCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                    sb.append(aCell.getStringCellValue()).append("\t");
                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA) {
                    sb.append(aCell.getCellFormula()).append("\t");
                }
            }
            if (lastCellNum > 0) {
                sb.append("\n");
            }
        }
    }
    return sb.toString();
}

/** Returns the title and text runs of every slide of a .ppt file, one line per slide. */
private String extractPptText(FileInputStream fis) throws Exception {
    StringBuilder sb = new StringBuilder();
    SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
    Slide[] s = ss.getSlides();
    for (int i = 0; i < s.length; i++) {
        String title = s[i].getTitle();
        // A slide without a title yields null; do not index the literal text "null".
        if (title != null) {
            sb.append(title);
        }
        TextRun[] t = s[i].getTextRuns();
        for (int j = 0; j < t.length; j++) {
            sb.append(t[j].getText()).append("\t");
        }
        sb.append("\n");
    }
    return sb.toString();
}

/** Returns the text of a PDF file and releases the parsed document afterwards. */
private String extractPdfText(FileInputStream fis) throws Exception {
    PDFParser parser = new PDFParser(fis);
    parser.parse();
    PDDocument pdDocument = parser.getPDDocument();
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdDocument);
    } finally {
        // A PDDocument holds resources of its own and must be closed explicitly.
        pdDocument.close();
    }
}

/** Reads a text file as GBK, normalising every line terminator to '\n'. */
private String extractPlainText(FileInputStream fis) throws Exception {
    StringBuilder text = new StringBuilder();
    // The reader is not closed here; the caller closes the underlying stream.
    BufferedReader br = new BufferedReader(new InputStreamReader(fis, "GBK"));
    String data;
    while ((data = br.readLine()) != null) {
        text.append(data).append("\n");
    }
    return text.toString();
}
方法indexDocs(IndexWriter writer, File file)递归地为目录下的文件建立索引。从代码中可以看出,我们首先要创建Document对象,并通过Field对象将文件的属性(文件名、文件内容、文件路径等)添加到文档对象Document中。最后调用IndexWriter对象中的方法,将文档添加到索引中去。
搜索文档时,我们需要使用IndexReader对象将索引文件读取出来,并使用Query对象来进行索引的检索。
// Fields that the parsed query is matched against.
String[] fields = {"filename", "contents"};
// Open the index stored under indexPath and wrap the reader in a searcher.
Directory indexDirectory = FSDirectory.open(new File(indexPath));
IndexReader reader = DirectoryReader.open(indexDirectory);
IndexSearcher searcher = new IndexSearcher(reader);
// The parser analyzes query text with a StandardAnalyzer and expands it over all listed fields.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
MultiFieldQueryParser multiFieldQueryParser =
        new MultiFieldQueryParser(Version.LUCENE_42, fields, analyzer);
其中fields表示你要查找的域,即你要在哪些字段中进行关键字查找。这里使用MultiFieldQueryParser对象来进行多个字段的解析。
// Parse the search string `condition` into a Query object.
Query query = multiFieldQueryParser.parse(condition);
//System.out.println("parsed query: "+query.toString());
//System.out.println();
// Run the query, asking the searcher for at most 10 hits.
TopDocs results = searcher.search(query,10);
// Each ScoreDoc carries the id of one matching document.
ScoreDoc[] hits = results.scoreDocs;
创建Query查询对象,并调用IndexSearcher对象来进行查找。TopDocs表示相关度最高的文档集
// Load the document behind every hit.
for (int i = 0; i < hits.length; i++) {
    Document doc = searcher.doc(hits[i].doc);
    // The indexed data can be read back with doc.get("fieldName").
}
对于检索出的结果,Lucene的API完美地实现了关键字高亮显示等功能。
来源:oschina
链接:https://my.oschina.net/u/942785/blog/176313