Lucene索引删除和更新



Lucene索引的删除和更新,示例代码如下:

package com.ethan.index;
import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class IndexUtil {
private String[] ids = {“1″,”2″,”3″,”4″,”5″,”6″};
private String[] emails = {“11@qq.com”,”22@qq.com”,”33@126.com”,”43@yahoo.cn”,”54@gmail.com”,”65@qq.com”};
private String[] contents = {
“welcome to nba hot”,
“my name is ethan”,
“someone like you “,
“rolling in the deep, you like”,
“i like fast……..”,
“l like sports”
};

private int[] attachs = {2,3,1,5,4,6};
private String[] names = {“ethan”,”sara”,”michael”,”wade”,”lin”,”paul”};

private Directory directory = null;

public IndexUtil() {
try {
directory = FSDirectory.open(new File(“C:\\Users\\ETHAN\\workspace\\hellolucene\\index02″));
} catch (IOException e) {
e.printStackTrace();
}
}
public void index() {
IndexWriter writer = null;

try {
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)) );
Document doc = null;
for(int i=0;i<ids.length;i++) {
doc = new Document();
doc.add(new Field(“id”,ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
doc.add(new Field(“email”,emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));

doc.add(new Field(“content”,contents[i],Field.Store.NO,Field.Index.ANALYZED));
doc.add(new Field(“name”,names[i],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));

writer.addDocument(doc);
}
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(writer!=null) {
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}

}

public void query() {

try {
IndexReader reader = IndexReader.open(directory);
//被存储的
System.out.println(“numDocs: “+reader.numDocs());

//文档总量
System.out.println(“maxDocs: “+reader.maxDoc());
//删除的文档
System.out.println(“deleteDocs: “+reader.numDeletedDocs());;
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}

public void delete() {
IndexWriter writer = null;

try {
writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));

//参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
//这里删除id=1的文档,还会留在”回收站“。xxx.del
writer.deleteDocuments(new Term(“id”,”1″));
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(writer!=null) {
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}

}

public void undelete() {
//使用IndexReader进行恢复
IndexReader reader = null;
try {
//set readOnly=false
reader = IndexReader.open(directory,false);
reader.undeleteAll();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(reader!=null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}

}

public void forceDelete() {
IndexWriter writer = null;

try {
writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));


//强制优化,del文件就没了,回收站清空
writer.forceMergeDeletes();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(writer!=null) {
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}

/*
* 自己手动merge
* 多次创建索引,文件会增多,
* 比如 5次的话,5个id=1的
*
* merge后合并为n段
*/

public void merge() {
IndexWriter writer = null;
try {
writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));

//将索引合并为2段,这两段中的del文件会被清空
//3.5后不建议使用,开销大,lucene会根据情况自动处理
writer.forceMerge(2);
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(writer!=null) {
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}

/*
* 更新操作
*/
public void update() {
IndexWriter writer = null;
try {
writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));

//lucene没有提供更新方法,这里操作分为两步
//匹配后删除 和 添加新的

Document doc = new Document();
doc.add(new Field(“id”,”11″,Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
doc.add(new Field(“email”,emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));

doc.add(new Field(“content”,contents[0],Field.Store.NO,Field.Index.ANALYZED));
doc.add(new Field(“name”,names[0],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));

writer.updateDocument(new Term(“id”,”1″),doc);
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if(writer!=null) {
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
测试类(IndexTest.java):
package com.ethan.test;

import org.junit.Test;

import com.ethan.index.IndexUtil;

/**
 * Manual JUnit drivers for {@link IndexUtil}. Each test builds a fresh
 * {@code IndexUtil} and invokes one operation; nothing is asserted. The block
 * comments between the tests are counts noted by the original author.
 */
public class IndexTest {

    /** Runs {@code IndexUtil.index()}. */
    @Test
    public void testIndex() {
        new IndexUtil().index();
    }

    /*
     * numDocs: 24
     * maxDocs: 24
     * deleteDocs: 0
     */
    /** Runs {@code IndexUtil.query()}. */
    @Test
    public void testQuery() {
        new IndexUtil().query();
    }

    /*
     * numDocs: 20
     * maxDocs: 24
     * deleteDocs: 4 (id=1, 4 documents)
     */
    /** Runs {@code IndexUtil.delete()}. */
    @Test
    public void testDelete() {
        new IndexUtil().delete();
    }

    /*
     * numDocs: 7
     * maxDocs: 7
     * deleteDocs: 0
     */
    /** Runs {@code IndexUtil.undelete()}. */
    @Test
    public void testUnDelete() {
        new IndexUtil().undelete();
    }

    /*
     * numDocs: 6
     * maxDocs: 6(7)
     * deleteDocs: 0(1)
     */
    /** Runs {@code IndexUtil.forceDelete()}. */
    @Test
    public void testForceDelete() {
        new IndexUtil().forceDelete();
    }

    /*
     * After merge:
     * numDocs: 20
     * maxDocs: 21
     * deleteDocs: 1 (the merge was forced to 2 segments, so _0_1.del was not removed;
     * _0 is the first segment and is left alone, the later ones are merged into one segment)
     */
    /** Runs {@code IndexUtil.merge()}. */
    @Test
    public void testMerge() {
        new IndexUtil().merge();
    }

    /*
     * numDocs: 6
     * maxDocs: 7
     * deleteDocs: 1
     *
     * Delete, then add.
     */
    /** Runs {@code IndexUtil.update()}. */
    @Test
    public void testUpdate() {
        new IndexUtil().update();
    }
}

索引目录中各文件的含义:
_0.fnm: 保存域(field)的信息,即索引中有哪几个字段
_0.fdt, _0.fdx: 保存 Store.YES 的字段的值(.fdt 存放数据,.fdx 是其索引)
_0.frq: 词项在文档中出现的频率
_0.nrm: 存储评分信息(标准化因子,即权重)
_0.prx: 词项在文档中出现的位置信息
_0.tii, _0.tis: 存储词项字典及其索引信息
文档和域的概念:

 

文档相当于表中的一条记录,域相当于表中每一个字段

optimize() 已被弃用(3.5 起由 forceMerge() 取代),开销比较大
forceMergeDeletes() 强制把回收站的内容给删掉
当segment比较多时,lucene会自动优化处理