Lucene二(域选项、文档基本信息、索引的增删改查)

来源:互联网 发布:人工智能 开源 编辑:程序博客网 时间:2024/06/09 16:27

  先来看看域选项。域选项分为域存储选项和域索引选项,它们是在为Document添加Field时对该Field的描述。域存储选项即Field.Store.*:取YES时表示把这个域的内容完整地存储到索引文件中,方便之后还原;取NO时表示这个域的内容不存储到索引文件中,但这并不代表不能对它进行索引——存储和索引是两个独立的概念。域索引选项即Field.Index.*:取NO表示不进行索引;取ANALYZED表示进行分词并索引,适用于标题、内容等;取NOT_ANALYZED表示进行索引但不分词,如身份证号、姓名、id等,适用于精确搜索;取ANALYZED_NO_NORMS表示进行分词和索引但不存储norms信息(norms信息保存了索引时的加权(boost)和域长度归一化因子等用于评分的信息);取NOT_ANALYZED_NO_NORMS表示进行索引,但既不分词也不存储norms信息。

  最佳实践:

| Index | Store | 常见使用场景 |
| --- | --- | --- |
| NOT_ANALYZED_NO_NORMS | YES | 标识符(主键、文件名),电话,身份证,姓名,日期等 |
| ANALYZED | YES | 文档标题和摘要 |
| ANALYZED | NO | 文档正文 |
| NO | YES | 文档类型,数据库主键(不进行索引) |
| NOT_ANALYZED | NO | 隐藏关键字 |







以下是索引的操作代码:

package cn.liuys.lucene.index;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;


/**
 * Demonstrates the basic index life cycle with the Lucene 3.5 API: building an
 * index from a small in-memory data set, reading document counts, and
 * deleting, restoring, purging, merging and updating documents.
 *
 * <p>Every operation opens its own {@link IndexWriter} or {@link IndexReader}
 * on the shared {@link Directory} and closes it before returning. I/O failures
 * inside the operations are reported with {@code printStackTrace()} and not
 * rethrown, matching the tutorial style of the original code.
 */
public class IndexUtil {
    /** Index location used by the no-argument constructor. */
    private static final String DEFAULT_INDEX_PATH = "F:\\stady\\JAVA\\other\\Lucene\\test\\index02";

    // Sample data; element i of each array describes document i.
    private final String[] ids = {"1", "2", "3", "4", "5", "6"};
    private final String[] emails = {"aa@itat.org", "bb@itat.org", "cc@cc.org", "dd@sina.org", "ee@zttc.edu", "ff@itat.org"};
    private final String[] contents = {
        "welcome to visited the space,I like book",
        "hello boy, I like pingpeng ball",
        "my name is cc I like game",
        "I like football",
        "I like football and I like basketball too",
        "I like movie and swim"
    };
    // NOTE(review): dates and attachs are declared but never read in this class.
    private Date[] dates = null;
    private int[] attachs = {2, 3, 1, 4, 5, 5};
    private final String[] names = {"zhangsan", "lisi", "john", "jetty", "mike", "jake"};

    /** Directory holding the index; never null once construction succeeds. */
    private final Directory directory;

    /**
     * Opens the index at the default hard-coded location.
     *
     * @throws IllegalStateException if the index directory cannot be opened
     */
    public IndexUtil() {
        this(new File(DEFAULT_INDEX_PATH));
    }

    /**
     * Opens the index stored in the given file-system directory.
     *
     * @param indexDir location of the index on disk
     * @throws IllegalStateException if the index directory cannot be opened
     */
    public IndexUtil(File indexDir) {
        try {
            directory = FSDirectory.open(indexDir);
        } catch (IOException e) {
            // Fail fast: continuing with a null directory would only surface
            // later as a NullPointerException far away from the real cause.
            throw new IllegalStateException("Cannot open index directory: " + indexDir, e);
        }
    }

    /**
     * Rebuilds the index: removes every existing document, then adds one
     * document per entry of the sample data.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Start from an empty index so repeated runs do not duplicate documents.
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                writer.addDocument(buildDocument(ids[i], emails[i], contents[i], names[i]));
            }
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Prints basic document statistics obtained from an {@link IndexReader}:
     * the number of live documents, the maximum document number, and the
     * number of deleted documents.
     */
    public void query() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(directory);
            System.out.println("numDoc:" + reader.numDocs());
            System.out.println("maxDoc:" + reader.maxDoc());
            System.out.println("deleteDoc:" + reader.numDeletedDocs());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
    }

    /**
     * Deletes the document whose {@code id} term is {@code "1"}.
     *
     * <p>This is a soft delete, comparable to moving a file to the recycle
     * bin: the document is only marked as deleted and can still be restored
     * with {@link #unDelete()} or purged with {@link #forceDelete()}.
     */
    public void delete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // deleteDocuments accepts either a Query or a Term; a Term matches one exact value.
            writer.deleteDocuments(new Term("id", "1"));
            // Per the original author's observation: on the freshly built six-document
            // index, query() afterwards reports numDoc 5, maxDoc 6 and deleteDoc 1, and
            // a file ending in .del appears in the index directory.
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Restores all soft-deleted documents (empties the "recycle bin" back into
     * the index).
     */
    public void unDelete() {
        IndexReader reader = null;
        try {
            // Readers are read-only by default; undeleteAll modifies the index,
            // so the reader is opened with readOnly = false.
            reader = IndexReader.open(directory, false);
            reader.undeleteAll();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
    }

    /**
     * Permanently removes soft-deleted documents (empties the "recycle bin").
     */
    public void forceDelete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Forces the index segments to be merged. The original author advises
     * against routine use: it is expensive, and Lucene maintains the index
     * on its own.
     */
    public void forceMerge() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Merge the index down to at most two segments.
            writer.forceMerge(2);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Replaces the document whose {@code id} term is {@code "1"} with a new
     * document that has id {@code "1001"} and the first sample entry's email,
     * content and name.
     */
    public void update() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Per the original author: Lucene has no in-place update; updating
            // means deleting the matching documents and then adding the new one.
            Document doc = buildDocument("1001", emails[0], contents[0], names[0]);
            writer.updateDocument(new Term("id", "1"), doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Closes the underlying index directory. Call this once when the instance
     * is no longer needed; the other methods must not be used afterwards.
     */
    public void close() {
        try {
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens a writer on the index directory using a Lucene 3.5 StandardAnalyzer. */
    private IndexWriter openWriter() throws IOException {
        return new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    }

    /** Builds a document with the four fields (id, email, content, name) used by this index. */
    private Document buildDocument(String id, String email, String content, String name) {
        Document doc = new Document();
        doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("email", email, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The content is analyzed and indexed for searching but not stored.
        doc.add(new Field("content", content, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        return doc;
    }

    /** Closes the writer if it was opened, reporting (not rethrowing) any I/O failure. */
    private void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the reader if it was opened, reporting (not rethrowing) any I/O failure. */
    private void closeReader(IndexReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

0 0
原创粉丝点击