lucene代码阅读指南、测试范例
来源:互联网 发布:淘宝天猫lee鼎汉店真假 编辑:程序博客网 时间:2024/06/12 01:09
a href="http://forfuture1978.iteye.com/blog/691017">Lucene 原理与代码分析完整版 -- 力荐
Lucene介绍及源码剖析: http://javenstudio.org/blog/annotated-lucene -- 核心IndexWriter
下载:Annotated+Lucene+.pdf: http://ishare.iask.sina.com.cn/f/24103589.html
阅读步骤:
1、了解检索的基本原理和概念
2、了解lucene的基本概念
3、熟悉lucene的索引文件格式 -- 关键
4、熟悉lucene的索引流程:具体代码的类层次较多,且引入不必要的设计模式致使代码阅读相对困难。基本思路:controler + model 封装索引链,实现多线程并发处理(数据不共享)。
5、熟悉lucene的搜索流程
6、了解lucene搜索语法解析器 和 熟悉分词
推荐资料深入剖析lucene的源码,非常有价值。光看文档,不够形象;大体看过文档后,建议结合源码理解文档内容。代码能让读者有大体的基本概念,但文档对源码细节的解释容易让读者"只见枝叶不见森林",理解困难。根据文档作者提供的大体思路,结合实际源码,读起来更容易。
测试
测试对于了解lucene的工作原理、代码执行流程极有帮助,是阅读代码的重要辅助手段。
IndexerExample.java
/* * Compiler: javac -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar IndexerExample.java * Exec : java -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar IndexerExample * */import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.analysis.cn.ChineseAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.DateTools;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class IndexerExample { private static void EnExample() throws Exception { // Store the index on disk Directory directory = FSDirectory.getDirectory("/tmp/testindex"); // Use standard analyzer Analyzer analyzer = new StandardAnalyzer(); // Create IndexWriter object IndexWriter iwriter = new IndexWriter(directory, analyzer, true); iwriter.setMaxFieldLength(25000); // make a new, empty document Document doc = new Document(); File f = new File("/tmp/test.txt"); // Add the path of the file as a field named "path". Use a field that is // indexed (i.e. searchable), but don't tokenize the field into words. 
doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED)); String text = "This is the text to be indexed."; doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED)); doc.add(new Field("name", text, Field.Store.YES, Field.Index.TOKENIZED)); // Add the last modified date of the file a field named "modified". Use // a field that is indexed (i.e. searchable), but don't tokenize the field // into words. doc.add(new Field("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.UN_TOKENIZED)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in the system's default encoding. // If that's not the case searching for special characters will fail. doc.add(new Field("contents", new FileReader(f))); iwriter.addDocument(doc); iwriter.optimize(); iwriter.close(); } private static void CnExample() throws Exception { // Store the index on disk Directory directory = FSDirectory.getDirectory("/tmp/testindex"); // Use chinese analyzer Analyzer analyzer = new ChineseAnalyzer(); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer()); wrapper.addAnalyzer("name", analyzer); // Create IndexWriter object IndexWriter iwriter = new IndexWriter(directory, wrapper, true); iwriter.setMaxFieldLength(25000); // make a new, empty document Document doc = new Document(); File f = new File("/tmp/test.txt"); // Add the path of the file as a field named "path". Use a field that is // indexed (i.e. searchable), but don't tokenize the field into words. 
doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED)); String text = "This is the text to be indexed."; doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED)); String name = "2013春装新款女气质修身风衣大翻领双层大摆长款外套 系腰带"; doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED)); // Add the last modified date of the file a field named "modified". Use // a field that is indexed (i.e. searchable), but don't tokenize the field // into words. doc.add(new Field("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.UN_TOKENIZED)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in the system's default encoding. // If that's not the case searching for special characters will fail. doc.add(new Field("contents", new FileReader(f))); iwriter.addDocument(doc); iwriter.optimize(); iwriter.close(); } public static void main(String[] args) throws Exception { System.out.println("Start test: "); if( args.length > 0){ CnExample(); } else{ EnExample(); } System.out.println("Index dir: /tmp/testindex"); }}
SearcherExample.java
/* * Compiler: javac -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar SearcherExample.java * Exec : java -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar SearcherExample * */import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.StringReader;import java.util.Date;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.analysis.cn.ChineseAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.DateTools;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Searcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.Hits;import org.apache.lucene.queryParser.QueryParser;public class SearcherExample { public static void main(String[] args) throws Exception { if (args.length < 2) { throw new Exception("Usage: java " + Searcher.class.getName() + "<index dir> <query> [cn]"); } File indexDir = new File(args[0]); String q = args[1]; boolean bCn = args.length > 2? 
true : false; if (!indexDir.exists() || !indexDir.isDirectory()) { throw new Exception(indexDir + " does not exist or is not a directory."); } search(indexDir, q, bCn); } public static void search(File indexDir, String q, boolean bCn) throws Exception { Directory fsDir = FSDirectory.getDirectory(indexDir, false); IndexSearcher is = new IndexSearcher(fsDir); Analyzer analyzer = new StandardAnalyzer(); if( bCn ){ analyzer = new ChineseAnalyzer(); } QueryParser parser = new QueryParser( "name", analyzer); Query query = parser.parse(q); System.out.println("Query: " + query.toString()); long start = new Date().getTime(); Hits hits = is.search(query); long end = new Date().getTime(); System.err.println("Found " + hits.length() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "'"); for (int i = 0; i < hits.length(); i++) { Document doc = hits.doc(i); System.out.println( "HIT " + i + " :" + doc.get("name")); } } }
中文分词可采用lucene自带的库(但效果不好),也可以自行封装分词器,核心就是封装一个分词Tokenizer。
package org.apache.lucene.analysis.cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tokenizer that reads the whole input, runs it through a native Chinese
 * word segmenter (JNISelecter), and replays the resulting tokens one by one.
 */
public class SnippetTermTokenizer extends Tokenizer {

    // Accumulates the raw text read from the input reader.
    private StringBuffer buffer = new StringBuffer();
    private BufferedReader inputBuffer;
    private JNISelecter selecter; // core Chinese word-segmentation engine (JNI)
    private List<Token> tokenList = null;
    private List<String> phraseTokenList = null;
    // Non-null once segmentation has run; drained by next().
    private Iterator<Token> tokenIter = null;

    public SnippetTermTokenizer(Reader reader, JNISelecter s) {
        inputBuffer = new BufferedReader(reader, 2048);
        selecter = s;
    }

    /**
     * Returns the next segmented token, or null once the input is exhausted.
     * The first call reads and segments the entire input; later calls just
     * drain the token iterator.
     *
     * @throws IOException if reading the underlying input fails
     */
    public Token next() throws IOException {
        if (tokenIter != null) {
            if (tokenIter.hasNext()) {
                return tokenIter.next();
            } else {
                // finished replaying the segmented tokens
                return null;
            }
        }
        // First call: slurp the input, then segment it.
        readContent();
        // FIX: guard with hasNext() — the original called tokenIter.next()
        // unconditionally after segment(), which throws NoSuchElementException
        // when segmentation produces an empty token list.
        if (segment() && tokenIter.hasNext()) {
            return tokenIter.next();
        }
        return null;
    }

    public void close() throws IOException {
        inputBuffer.close();
    }

    // segmentation details (readContent(), segment(), ...) omitted in the post
}
<script type="text/javascript"><!--google_ad_client = "ca-pub-1944176156128447";/* cnblogs 首页横幅 */google_ad_slot = "5419468456";google_ad_width = 728;google_ad_height = 90;//--></script><script type="text/javascript" src="http://pagead2.googlesyndication.com/pagead/show_ads.js"></script>
- lucene代码阅读指南、测试范例
- 阅读、测试代码的方法
- 代码范例
- 【阅读】 软技能——代码之外的生存指南
- 安全测试模式范例
- MyBatis测试范例
- lucenc-solr-4.9.1
- 代码范例 - ArrayUtils
- JAVA代码注释范例
- JAVA代码注释范例
- NSDate常用代码范例
- NSDate常用代码范例
- CloudBox范例代码:CloudLED
- [Android]照相机范例代码
- NSDate常用代码范例
- NSDate常用代码范例
- java同步代码范例
- NSDate常用代码范例
- 一个存储过程中的小问题
- MySQL & Entity Framework Code First 数据表大小写的问题
- Android的touchEvent的消费过程
- [原]用GitBlit 和 VS GitSourceControlProvider 搭建基于 Http(s) 的 Git 工作平台
- JQuery常用标签
- lucene代码阅读指南、测试范例
- ACM之遗失的袜子
- 结构体单步调试和撞错体验
- 2013ACM多校联合(1)_CSUST
- linux动态链接库
- ACM之车位选择
- cassandra节点异常数据处理——HintedHandOff
- 框架和工具类
- -bash: scp: command not found问题解决