lucene代码阅读指南、测试范例

来源:互联网 发布:淘宝天猫lee鼎汉店真假 编辑:程序博客网 时间:2024/06/12 01:09
Lucene 原理与代码分析完整版: http://forfuture1978.iteye.com/blog/691017  -- 力荐

Lucene介绍及源码剖析: http://javenstudio.org/blog/annotated-lucene  -- 核心IndexWriter

下载:Annotated+Lucene+.pdf: http://ishare.iask.sina.com.cn/f/24103589.html

阅读步骤:

1、了解检索的基本原理和概念

2、了解lucene的基本概念

3、熟悉lucene的索引文件格式 -- 关键

4、熟悉lucene的索引流程:具体代码的类层次较多,且引入不必要的设计模式致使代码阅读相对困难。基本思路:controler + model 封装索引链,实现多线程并发处理(数据不共享)。

5、熟悉lucene的搜索流程

6、了解lucene搜索语法解析器 和 熟悉分词

 

推荐资料深入剖析lucene的源码,非常有价值。光看文档,不够形象,大体看过文档后,建议结合源码理解文档内容。代码能让读者有大体的基本概念,但文档对源码细节的解释容易让读者“只见枝叶不见森林”,理解困难。根据文档作者提供的大体思路,结合实际源码,读起来更容易。

测试

测试对于了解lucene的工作原理、代码执行流程极有帮助,是阅读代码的重要辅助手段。

IndexerExample.java

/* * Compiler: javac -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar  IndexerExample.java   * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar  IndexerExample   * */import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.analysis.cn.ChineseAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.DateTools;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class IndexerExample {        private static void EnExample() throws Exception {        // Store the index on disk        Directory directory = FSDirectory.getDirectory("/tmp/testindex");        // Use standard analyzer        Analyzer analyzer = new StandardAnalyzer();        // Create IndexWriter object        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);        iwriter.setMaxFieldLength(25000);        // make a new, empty document        Document doc = new Document();        File f = new File("/tmp/test.txt");                // Add the path of the file as a field named "path".  Use a field that is        // indexed (i.e. searchable), but don't tokenize the field into words.        
doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));                String text = "This is the text to be indexed.";        doc.add(new Field("fieldname", text, Field.Store.YES,      Field.Index.TOKENIZED));        doc.add(new Field("name", text, Field.Store.YES,      Field.Index.TOKENIZED));                // Add the last modified date of the file a field named "modified".  Use        // a field that is indexed (i.e. searchable), but don't tokenize the field        // into words.        doc.add(new Field("modified",                    DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),                    Field.Store.YES, Field.Index.UN_TOKENIZED));        // Add the contents of the file to a field named "contents".  Specify a Reader,        // so that the text of the file is tokenized and indexed, but not stored.        // Note that FileReader expects the file to be in the system's default encoding.        // If that's not the case searching for special characters will fail.        doc.add(new Field("contents", new FileReader(f)));                iwriter.addDocument(doc);        iwriter.optimize();        iwriter.close();    }     private static void CnExample() throws Exception {        // Store the index on disk        Directory directory = FSDirectory.getDirectory("/tmp/testindex");        // Use chinese analyzer        Analyzer analyzer = new ChineseAnalyzer();        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());        wrapper.addAnalyzer("name", analyzer);                // Create IndexWriter object        IndexWriter iwriter = new IndexWriter(directory, wrapper, true);        iwriter.setMaxFieldLength(25000);        // make a new, empty document        Document doc = new Document();        File f = new File("/tmp/test.txt");                // Add the path of the file as a field named "path".  Use a field that is        // indexed (i.e. 
searchable), but don't tokenize the field into words.        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));                String text = "This is the text to be indexed.";        doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED));                String name = "2013春装新款女气质修身风衣大翻领双层大摆长款外套 系腰带";        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));                // Add the last modified date of the file a field named "modified".  Use        // a field that is indexed (i.e. searchable), but don't tokenize the field        // into words.        doc.add(new Field("modified",                    DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),                    Field.Store.YES, Field.Index.UN_TOKENIZED));        // Add the contents of the file to a field named "contents".  Specify a Reader,        // so that the text of the file is tokenized and indexed, but not stored.        // Note that FileReader expects the file to be in the system's default encoding.        // If that's not the case searching for special characters will fail.        doc.add(new Field("contents", new FileReader(f)));                iwriter.addDocument(doc);        iwriter.optimize();        iwriter.close();    }    public static void main(String[] args) throws Exception {        System.out.println("Start test: ");        if( args.length > 0){            CnExample();        }        else{            EnExample();        }        System.out.println("Index dir: /tmp/testindex");    }}

SearcherExample.java

/* * Compiler: javac -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar  SearcherExample.java   * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:http://www.cnblogs.com/ChineseSegmenter/chineseSegmenter.jar  SearcherExample *  */import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.StringReader;import java.util.Date;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.analysis.cn.ChineseAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.DateTools;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Searcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.Hits;import org.apache.lucene.queryParser.QueryParser;public class SearcherExample {     public static void main(String[] args) throws Exception {         if (args.length < 2) {             throw new Exception("Usage: java " + Searcher.class.getName()                     + "<index dir> <query> [cn]");         }         File indexDir = new File(args[0]);        String q = args[1];         boolean bCn = args.length > 2? 
true : false;        if (!indexDir.exists() || !indexDir.isDirectory()) {             throw new Exception(indexDir +                     " does not exist or is not a directory.");         }         search(indexDir, q, bCn);     }     public static void search(File indexDir, String q, boolean bCn)         throws Exception {         Directory fsDir = FSDirectory.getDirectory(indexDir, false);         IndexSearcher is = new IndexSearcher(fsDir);        Analyzer analyzer = new StandardAnalyzer();        if( bCn ){            analyzer = new ChineseAnalyzer();        }        QueryParser parser = new QueryParser( "name",  analyzer);        Query query = parser.parse(q);                 System.out.println("Query: " + query.toString());        long start = new Date().getTime();         Hits hits = is.search(query);        long end = new Date().getTime();         System.err.println("Found " + hits.length() +                 " document(s) (in " + (end - start) +                 " milliseconds) that matched query '" +                 q + "'");         for (int i = 0; i < hits.length(); i++) {             Document doc = hits.doc(i);             System.out.println( "HIT " + i + " :" + doc.get("name"));         }     } } 

中文分词可采用lucene自带的库(效果不好),也可自行封装分词器,核心就是封装分词Tokenizer。

package org.apache.lucene.analysis.cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tokenizer that delegates Chinese word segmentation to a JNISelecter
 * (presumably a JNI wrapper around a native segmenter — not shown here).
 * next() lazily reads the whole input, segments it once, then replays the
 * resulting tokens through an iterator.
 *
 * NOTE(review): this listing is truncated by the original post — readContent()
 * and segment() (and the closing brace) are omitted, so the class as shown
 * does not compile on its own.
 */
public class SnippetTermTokenizer extends Tokenizer {

    private StringBuffer buffer = new StringBuffer();
    private BufferedReader inputBuffer;
    private JNISelecter selecter; // core Chinese word-segmentation class
    private List<Token> tokenList = null;
    private List<String> phraseTokenList = null;
    private Iterator<Token> tokenIter = null; // non-null once segmentation ran

    public SnippetTermTokenizer(Reader reader, JNISelecter s) {
        inputBuffer = new BufferedReader(reader, 2048);
        selecter = s;
    }

    /**
     * Returns the next token, or null when the input is exhausted or
     * segmentation fails. First call triggers readContent() + segment();
     * later calls just advance the iterator.
     */
    public Token next() throws IOException {
        if (tokenIter != null) {
            if (tokenIter.hasNext()) {
                return tokenIter.next();
            } else {
                // finished reading input
                return null;
            }
        }
        // need to read content
        readContent();
        if (segment()) {
            // segmentation succeeded, iterator was created
            return tokenIter.next();
        }
        return null;
    }

    public void close() throws IOException {
        inputBuffer.close();
    }
    // segmentation-related code omitted (truncated in the original post)

 

 

<script type="text/javascript"><!--google_ad_client = "ca-pub-1944176156128447";/* cnblogs 首页横幅 */google_ad_slot = "5419468456";google_ad_width = 728;google_ad_height = 90;//--></script><script type="text/javascript" src="http://pagead2.googlesyndication.com/pagead/show_ads.js"></script>
原创粉丝点击