1. 要求
环境:
Lucene 4.1版本/IKAnalyzer 2012 FF版本/mmseg4j 1.9版本
功能: 1).高亮查询演示
注意:
此篇文章开始,索引目录将不再使用示范目录,而是使用真实的数据。即LUCENE_INDEX_DIR = "C:\\lucene\\data"改到了LUCENE_INDEX_DIR = "C:\\solr\\news\\data\\index"。
2. 实现代码
package com.clzhang.sample.lucene;import java.io.*;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.Term;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.highlight.Fragmenter;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleSpanFragmenter;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.search.highlight.TokenSources;import org.apache.lucene.util.Version;import com.chenlb.mmseg4j.Dictionary;import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;import org.junit.Test;/** * 环境:Lucene 4.1版本/IKAnalyzer 2012 FF版本/mmseg4j 1.9版本 * 功能: * 1.高亮查询演示 * @author Administrator * */public class HighlightDemo { // mmseg4j字典路径 private static final String MMSEG4J_DICT_PATH = "C:\\solr\\news\\conf"; private static Dictionary dictionary = Dictionary.getInstance(MMSEG4J_DICT_PATH); // Lucene索引存放路径 private static final String LUCENE_INDEX_DIR = "C:\\solr\\news\\data\\index"; @Test public void testHighlighting() throws Exception { // 独立测试Highlighting的代码 String text = "台保钓人士拟起诉日当局 感谢大陆海监船驰援"; TermQuery query = new TermQuery(new Term("title", "当局")); TokenStream tokenStream = new ComplexAnalyzer(dictionary).tokenStream( "title", new StringReader(text)); QueryScorer scorer = new QueryScorer(query, "title"); Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); Highlighter highlighter = new Highlighter(scorer); 
highlighter.setTextFragmenter(fragmenter); String hlText = highlighter.getBestFragment(tokenStream, text); System.out.println(hlText); System.out.println("--------------------------"); } @Test public void doHighlightQuery() throws Exception { // 实例化IKAnalyzer分词器// Analyzer analyzer = new IKAnalyzer(); // 实例化mmseg4j分词器 Analyzer analyzer = new SimpleAnalyzer(dictionary); // 实例化搜索器 Directory directory = FSDirectory.open(new File(LUCENE_INDEX_DIR)); DirectoryReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); final String FIELD_NAME = "webTitle"; String keyword = "记者"; // 使用QueryParser查询分析器构造Query对象 QueryParser qp = new QueryParser(Version.LUCENE_41, FIELD_NAME, analyzer); Query query = qp.parse(keyword); // 搜索相似度最高的5条记录 TopDocs hits = searcher.search(query, 5); System.out.println("命中:" + hits.totalHits); // 高亮代码1 QueryScorer scorer = new QueryScorer(query, FIELD_NAME); // 下面是指定高亮代码样式的代码 SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("", ""); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, scorer); highlighter.setTextFragmenter( new SimpleSpanFragmenter(scorer)); // 输出结果 for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = searcher.doc(scoreDoc.doc); String title = doc.get(FIELD_NAME); // 高亮代码2 TokenStream stream = TokenSources.getAnyTokenStream( searcher.getIndexReader(), scoreDoc.doc, FIELD_NAME, doc, analyzer); String fragment = highlighter.getBestFragment(stream, title); System.out.println(fragment); } reader.close(); directory.close(); System.out.println("--------------------------"); }}
输出:
台保钓人士拟起诉日<B>当局</B> 感谢大陆海监船驰援
--------------------------
命中:125
浙江杭州一男子涉嫌殴打<EM>记者</EM>被警方抓获
领导快看;<EM>记者</EM>曝光!
[视频]节前聚焦烟花爆竹安全 居民楼内存花炮 <EM>记者</EM>举报无人监管 20130203
老夫看过<EM>记者</EM>关于肖某勒索的调查视频,可以说,“胁从犯罪”的证据极为明显——问题就在于,曾经处理方哦,算是结了案,再次处理,法理上有疑问
<EM>记者</EM>调查:重庆忠县一桩疑窦重生的受贿案(转载)
--------------------------