From 91eefb5f00192c31f0fc26f0215556b808791785 Mon Sep 17 00:00:00 2001 From: hankcs Date: Thu, 22 Oct 2015 12:30:21 +0800 Subject: [PATCH] =?UTF-8?q?=E6=BC=94=E7=A4=BA=E9=AB=98=E4=BA=AE=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E7=BB=93=E6=9E=9C=EF=BC=8C=E8=A7=A3=E5=86=B3=EF=BC=9A?= =?UTF-8?q?https://github.com/hankcs/HanLP/issues/74?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + pom.xml | 10 +- .../com/hankcs/lucene/SegmentWrapper.java | 10 ++ .../com/hankcs/lucene/HighLighterTest.java | 169 ++++++++++++++++++ 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 src/test/java/com/hankcs/lucene/HighLighterTest.java diff --git a/.gitignore b/.gitignore index 32858aa..08ef410 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* +/.idea diff --git a/pom.xml b/pom.xml index bbc00df..9acd0b1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.hankcs.nlp hanlp-solr-plugin - 1.0.1 + 1.0.2 hanlp-solr-plugin https://github.com/hankcs/HanLP @@ -37,6 +37,14 @@ test + + org.apache.lucene + lucene-highlighter + ${lucene.version} + test + + + org.apache.lucene lucene-core diff --git a/src/main/java/com/hankcs/lucene/SegmentWrapper.java b/src/main/java/com/hankcs/lucene/SegmentWrapper.java index 0f819ad..5dfb5a4 100644 --- a/src/main/java/com/hankcs/lucene/SegmentWrapper.java +++ b/src/main/java/com/hankcs/lucene/SegmentWrapper.java @@ -32,6 +32,10 @@ public class SegmentWrapper * termArray下标 */ int index; + /** + * term的偏移量,由于wrapper是按行读取的,必须对term.offset做一个校正 + */ + int offset; public SegmentWrapper(BufferedReader br, Segment segment) { @@ -49,6 +53,7 @@ public void reset(BufferedReader br) this.br = br; termArray = null; index = 0; + offset = 0; } public Term next() throws IOException @@ -58,12 +63,17 @@ public Term next() throws IOException while (isBlank(line)) { if (line == null) return null; + offset += line.length() + 1; line = br.readLine(); } List termList = segment.seg(line); if (termList.size() == 0) return null; termArray = termList.toArray(new Term[0]); + for (Term term : termArray) + { + term.offset += offset; + } index = 0; return termArray[index++]; diff --git a/src/test/java/com/hankcs/lucene/HighLighterTest.java b/src/test/java/com/hankcs/lucene/HighLighterTest.java new file mode 100644 index 0000000..11e8c6c --- /dev/null +++ b/src/test/java/com/hankcs/lucene/HighLighterTest.java @@ -0,0 +1,169 @@ +/* + * + * He Han + * me@hankcs.com + * 2015/10/22 11:37 + * + * + * Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/ + * This source is subject to Hankcs. Please contact Hankcs to get more information. + * + */ +package com.hankcs.lucene; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.*; +import org.apache.lucene.search.highlight.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; + +/** + * 演示高亮搜索结果 + * @author hankcs + * + */ +public class HighLighterTest extends TestCase +{ + + public void testHightlight() throws Exception + { + // Lucene Document的主要域名 + String fieldName = "text"; + + // 实例化Analyzer分词器 + Analyzer analyzer = new HanLPAnalyzer(); + + Directory directory = null; + IndexWriter iwriter; + IndexReader ireader = null; + IndexSearcher isearcher; + try + { + //索引过程********************************** + //建立内存索引对象 + directory = new RAMDirectory(); + + //配置IndexWriterConfig + IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer); + iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); + iwriter = new IndexWriter(directory, iwConfig); + { + // 加入一个文档 + Document doc = new Document(); + doc.add(new TextField(fieldName, "我白天是一名语言学习者,晚上是一名初级码农。空的时候喜欢看算法和应用数学书,也喜欢悬疑推理小说,ACG方面喜欢型月、轨迹。喜欢有思想深度的事物,讨厌急躁、拜金与安逸的人。目前在魔都某女校学习,这是我的个人博客。闻道有先后,术业有专攻,请多多关照。你喜欢写代码吗?", Field.Store.YES)); + doc.add(new TextField("title", "关于hankcs", Field.Store.YES)); + iwriter.addDocument(doc); + } + { + // 再加入一个 + Document doc = new Document(); + doc.add(new TextField(fieldName, "\n\n \n程序员喜欢黑夜", Field.Store.YES)); + doc.add(new TextField("title", "关于程序员", Field.Store.YES)); + iwriter.addDocument(doc); + } + iwriter.close(); + + //搜索过程********************************** + //实例化搜索器 + ireader = DirectoryReader.open(directory); + isearcher = new IndexSearcher(ireader); + + String keyword = "喜欢"; + //使用QueryParser查询分析器构造Query对象 + QueryParser qp = new QueryParser(fieldName, analyzer); + Query query = qp.parse(keyword); + System.out.println("Query = " + query); + + //搜索相似度最高的5条记录 + TopDocs topDocs = isearcher.search(query, 5); + System.out.println("命中:" + topDocs.totalHits); + //输出结果 + ScoreDoc[] scoreDocs = topDocs.scoreDocs; + + for (int i = 0; i < Math.min(5, scoreDocs.length); ++i) + { + Document targetDoc = isearcher.doc(scoreDocs[i].doc); + System.out.print(targetDoc.getField("title").stringValue()); + System.out.println(" , " + scoreDocs[i].score); + + String text = targetDoc.get(fieldName); + System.out.println(displayHtmlHighlight(query, analyzer, fieldName, text, 200)); + } + } + catch (CorruptIndexException e) + { + e.printStackTrace(); + } + catch (LockObtainFailedException e) + { + e.printStackTrace(); + } + catch (IOException e) + { + e.printStackTrace(); + } + catch (ParseException e) + { + e.printStackTrace(); + } + catch (InvalidTokenOffsetsException e) + { + e.printStackTrace(); + } + finally + { + if (ireader != null) + { + try + { + ireader.close(); + } + catch (IOException e) + { + e.printStackTrace(); + } + } + if (directory != null) + { + try + { + directory.close(); + } + catch (IOException e) + { + e.printStackTrace(); + } + } + } + } + + /** + * 获取高亮显示结果的html代码 + * @param query 查询 + * @param analyzer 分词器 + * @param fieldName 域名 + * @param fieldContent 域内容 + * @param fragmentSize 结果的长度(不含html标签长度) + * @return 结果(一段html代码) + * @throws IOException + * @throws InvalidTokenOffsetsException + */ + static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName, String fieldContent, int fragmentSize) throws IOException, InvalidTokenOffsetsException + { + //创建一个高亮器 + Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""), new QueryScorer(query)); + Fragmenter fragmenter = new SimpleFragmenter(fragmentSize); + highlighter.setTextFragmenter(fragmenter); + return highlighter.getBestFragment(analyzer, fieldName, fieldContent); + } +}