Skip to content

Commit

Permalink
演示高亮搜索结果,解决:hankcs/HanLP#74
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Oct 22, 2015
1 parent 8c469d0 commit 91eefb5
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
/.idea
10 changes: 9 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.hankcs.nlp</groupId>
<artifactId>hanlp-solr-plugin</artifactId>
<version>1.0.1</version>
<version>1.0.2</version>

<name>hanlp-solr-plugin</name>
<url>https://github.com/hankcs/HanLP</url>
Expand Down Expand Up @@ -37,6 +37,14 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene.version}</version>
<scope>test</scope>
</dependency>


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/com/hankcs/lucene/SegmentWrapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public class SegmentWrapper
* termArray下标
*/
int index;
/**
* term的偏移量,由于wrapper是按行读取的,必须对term.offset做一个校正
*/
int offset;

public SegmentWrapper(BufferedReader br, Segment segment)
{
Expand All @@ -49,6 +53,7 @@ public void reset(BufferedReader br)
this.br = br;
termArray = null;
index = 0;
offset = 0;
}

public Term next() throws IOException
Expand All @@ -58,12 +63,17 @@ public Term next() throws IOException
while (isBlank(line))
{
if (line == null) return null;
offset += line.length() + 1;
line = br.readLine();
}

List<Term> termList = segment.seg(line);
if (termList.size() == 0) return null;
termArray = termList.toArray(new Term[0]);
for (Term term : termArray)
{
term.offset += offset;
}
index = 0;

return termArray[index++];
Expand Down
169 changes: 169 additions & 0 deletions src/test/java/com/hankcs/lucene/HighLighterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>[email protected]</email>
* <create-date>2015/10/22 11:37</create-date>
*
* <copyright file="HighLighterTest.java" company="码农场">
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.lucene;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Demonstrates highlighting of search results: indexes two small documents
 * in memory with {@link HanLPAnalyzer}, searches them for a keyword and
 * prints each hit together with an HTML-highlighted fragment of its text.
 *
 * @author hankcs
 */
public class HighLighterTest extends TestCase
{

    /**
     * Indexes two documents, queries them and prints the highlighted hits.
     *
     * Any failure while indexing, parsing, searching or highlighting
     * propagates to JUnit so that the test is reported as failed instead of
     * being printed and silently ignored.
     *
     * @throws Exception if indexing, query parsing, searching or highlighting fails
     */
    public void testHightlight() throws Exception
    {
        // Name of the main field of the Lucene documents
        String fieldName = "text";

        // The analyzer is shared by indexing, query parsing and highlighting
        Analyzer analyzer = new HanLPAnalyzer();

        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        IndexSearcher isearcher;
        try
        {
            // ---------------- Indexing ----------------
            // Build the index in memory
            directory = new RAMDirectory();

            // Configure the IndexWriter
            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
            iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);
            {
                // Add the first document
                Document doc = new Document();
                doc.add(new TextField(fieldName, "我白天是一名语言学习者,晚上是一名初级码农。空的时候喜欢看算法和应用数学书,也喜欢悬疑推理小说,ACG方面喜欢型月、轨迹。喜欢有思想深度的事物,讨厌急躁、拜金与安逸的人。目前在魔都某女校学习,这是我的个人博客。闻道有先后,术业有专攻,请多多关照。你喜欢写代码吗?", Field.Store.YES));
                doc.add(new TextField("title", "关于hankcs", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            {
                // Add a second document whose text starts with blank lines
                Document doc = new Document();
                doc.add(new TextField(fieldName, "\n\n \n程序员喜欢黑夜", Field.Store.YES));
                doc.add(new TextField("title", "关于程序员", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            // Close explicitly so that a failure here fails the test;
            // clear the reference so that the finally block does not close it twice.
            iwriter.close();
            iwriter = null;

            // ---------------- Searching ----------------
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "喜欢";
            // Build the Query with the classic QueryParser
            QueryParser qp = new QueryParser(fieldName, analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Fetch the 5 best-scoring hits
            TopDocs topDocs = isearcher.search(query, 5);
            System.out.println("命中:" + topDocs.totalHits);
            // Print the results
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            for (int i = 0; i < Math.min(5, scoreDocs.length); ++i)
            {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.print(targetDoc.getField("title").stringValue());
                System.out.println(" , " + scoreDocs[i].score);

                String text = targetDoc.get(fieldName);
                System.out.println(displayHtmlHighlight(query, analyzer, fieldName, text, 200));
            }
        }
        finally
        {
            // Release everything that was opened, even when the test body failed.
            closeQuietly(iwriter);
            closeQuietly(ireader);
            closeQuietly(directory);
            closeQuietly(analyzer);
        }
    }

    /**
     * Closes a resource during cleanup.
     *
     * An {@link IOException} raised by {@code close()} is only printed, so
     * that a cleanup problem does not replace the exception (if any) that
     * made the test fail in the first place.
     *
     * @param resource the resource to close; {@code null} is ignored
     */
    private static void closeQuietly(java.io.Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Builds the HTML code of the highlighted search result.
     *
     * Matches are wrapped in {@code <font color='red'>} ... {@code </font>}.
     *
     * @param query        the query
     * @param analyzer     the analyzer
     * @param fieldName    the field name
     * @param fieldContent the field content
     * @param fragmentSize size passed to the {@link SimpleFragmenter} (length of the result, HTML tags not counted)
     * @return whatever {@code Highlighter.getBestFragment} returns for the content (a piece of HTML code)
     * @throws IOException                  propagated from the highlighter
     * @throws InvalidTokenOffsetsException propagated from the highlighter
     */
    static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName, String fieldContent, int fragmentSize) throws IOException, InvalidTokenOffsetsException
    {
        // Create a highlighter that scores fragments against the query
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<font color='red'>", "</font>"), new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragment(analyzer, fieldName, fieldContent);
    }
}

0 comments on commit 91eefb5

Please sign in to comment.