`
heilwolf
  • 浏览: 32806 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

中文分词 mmseg4j 在 lucene 中的使用示例

阅读更多
mmseg4j 发布也有一段时间了,前些日子忙着发布新的版本,修正 bug 之类的。使用示例一直拖到现在,其实 svn 上的 test 有使用 lucene 例子。如果你了解 lucene ,就不用例子也可以很方便与它集成。

mmseg4j 有几个 analyzer:SimpleAnalyzer、ComplexAnalyzer、MaxWordAnalyzer、MMSegAnalyzer。前三个都继承自 MMSegAnalyzer,MMSegAnalyzer 默认使用 max-word 方式分词。这些 analyzer 都有无参数的构造函数,还有一个带词库目录为参数的构造函数。怎么用也没多少可说的,看下 svn 里 test:

package com.chenlb.mmseg4j.lucene;  
  
import java.io.IOException;  
  
import junit.framework.TestCase;  
  
import org.apache.lucene.analysis.Analyzer;  
import org.apache.lucene.document.Document;  
import org.apache.lucene.document.Field;  
import org.apache.lucene.index.CorruptIndexException;  
import org.apache.lucene.index.IndexWriter;  
import org.apache.lucene.queryParser.ParseException;  
import org.apache.lucene.queryParser.QueryParser;  
import org.apache.lucene.search.IndexSearcher;  
import org.apache.lucene.search.Query;  
import org.apache.lucene.search.ScoreDoc;  
import org.apache.lucene.search.TopDocs;  
import org.apache.lucene.store.Directory;  
import org.apache.lucene.store.RAMDirectory;  
  
import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;  
import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;  
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;  
  
public class LuceneUseSimpleAnalyzerTest extends TestCase {  
  
    Directory dir;  
    Analyzer analyzer;  
  
    @Override  
    protected void setUp throws Exception {  
        String txt = "京华时报1月23日报道 昨 天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";  
        //txt = "2008年底发了资金吗";  
        analyzer = new SimpleAnalyzer;  
        analyzer = new ComplexAnalyzer;  
        //analyzer = new MaxWordAnalyzer;  
        dir = new RAMDirectory;  
        IndexWriter iw = new IndexWriter(dir, analyzer);  
        Document doc = new Document;  
        doc.add(new Field("txt", txt, Field.Store.YES, Field.Index.ANALYZED));  
        iw.addDocument(doc);  
        iw.commit;  
        iw.optimize;  
        iw.close;  
    }  
  
    public void testSearch {  
        try {  
            IndexSearcher searcher = new IndexSearcher(dir);  
            QueryParser qp = new QueryParser("txt", analyzer);  
            Query q = qp.parse("西 伯利亚"); //2008年底  
            System.out.println(q);  
            TopDocs tds = searcher.search(q, 10);  
            System.out.println("======size:"+tds.totalHits+"========");  
            for(ScoreDoc sd : tds.scoreDocs) {  
                System.out.println(sd.score);  
                System.out.println(searcher.doc(sd.doc).get("txt"));  
            }  
        } catch (CorruptIndexException e) {  
  
            e.printStackTrace;  
        } catch (IOException e) {  
  
            e.printStackTrace;  
        } catch (ParseException e) {  
  
            e.printStackTrace;  
        }  
    }  
  
}  

package com.chenlb.mmseg4j.lucene;

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;
import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;

public class LuceneUseSimpleAnalyzerTest extends TestCase {

Directory dir;
Analyzer analyzer;

@Override
protected void setUp throws Exception {
String txt = "京华时报1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";
//txt = "2008年底发了资金吗";
analyzer = new SimpleAnalyzer;
analyzer = new ComplexAnalyzer;
//analyzer = new MaxWordAnalyzer;
dir = new RAMDirectory;
IndexWriter iw = new IndexWriter(dir, analyzer);
Document doc = new Document;
doc.add(new Field("txt", txt, Field.Store.YES, Field.Index.ANALYZED));
iw.addDocument(doc);
iw.commit;
iw.optimize;
iw.close;
}

public void testSearch {
try {
IndexSearcher searcher = new IndexSearcher(dir);
QueryParser qp = new QueryParser("txt", analyzer);
Query q = qp.parse("西伯利亚"); //2008年底
System.out.println(q);
TopDocs tds = searcher.search(q, 10);
System.out.println("======size:"+tds.totalHits+"========");
for(ScoreDoc sd : tds.scoreDocs) {
System.out.println(sd.score);
System.out.println(searcher.doc(sd.doc).get("txt"));
}
} catch (CorruptIndexException e) {

e.printStackTrace;
} catch (IOException e) {

e.printStackTrace;
} catch (ParseException e) {

e.printStackTrace;
}
}

}
运行结果:

2009-4-26 22:41:02 com.chenlb.mmseg4j.Dictionary getDefalutPath
信息: look up in mmseg.dic.path=null
2009-4-26 22:41:02 com.chenlb.mmseg4j.Dictionary getDefalutPath
信息: look up in user.dir=M:\workspace\mmseg4j/data
2009-4-26 22:41:02 com.chenlb.mmseg4j.Dictionary loadDic
信息: chars loaded time=391ms, line=12638, on file=M:\workspace\mmseg4j\data\chars.dic
2009-4-26 22:41:02 com.chenlb.mmseg4j.Dictionary loadDic
信息: words loaded time=16ms, line=1, on file=M:\workspace\mmseg4j\data\words-my.dic
2009-4-26 22:41:06 com.chenlb.mmseg4j.Dictionary loadDic
信息: words loaded time=3406ms, line=157202, on file=M:\workspace\mmseg4j\data\words.dic
2009-4-26 22:41:06 com.chenlb.mmseg4j.Dictionary loadDic
信息: sort time=0ms
2009-4-26 22:41:06 com.chenlb.mmseg4j.Dictionary loadDic
信息: load dic use time=3844ms
2009-4-26 22:41:06 com.chenlb.mmseg4j.Dictionary loadUnit
信息: unit loaded time=16ms, line=22, on file=M:\workspace\mmseg4j\data\units.dic
txt:西伯利亚
======size:1========
0.047945753
京华时报1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。

上面的运行环境是 lucene 2.4 的。


分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics