查询了好多资料,英文同义词处理得好好的,中文就不行。多谢网友支持,我拼接了好多代码,然后修改了一些。不足之处,敬请指正。
直接上代码吧,通过代码来了解怎么分词是最好的。
1,创建分词引擎
/**
 * Source of synonyms ("same words") for a given term.
 */
public interface SamewordContext {
    /**
     * Looks up the synonyms registered for a term.
     *
     * @param name the term to look up
     * @return the synonyms of {@code name}; may be {@code null} when the term
     *         has no synonyms, so callers are expected to null-check
     */
    String[] getSamewords(String name);
}
2,同义词
1 import java.util.HashMap; 2 import java.util.Map; 3 4 public class SimpleSamewordContext implements SamewordContext { 5 Mapmaps = new HashMap (); 6 public SimpleSamewordContext() { 7 maps.put("中国",new String[]{"天朝","大陆"}); 8 maps.put("我家",new String[]{"family","伐木累"}); 9 }10 @Override11 public String[] getSamewords(String name) {12 // TODO Auto-generated method stub13 return maps.get(name);14 }15 }
3,TokenFilter
import java.io.IOException;import java.util.Stack;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;import org.apache.lucene.util.AttributeSource;public class MySameTokenFilter extends TokenFilter { private CharTermAttribute cta = null; private PositionIncrementAttribute pia = null; private AttributeSource.State current; private Stacksames = null; private SamewordContext samewordContext; protected MySameTokenFilter(TokenStream input,SamewordContext samewordContext) { super(input); cta = this.addAttribute(CharTermAttribute.class); pia = this.addAttribute(PositionIncrementAttribute.class); sames = new Stack (); this.samewordContext = samewordContext; } @Override public boolean incrementToken() throws IOException { if(sames.size()>0) { //将元素出栈,并且获取这个同义词 String str = sames.pop(); //还原状态 restoreState(current); cta.setEmpty(); cta.append(str); //设置位置0 pia.setPositionIncrement(0); return true; } if(!this.input.incrementToken()) return false; if(addSames(cta.toString())) { //如果有同义词将当前状态先保存 current = captureState(); } return true; } private boolean addSames(String name) { String[] sws = samewordContext.getSamewords(name); if(sws!=null) { for(String str:sws) { sames.push(str); } return true; } return false; } }
4,Analyzer
import java.io.Reader;import java.io.StringReader;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.core.LowerCaseFilter;import org.apache.lucene.analysis.core.StopAnalyzer;import org.apache.lucene.analysis.core.StopFilter;import org.wltea.analyzer.lucene.IKTokenizer;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;public class MySameworkAnalyzer extends MMSegAnalyzer { private SamewordContext samewordContext; public MySameworkAnalyzer(SamewordContext samewordContext) { // TODO Auto-generated constructor stub this.samewordContext = samewordContext; } @Override protected TokenStreamComponents createComponents(String text) { Reader in = new StringReader(text); IKTokenizer tokenizer = new IKTokenizer(in , true); TokenStream tokenStream = new MySameTokenFilter(tokenizer, samewordContext); tokenStream = new LowerCaseFilter(tokenStream); tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); return new TokenStreamComponents(tokenizer, tokenStream); }}
5,测试
/**
 * Hands a sample sentence and a synonym-aware analyzer to
 * {@code AnalyzerUtils.displayAllToken}.
 */
@Test
public void test01() {
    final String sentence = "我家在中国";
    final SamewordContext synonyms = new SimpleSamewordContext();
    final Analyzer synonymAnalyzer = new MySameworkAnalyzer(synonyms);
    AnalyzerUtils.displayAllToken(sentence, synonymAnalyzer);
}
运行结果: