1.自定义TokenFilter
import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import java.io.IOException;import java.util.HashMap;import java.util.Map;public class CourtesyTokenFilter extends TokenFilter { private MapcourtesyMap = new HashMap<>(); private CharTermAttribute charTermAttribute; public CourtesyTokenFilter(TokenStream input) { super(input); this.charTermAttribute = this.addAttribute(CharTermAttribute.class); courtesyMap.put("dr", "doctor"); courtesyMap.put("mr", "mister"); courtesyMap.put("mrs", "miss"); } @Override public final boolean incrementToken() throws IOException { if (!this.input.incrementToken()) { return false; } String term = this.charTermAttribute.toString(); if (courtesyMap.containsKey(term)) { this.charTermAttribute.setEmpty().append(this.courtesyMap.get(term)); } return true; }}
2.应用TokenFilter:
String text = "Hi, Dr Wang, Mr Liu asks if you stay with Mrs Liu yesterday!"; StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); CourtesyTokenFilter courtesyTokenFilter = new CourtesyTokenFilter(standardAnalyzer.tokenStream("text", text)); CharTermAttribute charTermAttribute = courtesyTokenFilter.addAttribute(CharTermAttribute.class); courtesyTokenFilter.reset(); while (courtesyTokenFilter.incrementToken()) { System.out.print(charTermAttribute + " "); }
3.场景解析
"Hi, Dr Wang, Mr Liu asks if you stay with Mrs Liu yesterday!" 这段文本中,有 Dr、Mr、Mrs 这三个缩写词,读者可能看不懂,因此要替换成全称来显示。由于 StandardAnalyzer 会先把词元转换为小写,过滤器中映射表的键也使用小写(dr、mr、mrs)。