基于詞典的逆向最大匹配中文分詞算法,,能實現(xiàn)中英文數(shù)字混合分詞。比如能分出這樣的詞:bb霜,、3室,、樂phone、touch4,、mp3,、T恤。實際分詞效果比正向分詞效果好 public class RMM
{ private static final Log log = LogFactory.getLog(RMM.class); private static HashMap<String, Integer> dictionary = null; private static final int WORD_MAX_LENGTH = 9; static { loadDictionary(); } //將句子切分出詞,逆向最大匹配 public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException { Collections.reverse(list); ArrayList<Token> tokenlist=new ArrayList<Token>(); for(Sentence sen:list) { StringBuffer word = new StringBuffer(); int offset=sen.getStartOffset()+sen.getText().length; int bufferIndex = sen.getText().length-1; char c; boolean b=false; while(bufferIndex>-1) { offset--; c=sen.getText()[bufferIndex--]; if(word.length()==0) word.append(c); else { String temp = (c+word.toString()).intern(); if(dictionary.containsKey(temp) && dictionary.get(temp)==1) word.insert(0, c); else if(dictionary.containsKey(temp) && bufferIndex>-1) word.insert(0, c); else { bufferIndex++; offset++; while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2) { word.deleteCharAt(0); bufferIndex++; offset++; } b=true; } } if(b || bufferIndex==-1) { Token token = new Token(word.toString(),offset,offset+word.length(),"word"); word.setLength(0); tokenlist.add(token); b=false; } } } Collections.reverse(tokenlist); return tokenlist; } //加載詞典 public static void loadDictionary() { if (dictionary == null) { dictionary = new HashMap<String, Integer>(); InputStream is = null; BufferedReader br = null; try { is = new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI())); br = new BufferedReader(new InputStreamReader(is, "UTF-8")); String word = null; while ((word = br.readLine()) != null) { word=word.toLowerCase(); if ((word.indexOf("#") == -1) && (word.length() <= WORD_MAX_LENGTH)) { dictionary.put(word.intern(), 1); int i = 1; while(i < word.length()-1) { String temp = word.substring(i,word.length()).intern(); if (!dictionary.containsKey(temp)) dictionary.put(temp,2); i++; } } } } catch (Exception e) { log.info(e); } finally { try { if(br!=null) br.close(); if(is!=null) is.close(); } catch (IOException e) { log.info(e); } } } } public static String[] segWords(Reader reader) { ArrayList<String> list=new ArrayList<String>(); try { ArrayList<Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader))); for(Token t:tlist) { list.add(t.getWord()); } } catch(IOException e) { log.info(e); } return (String[])list.toArray(new String[0]); } public static void main(String[] args) { String[] cc=RMM.segWords(new StringReader("急,、急,、急、花里林居,二房二廳,業(yè)主誠心,出租".toLowerCase())); for(String c:cc) { System.out.println(c); } } } public class Util { //切分出由中文,、字母,、數(shù)字組成的句子 public static ArrayList<Sentence> getSentence(Reader reader) throws IOException { ArrayList<Sentence> list=new ArrayList<Sentence>(); StringBuffer cb=new StringBuffer(); int d=reader.read(); int offset=0; boolean b=false; while(d>-1) { int type=Character.getType(d); if(type==2 || type==9 || type==5) { d=toAscii(d); cb.append((char)d); } else { b=true; } d=reader.read(); if(d==-1 || b) { if(d==-1) offset++; b=false; char[] ioBuffer = new char[cb.length()]; cb.getChars(0, cb.length(), ioBuffer, 0); Sentence sen=new Sentence(ioBuffer,offset-cb.length()); list.add(sen); cb.setLength(0); } offset++; } return list; } //將相連的單個英文或數(shù)字組合成詞 public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException { ArrayList<Token> tokenlist=new ArrayList<Token>(); Token word=null; for(int i=0;i<list.size();i++) { Token t=list.get(i); if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5) { if(word==null) word=t; else if(word.getEnd()==t.getStart()) { word.setEnd(t.getEnd()); word.setWord(word.getWord()+t.getWord()); } else { tokenlist.add(word); word=t; } } else if(word!=null) { tokenlist.add(word); word=null; tokenlist.add(t); } else tokenlist.add(t); } if(word!=null) tokenlist.add(word); return tokenlist; } //雙角轉(zhuǎn)單角 public static int toAscii(int codePoint) { if((codePoint>=65296 && codePoint<=65305) //0-9 || (codePoint>=65313 && codePoint<=65338) //A-Z || (codePoint>=65345 && codePoint<=65370) //a-z ) { codePoint -= 65248; } return codePoint; } } |
|