Thursday, January 9, 2014

Learning SmartChineseAnalyzer

http://wutaoo.iteye.com/blog/415126
SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. Unlike ChineseAnalyzer (which emits each Hanzi as a separate token) and CJKAnalyzer (which emits overlapping pairs of adjacent Hanzi), it uses probabilities to find an optimal segmentation of a Chinese sentence, and it embeds an English tokenizer, so it handles mixed Chinese and English text well. At present, SmartChineseAnalyzer's dictionary supports only Simplified Chinese.
It is based on the Hidden Markov Model (HMM) from natural language processing: word frequencies and transition probabilities are estimated from a large training corpus, and these statistics are then used to compute the most likely segmentation of an entire Chinese sentence.
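Before looking at the internals, here is a minimal usage sketch. It assumes the Lucene 4.x-era API that was current when this post was written (the field name "body" and the sample text are arbitrary):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SmartCnDemo {
  public static void main(String[] args) throws Exception {
    SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46);
    // Mixed Chinese/English input: the Chinese part is segmented by the
    // HHMM model, the English part by the embedded English tokenizer.
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("我是中国人, and I use Lucene."));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}

Internally, each sentence of the input ends up in WordSegmenter.segmentSentence: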

org.apache.lucene.analysis.cn.smart.WordSegmenter.segmentSentence(String, int)

public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
  // result: the tokens of the sentence, excluding the artificial
  // WordType.SENTENCE_BEGIN and WordType.SENTENCE_END tokens
  List<SegToken> result = Collections.emptyList();

  if (segTokenList.size() > 2) // if it's not an empty sentence
    result = segTokenList.subList(1, segTokenList.size() - 1);

  for (SegToken st : result)
    convertSegToken(st, sentence, startOffset);

  return result;
}
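The HHMM segmenter brackets every sentence with artificial SENTENCE_BEGIN and SENTENCE_END tokens, which act as fixed start and end anchors for the bigram graph built below; that is why a non-empty result has more than two tokens and why subList(1, size - 1) drops the first and last one. convertSegToken then rebases each remaining token's offsets from sentence-relative to document-relative by adding startOffset.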

org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter.process(String)
  public List<SegToken> process(String sentence) {
    // Build the word graph: one token for every dictionary word (and every
    // single Hanzi) occurring in the sentence.
    SegGraph segGraph = createSegGraph(sentence);
    // Turn it into a graph of adjacent-token pairs weighted by bigram statistics.
    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
    // The cheapest path through the pair graph is the most likely segmentation.
    List<SegToken> shortPath = biSegGraph.getShortPath();
    return shortPath;
  }
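The edge weights in the BiSegGraph are, roughly, the negative log of a smoothed bigram transition probability, so the shortest path returned by getShortPath is exactly the maximum-likelihood segmentation described at the top of this post. The first step, createSegGraph, builds the word graph itself: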

org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter.createSegGraph(String)
    SegGraph segGraph = new SegGraph();
    // (Declarations of i, j, length, charTypeArray, wordBuf, charArray,
    // frequency, token and foundIndex are omitted from this excerpt.)
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // Add the single Hanzi to the SegGraph whether or not it is a
          // dictionary word on its own; otherwise the graph could have a gap
          // and no complete path through the sentence would exist.
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          // Extend the candidate word for as long as it is still a prefix
          // of some dictionary entry.
          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // charArray is itself a dictionary word: add a multi-character
              // word token spanning positions i to j.
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // charArray was already found as a prefix (foundIndex != -1),
              // and the dictionary is ordered, so the lengthened charArray
              // can only occur at or after foundIndex: resume the search there.
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        // ... the other CharType cases (SPACE_LIKE, LETTER, DIGIT, etc.)
        // are omitted from this excerpt ...
      }
    }
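To make the prefix-match loop concrete, here is a hypothetical, self-contained sketch of the same idea. It replaces the real WordDictionary with a plain sorted String array (the real class stores hashed char arrays with frequencies), ignores SPACE_LIKE characters, and collects strings instead of SegTokens:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ToySegGraph {
  // Hypothetical toy dictionary, kept sorted so it can be prefix-searched.
  static final String[] DICT = { "中国", "中国人", "人", "国" };
  static { Arrays.sort(DICT); }

  // Index of the first entry at position >= from that starts with prefix,
  // or -1 if none; this plays the role of WordDictionary.getPrefixMatch.
  static int prefixMatch(String prefix, int from) {
    for (int k = from; k < DICT.length; k++) {
      if (DICT[k].startsWith(prefix)) return k;
      if (DICT[k].compareTo(prefix) > 0) return -1; // sorted: no later match
    }
    return -1;
  }

  public static void main(String[] args) {
    String sentence = "中国人";
    List<String> tokens = new ArrayList<String>();
    for (int i = 0; i < sentence.length(); i++) {
      // Always add the single Hanzi, so the graph has no gaps.
      tokens.add(sentence.substring(i, i + 1) + "[" + i + "," + (i + 1) + ")");
      int j = i + 1;
      int foundIndex = prefixMatch(sentence.substring(i, j), 0);
      // Extend the candidate while it is still a prefix of some entry.
      while (j < sentence.length() && foundIndex != -1) {
        j++;
        String cand = sentence.substring(i, j);
        foundIndex = prefixMatch(cand, foundIndex); // resume at the last hit
        if (foundIndex != -1 && DICT[foundIndex].equals(cand)) {
          tokens.add(cand + "[" + i + "," + j + ")");
        }
      }
    }
    System.out.println(tokens);
    // prints: [中[0,1), 中国[0,2), 中国人[0,3), 国[1,2), 人[2,3)]
  }
}

With the dictionary above, the sentence 中国人 yields five candidate tokens: the three single Hanzi plus 中国 and 中国人. These are exactly the vertices among which BiSegGraph's shortest path later chooses the final segmentation.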

