http://wutaoo.iteye.com/blog/415126
SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. Unlike ChineseAnalyzer (which emits one token per Chinese character) and CJKAnalyzer (which groups every two adjacent characters), it uses probabilities to pick the optimal segmentation of a Chinese sentence, and it embeds an English tokenizer, so it handles mixed Chinese and English text well. At present, SmartChineseAnalyzer's dictionary only supports Simplified Chinese.
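A minimal usage sketch (assuming a Lucene release where SmartChineseAnalyzer has a no-argument constructor and exposes CharTermAttribute; older releases take a Version argument and may use TermAttribute instead): feed mixed Chinese/English text through the analyzer and print each token it produces.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SmartCnDemo {
  public static void main(String[] args) throws IOException {
    // Constructor varies by Lucene release: newer versions take no arguments,
    // older ones require a Version argument.
    Analyzer analyzer = new SmartChineseAnalyzer();
    String text = "Lucene是一个开源的全文检索库, mixing English and 中文 in one sentence.";
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // one segmented word or English token per line
    }
    ts.end();
    ts.close();
  }
}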
It is based on the Hidden Markov Model (HMM) from natural language processing: a large training corpus is used to estimate word frequencies and transition probabilities for Chinese words, and from those statistics the analyzer computes the most likely (maximum-likelihood) segmentation of the whole sentence.
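As a rough illustration of the maximum-likelihood idea (the probabilities below are invented toy numbers, not values from the smartcn dictionary), each candidate segmentation can be scored by the sum of the negative log bigram probabilities of its words; the segmentation with the lowest cost is the most likely one:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SegmentationScoreDemo {
  // Toy bigram probabilities P(next word | previous word); values are invented for illustration.
  static final Map<String, Double> BIGRAM = new HashMap<>();
  static {
    BIGRAM.put("<s> 研究", 0.20);
    BIGRAM.put("研究 生命", 0.05);
    BIGRAM.put("生命 起源", 0.10);
    BIGRAM.put("<s> 研究生", 0.02);
    BIGRAM.put("研究生 命", 0.001);
    BIGRAM.put("命 起源", 0.001);
  }

  // Negative log-likelihood of one segmentation; lower cost means higher likelihood.
  static double cost(List<String> words) {
    double cost = 0.0;
    String prev = "<s>";
    for (String w : words) {
      double p = BIGRAM.getOrDefault(prev + " " + w, 1e-6); // smoothing for unseen bigrams
      cost += -Math.log(p);
      prev = w;
    }
    return cost;
  }

  public static void main(String[] args) {
    System.out.println("研究/生命/起源  cost=" + cost(Arrays.asList("研究", "生命", "起源")));
    System.out.println("研究生/命/起源  cost=" + cost(Arrays.asList("研究生", "命", "起源")));
    // The first segmentation has the lower cost, so it is the one the model prefers.
  }
}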
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
  // tokens from the sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
  List<SegToken> result = Collections.emptyList();
  if (segTokenList.size() > 2) // i.e. the sentence is not empty
    result = segTokenList.subList(1, segTokenList.size() - 1);
  for (SegToken st : result)
    convertSegToken(st, sentence, startOffset);
  return result;
}
org.apache.lucene.analysis.cn.smart.WordSegmenter.segmentSentence(String, int)
public List<SegToken> process(String sentence) {
  SegGraph segGraph = createSegGraph(sentence);
  BiSegGraph biSegGraph = new BiSegGraph(segGraph);
  List<SegToken> shortPath = biSegGraph.getShortPath();
  return shortPath;
}
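process() strings the whole pipeline together: createSegGraph() collects every dictionary word found in the sentence into a lattice, BiSegGraph adds bigram edges whose weights come from the trained statistics, and getShortPath() returns the minimum-cost path through that lattice, i.e. the most likely segmentation. The sketch below shows only the shortest-path step over a hand-built toy lattice; the Candidate class and the costs are illustrative stand-ins, not the real SegToken/BiSegGraph types:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class LatticeShortestPathDemo {

  // A candidate word covering [start, end) of the sentence; cost is an illustrative
  // negative-log-probability (the real analyzer derives it from the trained statistics).
  static final class Candidate {
    final int start, end;
    final String text;
    final double cost;
    Candidate(int start, int end, String text, double cost) {
      this.start = start;
      this.end = end;
      this.text = text;
      this.cost = cost;
    }
  }

  // best[j] = cheapest cost of segmenting the first j characters;
  // back[j] remembers the candidate that ends at j on that cheapest path.
  static List<String> bestSegmentation(int length, List<Candidate> lattice) {
    double[] best = new double[length + 1];
    Candidate[] back = new Candidate[length + 1];
    Arrays.fill(best, Double.POSITIVE_INFINITY);
    best[0] = 0.0;
    for (int j = 1; j <= length; j++) {
      for (Candidate c : lattice) {
        if (c.end == j && best[c.start] + c.cost < best[j]) {
          best[j] = best[c.start] + c.cost;
          back[j] = c;
        }
      }
    }
    List<String> words = new ArrayList<String>();
    for (int j = length; j > 0; j = back[j].start) {
      words.add(0, back[j].text); // walk the back-pointers and rebuild the winning path
    }
    return words;
  }

  public static void main(String[] args) {
    String sentence = "研究生命起源"; // 6 characters
    List<Candidate> lattice = Arrays.asList(
        new Candidate(0, 2, "研究", 1.6),
        new Candidate(0, 3, "研究生", 3.9),
        new Candidate(2, 4, "生命", 3.0),
        new Candidate(3, 4, "命", 6.9),
        new Candidate(4, 6, "起源", 2.3));
    System.out.println(bestSegmentation(sentence.length(), lattice));
    // Prints [研究, 生命, 起源]: the cheapest, i.e. most likely, path through the lattice.
  }
}

In the real BiSegGraph the weights sit on pairs of adjacent tokens (bigrams) rather than on single tokens, but the dynamic-programming search is the same idea.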
org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter.createSegGraph(String)
SegGraph segGraph = new SegGraph();
// ... local variable declarations omitted ...
while (i < length) {
  hasFullWidth = false;
  switch (charTypeArray[i]) {
    case CharType.HANZI:
      j = i + 1;
      wordBuf.delete(0, wordBuf.length());
      // Whether or not the single Hanzi at position i can start a dictionary word,
      // store it in the SegGraph as a one-character token; otherwise the segmentation
      // graph could be broken at this position.
      wordBuf.append(sentence.charAt(i));
      charArray = new char[] { sentence.charAt(i) };
      frequency = wordDict.getFrequency(charArray);
      token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, frequency);
      segGraph.addToken(token);

      // Lengthen the candidate one Hanzi at a time while it is still a prefix of
      // some dictionary entry.
      foundIndex = wordDict.getPrefixMatch(charArray);
      while (j <= length && foundIndex != -1) {
        if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
          // charArray is an exact dictionary word spanning [i, j), not merely a prefix
          // and not a single character: add it as a candidate token.
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, frequency);
          segGraph.addToken(token);
        }
        // Skip space-like characters between Hanzi.
        while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
          j++;
        if (j < length && charTypeArray[j] == CharType.HANZI) {
          wordBuf.append(sentence.charAt(j));
          charArray = new char[wordBuf.length()];
          wordBuf.getChars(0, charArray.length, charArray, 0);
          // charArray was already found as a prefix at foundIndex; since the dictionary's
          // word table is sorted, the lengthened word can only occur at or after foundIndex,
          // so the prefix search resumes from there.
          foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
          j++;
        } else {
          break;
        }
      }
      i++;
      break;
    // ... cases for other character types (letters, digits, delimiters, etc.) omitted ...
  }
}
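The interesting detail above is the incremental prefix match: once charArray has been found as a prefix at foundIndex, the lengthened candidate can only occur at or after that index in the sorted word table, so the search resumes there instead of restarting. The toy sketch below reproduces that enumeration with a small sorted String[] standing in for wordDict; this getPrefixMatch and the dictionary contents are hypothetical stand-ins, not the smartcn WordDictionary API:

import java.util.ArrayList;
import java.util.List;

public class PrefixMatchDemo {
  // Toy sorted dictionary; the real analyzer uses a much larger, pre-built word dictionary.
  static final String[] DICT = { "中华", "中华人民", "中华人民共和国", "人民", "共和国" };

  // Return the first index >= from whose entry starts with prefix, or -1 if none.
  static int getPrefixMatch(String prefix, int from) {
    for (int k = Math.max(from, 0); k < DICT.length; k++) {
      if (DICT[k].startsWith(prefix)) {
        return k;
      }
    }
    return -1;
  }

  // Enumerate every multi-character dictionary word that starts at position i of the
  // sentence, lengthening the candidate one character at a time as createSegGraph does
  // (the single-character token is added separately in the real code).
  static List<String> wordsStartingAt(String sentence, int i) {
    List<String> words = new ArrayList<String>();
    int j = i + 1;
    String candidate = sentence.substring(i, j);
    int foundIndex = getPrefixMatch(candidate, 0);
    while (j <= sentence.length() && foundIndex != -1) {
      if (DICT[foundIndex].equals(candidate) && candidate.length() > 1) {
        words.add(candidate); // an exact dictionary word spanning [i, j)
      }
      if (j < sentence.length()) {
        candidate = sentence.substring(i, ++j);
        // Resume from foundIndex: the longer candidate cannot occur earlier in the sorted table.
        foundIndex = getPrefixMatch(candidate, foundIndex);
      } else {
        break;
      }
    }
    return words;
  }

  public static void main(String[] args) {
    System.out.println(wordsStartingAt("中华人民共和国", 0));
    // Prints [中华, 中华人民, 中华人民共和国]: overlapping candidates that the lattice
    // needs so getShortPath() can later choose among them.
  }
}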