Thursday, January 9, 2014

Learning Lucene/Solr Analyzer

org.apache.lucene.analysis.cn.smart.SentenceTokenizer.incrementToken()
// Attribute instances obtained via addAttribute(...): one each for the term
// text, the start/end offsets, and the token type.
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

// Clear the term attribute and fill it with the contents of buffer.
termAtt.setEmpty().append(buffer);
// Set the token offsets, passing both tokenStart and tokenEnd through correctOffset(...).
offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
// Tag the emitted token with the type string "sentence".
typeAtt.setType("sentence");

org.apache.solr.handler.AnalysisRequestHandlerBase.analyzeValue(String, AnalysisContext)
// Create the tokenizer stream from the chain's tokenizer factory; the input
// value is wrapped in a StringReader and passed through the chain's initReader(...).
TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
TokenStream tokenStream = tfac.create(tokenizerChain.initReader(null, new StringReader(value)));
// Drain the stream and keep the list of tokens it produced.
List<AttributeSource> tokens = analyzeTokenStream(tokenStream);

// Record the tokenizer's tokens in namedList, keyed by the stream's class name.
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

// NOTE(review): listBasedTokenStream is used in the loop below, but its
// declaration is not shown in this excerpt -- presumably it is created from
// the tokenizer's tokens before the loop; confirm against the full source.

// Process each token filter factory in order.
for (TokenFilterFactory tokenFilterFactory : filtfacs) {
// Call freezeStage() on the tracking attribute of every token in the current list.
for (final AttributeSource tok : tokens) {
tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
}
// Create this filter over listBasedTokenStream, drain it, and record its
// output under the filter stream's class name.
tokenStream = tokenFilterFactory.create(listBasedTokenStream);
tokens = analyzeTokenStream(tokenStream);
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
// Wrap this filter's output tokens so the next iteration reads from them.
listBasedTokenStream = new ListBasedTokenStream(tokens);
}

org.apache.solr.handler.AnalysisRequestHandlerBase.analyzeTokenStream(TokenStream)
  private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
      tokenStream.reset();
      int position = 0;
      while (tokenStream.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        trackerAtt.setActPosition(position);
        tokens.add(tokenStream.cloneAttributes());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    }

    return tokens;
  }

No comments:

Post a Comment