com.raritantechnologies.concept.classifier
Class IndexedDocument

java.lang.Object
  extended by com.raritantechnologies.concept.classifier.IndexedDocument

public class IndexedDocument
extends java.lang.Object

Contains an indexed view of a document: A map of tokens to token positions.


Developed by Raritan Technologies Inc.

Author:
Ted Sullivan

Nested Class Summary
 class IndexedDocument.Excerpt
           
 class IndexedDocument.Sentence
           
 class IndexedDocument.SentenceStats
           
 
Field Summary
static java.lang.String TOKEN_DELIMITER
           
 
Constructor Summary
IndexedDocument()
           
IndexedDocument(java.lang.String document)
           
IndexedDocument(java.lang.String document, boolean detectSentences)
           
 
Method Summary
 void addToken(java.lang.String token, int wordPosition, long charPosition)
           
 boolean contains(java.lang.String token)
           
 boolean contains(java.lang.String token, boolean caseSensitive)
           
 java.util.ArrayList getCharacterPositionList(java.lang.String pToken, boolean caseSensitive)
           
 int getCount(java.lang.String token)
           
 int getCount(java.lang.String token, boolean caseSensitive)
           
 java.lang.String getDelimiter()
           
 java.lang.String getDocumentText()
           
 java.lang.String[] getExcerpts(IDocumentMatcher docMatcher, int excerptWidth, boolean caseSensitive)
           
 java.lang.String[] getExcerpts(IDocumentMatcher docMatcher, int excerptWidth, boolean caseSensitive, java.lang.String preMarkup, java.lang.String postMarkup)
           
 java.lang.String[] getExcerpts(java.lang.String[] termList, int excerptWidth, boolean caseSensitive)
           
 java.lang.String[] getExcerpts(java.lang.String[] termList, int excerptWidth, boolean caseSensitive, java.lang.String preMarkup, java.lang.String postMarkup)
           
 double getFieldBoost(java.lang.String fieldName)
           
 java.util.Set getPositionSet(java.lang.String token)
           
 java.util.Set getPositionSet(java.lang.String token, boolean caseSensitive)
           
 java.util.Map getSentenceStats(java.lang.String[] wordSet, boolean caseSensitive)
          Returns a map from sentence number to SentenceStats objects.
 java.util.List getSortedSentences(java.lang.String[] wordSet, boolean caseSensitive)
          Returns SentenceStats in order of decreasing word hit count.
 java.util.Iterator getTokens()
           
 java.util.Iterator getTokens(boolean caseSensitive)
           
 int getWordCount()
           
 boolean isStopWord(java.lang.String word)
           
static java.lang.String markupExcerpt(java.lang.String excerpt, java.lang.String delimiter, java.lang.String term, boolean caseSensitive, java.lang.String preMarkup, java.lang.String postMarkup)
           
 int minDistance(java.lang.String termOne, java.lang.String termTwo)
          Returns the minimum distance between the two terms in the document.
 int minDistance(java.lang.String termOne, java.lang.String termTwo, boolean caseSensitive)
           
 int minDistance(java.lang.String termOne, java.lang.String termTwo, boolean caseSensitive, boolean ordered)
           
 void setDelimiter(java.lang.String delimiter)
           
 void setDocumentText(java.lang.String documentText)
           
 void setFieldBoost(java.lang.String fieldName, java.lang.Double fieldBoost)
           
 void setFieldBoost(java.lang.String fieldName, java.lang.String boostValue)
           
 void setFieldBoostMap(java.util.Map fieldBoostMap)
           
 void setRemoveHTML(boolean removeHTML)
           
 void setStopWords(java.util.Set stopWords)
           
 java.util.Set tokenize(java.util.Map tokenMap, boolean doDetectSentences)
           
 java.util.Set tokenize(java.util.Map tokenMap, boolean doDetectSentences, boolean caseSensitive)
           
 void tokenize(java.lang.String document)
           
 java.util.Set tokenize(java.lang.String document, java.util.Map tokenMap)
           
 java.util.Set tokenize(java.lang.String pDocument, java.util.Map tokenMap, boolean doDetectSentences)
           
 java.util.Set tokenize(java.lang.String pDocument, java.util.Map tokenMap, boolean doDetectSentences, boolean caseSensitive)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

TOKEN_DELIMITER

public static final java.lang.String TOKEN_DELIMITER
See Also:
Constant Field Values
Constructor Detail

IndexedDocument

public IndexedDocument()

IndexedDocument

public IndexedDocument(java.lang.String document)

IndexedDocument

public IndexedDocument(java.lang.String document,
                       boolean detectSentences)
Method Detail

setDelimiter

public void setDelimiter(java.lang.String delimiter)

getDelimiter

public java.lang.String getDelimiter()

setDocumentText

public void setDocumentText(java.lang.String documentText)

getDocumentText

public java.lang.String getDocumentText()

setStopWords

public void setStopWords(java.util.Set stopWords)

isStopWord

public boolean isStopWord(java.lang.String word)

setFieldBoostMap

public void setFieldBoostMap(java.util.Map fieldBoostMap)

getFieldBoost

public double getFieldBoost(java.lang.String fieldName)

setFieldBoost

public void setFieldBoost(java.lang.String fieldName,
                          java.lang.String boostValue)

setFieldBoost

public void setFieldBoost(java.lang.String fieldName,
                          java.lang.Double fieldBoost)

setRemoveHTML

public void setRemoveHTML(boolean removeHTML)

tokenize

public void tokenize(java.lang.String document)

tokenize

public java.util.Set tokenize(java.lang.String document,
                              java.util.Map tokenMap)

tokenize

public java.util.Set tokenize(java.util.Map tokenMap,
                              boolean doDetectSentences)

tokenize

public java.util.Set tokenize(java.util.Map tokenMap,
                              boolean doDetectSentences,
                              boolean caseSensitive)

tokenize

public java.util.Set tokenize(java.lang.String pDocument,
                              java.util.Map tokenMap,
                              boolean doDetectSentences)

tokenize

public java.util.Set tokenize(java.lang.String pDocument,
                              java.util.Map tokenMap,
                              boolean doDetectSentences,
                              boolean caseSensitive)

getTokens

public java.util.Iterator getTokens()

getTokens

public java.util.Iterator getTokens(boolean caseSensitive)

addToken

public void addToken(java.lang.String token,
                     int wordPosition,
                     long charPosition)

getCount

public int getCount(java.lang.String token)

getWordCount

public int getWordCount()

getCount

public int getCount(java.lang.String token,
                    boolean caseSensitive)

getPositionSet

public java.util.Set getPositionSet(java.lang.String token)

getPositionSet

public java.util.Set getPositionSet(java.lang.String token,
                                    boolean caseSensitive)

contains

public boolean contains(java.lang.String token)

contains

public boolean contains(java.lang.String token,
                        boolean caseSensitive)

minDistance

public int minDistance(java.lang.String termOne,
                       java.lang.String termTwo)
Returns the minimum distance between the two terms in the document.


minDistance

public int minDistance(java.lang.String termOne,
                       java.lang.String termTwo,
                       boolean caseSensitive)

minDistance

public int minDistance(java.lang.String termOne,
                       java.lang.String termTwo,
                       boolean caseSensitive,
                       boolean ordered)

getSortedSentences

public java.util.List getSortedSentences(java.lang.String[] wordSet,
                                         boolean caseSensitive)
Returns SentenceStats in order of decreasing word hit count.


getSentenceStats

public java.util.Map getSentenceStats(java.lang.String[] wordSet,
                                      boolean caseSensitive)
Returns a map from sentence number to SentenceStats objects.


getCharacterPositionList

public java.util.ArrayList getCharacterPositionList(java.lang.String pToken,
                                                    boolean caseSensitive)

getExcerpts

public java.lang.String[] getExcerpts(IDocumentMatcher docMatcher,
                                      int excerptWidth,
                                      boolean caseSensitive)

getExcerpts

public java.lang.String[] getExcerpts(IDocumentMatcher docMatcher,
                                      int excerptWidth,
                                      boolean caseSensitive,
                                      java.lang.String preMarkup,
                                      java.lang.String postMarkup)

getExcerpts

public java.lang.String[] getExcerpts(java.lang.String[] termList,
                                      int excerptWidth,
                                      boolean caseSensitive)

getExcerpts

public java.lang.String[] getExcerpts(java.lang.String[] termList,
                                      int excerptWidth,
                                      boolean caseSensitive,
                                      java.lang.String preMarkup,
                                      java.lang.String postMarkup)

markupExcerpt

public static java.lang.String markupExcerpt(java.lang.String excerpt,
                                             java.lang.String delimiter,
                                             java.lang.String term,
                                             boolean caseSensitive,
                                             java.lang.String preMarkup,
                                             java.lang.String postMarkup)