com.raritantechnologies.concept.classifier
Class IndexedDocument
java.lang.Object
com.raritantechnologies.concept.classifier.IndexedDocument
- public class IndexedDocument
- extends java.lang.Object
Contains an indexed view of a document: a map of tokens to token positions.
Developed by
Raritan Technologies Inc.
- Author:
- Ted Sullivan
|
Method Summary |
void |
addToken(java.lang.String token,
int wordPosition,
long charPosition)
|
boolean |
contains(java.lang.String token)
|
boolean |
contains(java.lang.String token,
boolean caseSensitive)
|
java.util.ArrayList |
getCharacterPositionList(java.lang.String pToken,
boolean caseSensitive)
|
int |
getCount(java.lang.String token)
|
int |
getCount(java.lang.String token,
boolean caseSensitive)
|
java.lang.String |
getDelimiter()
|
java.lang.String |
getDocumentText()
|
java.lang.String[] |
getExcerpts(IDocumentMatcher docMatcher,
int excerptWidth,
boolean caseSensitive)
|
java.lang.String[] |
getExcerpts(IDocumentMatcher docMatcher,
int excerptWidth,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)
|
java.lang.String[] |
getExcerpts(java.lang.String[] termList,
int excerptWidth,
boolean caseSensitive)
|
java.lang.String[] |
getExcerpts(java.lang.String[] termList,
int excerptWidth,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)
|
double |
getFieldBoost(java.lang.String fieldName)
|
java.util.Set |
getPositionSet(java.lang.String token)
|
java.util.Set |
getPositionSet(java.lang.String token,
boolean caseSensitive)
|
java.util.Map |
getSentenceStats(java.lang.String[] wordSet,
boolean caseSensitive)
Returns a map from sentence number to SentenceStats object. |
java.util.List |
getSortedSentences(java.lang.String[] wordSet,
boolean caseSensitive)
Returns a list of SentenceStats objects in order of decreasing word hit count. |
java.util.Iterator |
getTokens()
|
java.util.Iterator |
getTokens(boolean caseSensitive)
|
int |
getWordCount()
|
boolean |
isStopWord(java.lang.String word)
|
static java.lang.String |
markupExcerpt(java.lang.String excerpt,
java.lang.String delimiter,
java.lang.String term,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)
|
int |
minDistance(java.lang.String termOne,
java.lang.String termTwo)
Returns the minimum distance between the two terms in the document. |
int |
minDistance(java.lang.String termOne,
java.lang.String termTwo,
boolean caseSensitive)
|
int |
minDistance(java.lang.String termOne,
java.lang.String termTwo,
boolean caseSensitive,
boolean ordered)
|
void |
setDelimiter(java.lang.String delimiter)
|
void |
setDocumentText(java.lang.String documentText)
|
void |
setFieldBoost(java.lang.String fieldName,
java.lang.Double fieldBoost)
|
void |
setFieldBoost(java.lang.String fieldName,
java.lang.String boostValue)
|
void |
setFieldBoostMap(java.util.Map fieldBoostMap)
|
void |
setRemoveHTML(boolean removeHTML)
|
void |
setStopWords(java.util.Set stopWords)
|
java.util.Set |
tokenize(java.util.Map tokenMap,
boolean doDetectSentences)
|
java.util.Set |
tokenize(java.util.Map tokenMap,
boolean doDetectSentences,
boolean caseSensitive)
|
void |
tokenize(java.lang.String document)
|
java.util.Set |
tokenize(java.lang.String document,
java.util.Map tokenMap)
|
java.util.Set |
tokenize(java.lang.String pDocument,
java.util.Map tokenMap,
boolean doDetectSentences)
|
java.util.Set |
tokenize(java.lang.String pDocument,
java.util.Map tokenMap,
boolean doDetectSentences,
boolean caseSensitive)
|
| Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
TOKEN_DELIMITER
public static final java.lang.String TOKEN_DELIMITER
- See Also:
- Constant Field Values
IndexedDocument
public IndexedDocument()
IndexedDocument
public IndexedDocument(java.lang.String document)
IndexedDocument
public IndexedDocument(java.lang.String document,
boolean detectSentences)
setDelimiter
public void setDelimiter(java.lang.String delimiter)
getDelimiter
public java.lang.String getDelimiter()
setDocumentText
public void setDocumentText(java.lang.String documentText)
getDocumentText
public java.lang.String getDocumentText()
setStopWords
public void setStopWords(java.util.Set stopWords)
isStopWord
public boolean isStopWord(java.lang.String word)
setFieldBoostMap
public void setFieldBoostMap(java.util.Map fieldBoostMap)
getFieldBoost
public double getFieldBoost(java.lang.String fieldName)
setFieldBoost
public void setFieldBoost(java.lang.String fieldName,
java.lang.String boostValue)
setFieldBoost
public void setFieldBoost(java.lang.String fieldName,
java.lang.Double fieldBoost)
setRemoveHTML
public void setRemoveHTML(boolean removeHTML)
tokenize
public void tokenize(java.lang.String document)
tokenize
public java.util.Set tokenize(java.lang.String document,
java.util.Map tokenMap)
tokenize
public java.util.Set tokenize(java.util.Map tokenMap,
boolean doDetectSentences)
tokenize
public java.util.Set tokenize(java.util.Map tokenMap,
boolean doDetectSentences,
boolean caseSensitive)
tokenize
public java.util.Set tokenize(java.lang.String pDocument,
java.util.Map tokenMap,
boolean doDetectSentences)
tokenize
public java.util.Set tokenize(java.lang.String pDocument,
java.util.Map tokenMap,
boolean doDetectSentences,
boolean caseSensitive)
getTokens
public java.util.Iterator getTokens()
getTokens
public java.util.Iterator getTokens(boolean caseSensitive)
addToken
public void addToken(java.lang.String token,
int wordPosition,
long charPosition)
getCount
public int getCount(java.lang.String token)
getWordCount
public int getWordCount()
getCount
public int getCount(java.lang.String token,
boolean caseSensitive)
getPositionSet
public java.util.Set getPositionSet(java.lang.String token)
getPositionSet
public java.util.Set getPositionSet(java.lang.String token,
boolean caseSensitive)
contains
public boolean contains(java.lang.String token)
contains
public boolean contains(java.lang.String token,
boolean caseSensitive)
minDistance
public int minDistance(java.lang.String termOne,
java.lang.String termTwo)
- Returns the minimum distance between the two terms in the document.
minDistance
public int minDistance(java.lang.String termOne,
java.lang.String termTwo,
boolean caseSensitive)
minDistance
public int minDistance(java.lang.String termOne,
java.lang.String termTwo,
boolean caseSensitive,
boolean ordered)
getSortedSentences
public java.util.List getSortedSentences(java.lang.String[] wordSet,
boolean caseSensitive)
- Returns a list of SentenceStats objects in order of decreasing word hit count.
getSentenceStats
public java.util.Map getSentenceStats(java.lang.String[] wordSet,
boolean caseSensitive)
- Returns a map from sentence number to SentenceStats object.
getCharacterPositionList
public java.util.ArrayList getCharacterPositionList(java.lang.String pToken,
boolean caseSensitive)
getExcerpts
public java.lang.String[] getExcerpts(IDocumentMatcher docMatcher,
int excerptWidth,
boolean caseSensitive)
getExcerpts
public java.lang.String[] getExcerpts(IDocumentMatcher docMatcher,
int excerptWidth,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)
getExcerpts
public java.lang.String[] getExcerpts(java.lang.String[] termList,
int excerptWidth,
boolean caseSensitive)
getExcerpts
public java.lang.String[] getExcerpts(java.lang.String[] termList,
int excerptWidth,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)
markupExcerpt
public static java.lang.String markupExcerpt(java.lang.String excerpt,
java.lang.String delimiter,
java.lang.String term,
boolean caseSensitive,
java.lang.String preMarkup,
java.lang.String postMarkup)