com.raritantechnologies.federated.html
Class HTMLSearchSource

java.lang.Object
  extended by com.raritantechnologies.searchApp.SearchSource
      extended by com.raritantechnologies.federated.html.HTMLSearchSource

public class HTMLSearchSource
extends SearchSource

Describes an HTTP SearchSource that can search web site search engines. Manages the login, search, and result paging processes. Delegates search and retrieval activities to an HTMLScraper object. Manages the dynamic mapping of input fields into a "Login Process" and a "Search Process" using login and search templates.

XML Configuration Template:
  <SourceType name="[unique search source name]" 
                 type="HTMLSearchSource" 
                 displayName="[ displayable name]"
                 sourceFactoryClass="com.raritantechnologies.federated.html.HTMLSearchSourceFactory" 
                 queryProcessor="com.raritantechnologies.federated.html.HTMLQueryProcessor"
                 resultIsXML="true|false(default)"
                 cacheCookieKey="[ key to cache cookies from site ]"
                 cookieResultField="[ result field to put cookie String in ]"
                 IDField="[ field to use as unique ID ]"
                 URLField="[ field with fullText URL ]"
                 titleField="[ field with document Title ]" >

  <!-- Describes mapping of query input parameters to the HTML SearchProcess and of the      -->
  <!-- abstract or normalized field names to the field name at the HTML source.              -->
  <Fields>

    <!-- The value of the ID field in the query will be inserted into the SearchProcess      -->
    <!-- element at the xPath location specified in the xPath parameter. The sourceName      -->
    <!-- field defines the name of the field at the HTML source.                             -->
    <Field ID="[ abstract field name ]"
              xPath="[ xPath within the SearchProcess: e.g. '/SearchProcess/Step/params/param[@formName='q']/@value' ]"
              sourceName="[ field name at source (e.g. 'q')]"/>

    <!-- Describes complex formatting needed for a field value.       -->
    <!-- FormatField uses com.raritantechnologies.utils.format classes -->

    <FormatField value="{(KY)_KY_}"
             xPath="/SearchProcess/Step/params/param[@formName='term']/@value" />

    <!-- FilterField uses com.raritantechnologies.utils.filter classes -->

    <FilterField class="com.raritantechnologies.utils.filter.IStringFilter"
             xPath="/SearchProcess/Step/params/param[@formName='term']/@value" />

    <Field ID="[ boolean ID ]" xPath="[ path to search form value ]" >
      <!-- FieldLookup allows user selected value to be looked up for use in FormatField -->
      <!-- used by Block: {(TI)(BOOL(TI TTL TI_OP ())) _TTL_[Title]} 
           see com.raritantechnologies.utils.format.BlockFormatter javadocs for details -->
      <FieldMap ID="TIBOOL" name="TI_OP">   
         <Choice abstractVal="AND" sourceVal="AND" />
         <Choice abstractVal="OR"  sourceVal="OR" />
      </FieldMap>
    </Field>

  </Fields>
 
  <SecurityModel>
     <search>[ Public | Restricted ]</search>
  </SecurityModel>

  <!-- Restricted sites require a LoginProcess which defines how login parameters are to be handled -->
  <LoginMap>
      <UserName xPath="[ xPath to userName in LoginProcess ]" />
      <Password xPath="[ xPath to password in LoginProcess ]" />
  </LoginMap>

  <LoginProcess>
    <Step type="[getURL|getURLSocket|postURL|postURLSocket]" URL="[ login form URL ]" >
      <params>
        <param formName="[ name of parameter in html form ]" value="[ form value ]" alwaysOutput="[true|false(default) - use for blank values ]" />
      </params>
    </Step>
  </LoginProcess>

  <!-- For NTLM Authentication - use this format: -->
  <LoginProcess UserName="[user name]" 
                   Password="[password]" 
                   PasswordEnc="[DES encrypted DB password]" />

  <!-- The SearchProcess describes the search form that will be sent to the search site: -->
  <!-- It can consist of one or more "Steps" depending on the site -->
    <SearchProcess 
          outputStep="[ step number (from 1) that generates output - if no value: the last step will be used ]" >
    <Step type="[getURL|getURLSocket|postURL|postURLSocket]" URL="[the URL that the form should be sent to]" >
      <params>
        <param formName="[ name of parameter in html form ]" value="[ form value ]" alwaysOutput="[true|false(default) - use for blank values ]" />
        <param formName="[ name of form parameter ]" value="" alwaysOutput="[true|false(default) - use for blank values ]" />

        <!-- etc. . . -->

      </params>
    </Step>
  </SearchProcess>

  <!-- The PageProcess describes how paging commands (get or post) will be sent to the search site: -->
  <PageProcess mapFrom="[ xPath within result to get paging data ]" method="[tagMap|  ]" >
    <TotalDocs mapFrom="[ xPath within result to get total docs e.g. '/Records/Page/TotalDocs' ]" />
    <Step type="[getURL|getURLSocket|postURL|postURLSocket]" URL="[the URL that the form should be sent to]" >
      <params>
        <param formName="[ name of parameter in html form ]" value="[ form value ]" alwaysOutput="[true|false(default) - use for blank values ]" />

        <!-- computed parameter -->
        <param formName="[ name of parameter in html form ]" value="" computeFrom="[ compute formula with PAGE_NUM as placeholder for page Number with 1 representing the first page ]" />

        <param formName="[ name of parameter ]" computeFrom="[ some string with pattern {COMPUTE_FROM:[ formula ]} ]" />

      </params>
    </Step>
  </PageProcess>

  <ScraperConfigFile>[ path to the HTMLScraper configuration File ]</ScraperConfigFile>
  <OutputTransformer>[ path to the XSL file that translates the raw XML to result XML ]</OutputTransformer>

  <!-- Optional FieldFormatters section -->
  <FieldFormatters>

    <Formatter formatterClass="[ class of com.raritantechnologies.searchApp.IFieldFormatter ]" >

    </Formatter>

    <!-- etc. . . -->

  </FieldFormatters>

  <ResultSetAttributes>
    <Attribute name="[ attribute name ]" xPath="[ xPath of value within HTMLScraper XML output ]" />
    <Attribute name="[ another ]" xPath="[ its xPath ]" />
    <!-- etc... -->
  </ResultSetAttributes>

  </SourceType>
 

Developed by Raritan Technologies Inc.

Author:
Ted Sullivan

Field Summary
 
Fields inherited from class com.raritantechnologies.searchApp.SearchSource
ID_FIELD, IS_FEDERATED, NUMBER_OF_FIELDS, SECURE, SOURCE_NAME, SOURCE_TYPE, TITLE_FIELD, URL_FIELD
 
Constructor Summary
HTMLSearchSource()
           
 
Method Summary
 void addFilterField(HTMLFilterField filterField)
           
 void addFormatField(HTMLFormatField formatField)
           
 java.lang.String getCacheCookieKey()
           
 java.lang.String getCookieResultField()
           
 HTMLScraper getHTMLScraper(ILoginInfo sourceLogin, OrderedMap cookies)
          Initializes an HTMLScraper using the loginInfo and a map to store cookies.
 org.w3c.dom.Document getHTMLScraperProcess()
           
 boolean getIsXOD()
           
 HTMLScraper getPageHTMLScraper(OrderedMap cookies)
           
 org.w3c.dom.Document getPageProcess(java.util.Map pageProcessData, int pageNum, OrderedMap queryParams)
          returns the SearchProcess used to get subsequent pages of data.
 org.w3c.dom.Document getPageProcessTemplate()
           
 java.lang.String getProxyHost()
           
 java.lang.String getProxyPassword()
           
 java.lang.String getProxyPort()
           
 java.lang.String getProxyUserName()
           
 IQueryProcessor getQueryProcessor()
          returns the type of QueryProcessor that can access this SearchSource.
 java.lang.String getScrapedRecordTagName()
           
 java.lang.String getScrapedRootTagName()
           
 org.w3c.dom.Document getSearchProcess(java.util.Map queryParams, ISearchFieldMap searchFieldMap, java.lang.Integer pageSize, java.lang.Integer startRec)
          returns Scraper process needed to direct the HTMLScraper.
 org.w3c.dom.Document getSearchProcess(java.util.Map sessionData, java.util.Map queryParams, ISearchFieldMap searchFieldMap, java.lang.Integer pageSize, java.lang.Integer startRec)
          returns Scraper process needed to direct the HTMLScraper.
 void setCacheCookieKey(java.lang.String cacheCookieKey)
           
 void setCookieResultField(java.lang.String cookieResultField)
           
 void setHTMLScraperProcess(org.w3c.dom.Document htmlScraperConfig)
          Sets the scraper configuration.
 void setInitCookies(java.lang.String initCookies)
           
 void setIsXOD(boolean isXOD)
          Determines if result is XML (XOD paradigm: XML on Demand).
 void setLoginProcessMapping(java.lang.String unamePath, java.lang.String pwordPath)
          set xPath map of "UserName" and "Password"
 void setLoginProcessTemplate(org.w3c.dom.Document loginProcessTemplate)
          Sets the LoginProcess Template.
 void setPageProcessTemplate(org.w3c.dom.Document pageProcessTemplate)
           
 void setPageSizeMapping(java.lang.String pageSizeXPath)
          Sets the XPath within the SearchProcess for the PageSize variable. Set by HTMLSearchSourceFactory.
 void setProxyHost(java.lang.String proxyHost)
           
 void setProxyPassword(java.lang.String proxyPassword)
           
 void setProxyPort(java.lang.String proxyPort)
           
 void setProxyUserName(java.lang.String proxyUserName)
           
 void setResultSetAttributeMap(java.util.Map resultSetAttributeMap)
           
 void setResultSetAttributes(org.w3c.dom.Document resultDoc, IResultSet resultSet)
           
 void setSearchProcessTemplate(org.w3c.dom.Document searchProcessTemplate)
          Sets the SearchProcess template.
 void setSourcePassword(java.lang.String sourcePassword)
           
 void setSourceUserName(java.lang.String sourceUserName)
           
 void setStartRecMapping(java.lang.String startRecXPath)
          Sets the XPath within the SearchProcess for the StartRec variable. Set by HTMLSearchSourceFactory.
 boolean shouldNotInitCookies()
           
 
Methods inherited from class com.raritantechnologies.searchApp.SearchSource
addFieldFormatter, addSearchField, addSourceAttributes, addStaticField, addStaticFields, addUserSearchField, canAccess, canConsolidateSources, consolidateSources, delSearchField, formatResult, formatResult, formatResultField, formatResultField, formatResults, formatResults, getAccessableFields, getAttributes, getBasePath, getConfigPath, getDisplayName, getFieldFormatters, getIDField, getName, getProperty, getSearchField, getSearchFieldID, getSearchFields, getSearchFields, getSecondarySortSpec, getSecurityManager, getSecurityManagerName, getSecurityModel, getSortSpec, getSourceCredentials, getSourcePropertyNames, getTitleField, getType, getURLField, getUserSearchFields, isMultiThreaded, setConfigPath, setDisplayName, setIDField, setMultiThreaded, setName, setSecurityManagerName, setSecurityModel, setSortMap, setSourcePropertyInfoSource, setTitleField, setType, setURLField, shouldFormatResults, supportsBackgroundSort
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

HTMLSearchSource

public HTMLSearchSource()
Method Detail

getQueryProcessor

public IQueryProcessor getQueryProcessor()
Description copied from class: SearchSource
returns the type of QueryProcessor that can access this SearchSource.

Specified by:
getQueryProcessor in class SearchSource

getHTMLScraper

public HTMLScraper getHTMLScraper(ILoginInfo sourceLogin,
                                  OrderedMap cookies)
Initializes an HTMLScraper using the loginInfo and a map to store cookies.


getPageHTMLScraper

public HTMLScraper getPageHTMLScraper(OrderedMap cookies)

getSearchProcess

public org.w3c.dom.Document getSearchProcess(java.util.Map queryParams,
                                             ISearchFieldMap searchFieldMap,
                                             java.lang.Integer pageSize,
                                             java.lang.Integer startRec)
returns Scraper process needed to direct the HTMLScraper. This method needs to modify the Document on the Fly.


getSearchProcess

public org.w3c.dom.Document getSearchProcess(java.util.Map sessionData,
                                             java.util.Map queryParams,
                                             ISearchFieldMap searchFieldMap,
                                             java.lang.Integer pageSize,
                                             java.lang.Integer startRec)
returns Scraper process needed to direct the HTMLScraper. This method needs to modify the Document on the Fly.


setHTMLScraperProcess

public void setHTMLScraperProcess(org.w3c.dom.Document htmlScraperConfig)
Sets the scraper configuration. This is done by the HTMLSearchSourceFactory


getHTMLScraperProcess

public org.w3c.dom.Document getHTMLScraperProcess()

getScrapedRecordTagName

public java.lang.String getScrapedRecordTagName()

getScrapedRootTagName

public java.lang.String getScrapedRootTagName()

setInitCookies

public void setInitCookies(java.lang.String initCookies)

shouldNotInitCookies

public boolean shouldNotInitCookies()

setSearchProcessTemplate

public void setSearchProcessTemplate(org.w3c.dom.Document searchProcessTemplate)
Sets the SearchProcess template. This is done by the HTMLSearchSourceFactory. This template will be modified by getSearchProcess.


setPageProcessTemplate

public void setPageProcessTemplate(org.w3c.dom.Document pageProcessTemplate)

getPageProcessTemplate

public org.w3c.dom.Document getPageProcessTemplate()

getPageProcess

public org.w3c.dom.Document getPageProcess(java.util.Map pageProcessData,
                                           int pageNum,
                                           OrderedMap queryParams)
returns the SearchProcess used to get subsequent pages of data.


setLoginProcessTemplate

public void setLoginProcessTemplate(org.w3c.dom.Document loginProcessTemplate)
Sets the LoginProcess Template. This is done by the HTMLSearchSourceFactory. This template will be dynamically modified using information from the SourceLoginInfo object passed to HTMLQueryProcessor.


setLoginProcessMapping

public void setLoginProcessMapping(java.lang.String unamePath,
                                   java.lang.String pwordPath)
set xPath map of "UserName" and "Password"


setPageSizeMapping

public void setPageSizeMapping(java.lang.String pageSizeXPath)
Sets the XPath within the SearchProcess for the PageSize variable. Set by HTMLSearchSourceFactory.


setStartRecMapping

public void setStartRecMapping(java.lang.String startRecXPath)
Sets the XPath within the SearchProcess for the StartRec variable. Set by HTMLSearchSourceFactory.


addFormatField

public void addFormatField(HTMLFormatField formatField)

addFilterField

public void addFilterField(HTMLFilterField filterField)

setResultSetAttributeMap

public void setResultSetAttributeMap(java.util.Map resultSetAttributeMap)

setResultSetAttributes

public void setResultSetAttributes(org.w3c.dom.Document resultDoc,
                                   IResultSet resultSet)

setIsXOD

public void setIsXOD(boolean isXOD)
Determines if result is XML (XOD paradigm: XML on Demand).


getIsXOD

public boolean getIsXOD()

setProxyHost

public void setProxyHost(java.lang.String proxyHost)

getProxyHost

public java.lang.String getProxyHost()

setProxyPort

public void setProxyPort(java.lang.String proxyPort)

getProxyPort

public java.lang.String getProxyPort()

setProxyUserName

public void setProxyUserName(java.lang.String proxyUserName)

getProxyUserName

public java.lang.String getProxyUserName()

setProxyPassword

public void setProxyPassword(java.lang.String proxyPassword)

getProxyPassword

public java.lang.String getProxyPassword()

setCacheCookieKey

public void setCacheCookieKey(java.lang.String cacheCookieKey)

getCacheCookieKey

public java.lang.String getCacheCookieKey()

setCookieResultField

public void setCookieResultField(java.lang.String cookieResultField)

getCookieResultField

public java.lang.String getCookieResultField()

setSourceUserName

public void setSourceUserName(java.lang.String sourceUserName)

setSourcePassword

public void setSourcePassword(java.lang.String sourcePassword)