MainJavadocExample
HTMLSearchSource Demo

The HTMLSearchSource is used to search sites on the World Wide Web. This example shows a search against the CBSNews website.

Keywords:  


JSP components:

  <%-- Search form tag. The formName and categoryName values match the name and
       category attributes of the SearchForm element shown under "Search Form"
       in the XML configuration; action names the JSP page given here,
       HTMLSearchSource.jsp. --%>
  <search:SearchForm
     formName="HTMLSearchForm" 
     categoryName="CBSNews"
     action="HTMLSearchSource.jsp"
   />
<hr>
  <%-- Results display tag. The formName value matches the name attribute of the
       DisplayForm element shown under "Display Form" in the XML configuration. --%>
  <results:DisplayForm formName="HTMLSourceDisplay" />


Demo configuration components:

An HTMLSearchSource is configured by three XML components: the <SourceType> section of the framework application's main configuration XML; a scraper configuration XML file that directs the HTML -> XML "scraping" process, which converts the HTML returned from the web search site into XML that can be converted into a Raritan ResultSet; and an XSL transform that converts the result XML to a standard format. Details on the format and procedures needed to write an HTML Scraper XML are documented in the HTML Scraping Guide.

XML Configuration for this example:
Search Form:

  <!-- ====================================================================== -->
  <!--  SearchForm for HTMLSearchSource demo.                                 -->
  <!--  Declares a single text field (ID "KY", width 50) named "Keywords".    -->
  <!--  The name and category values match the formName and categoryName      -->
  <!--  attributes of the search:SearchForm JSP tag.                          -->
  <!--  NOTE(review): the CBSNews SourceType declares its field with ID       -->
  <!--  "key", not "KY"; confirm how the framework maps one to the other.     -->
  <!-- ====================================================================== -->
  <SearchForm name="HTMLSearchForm" category="CBSNews" >
    <Field ID="KY" type="text" width="50" name="Keywords" />
  </SearchForm>

Display Form:

  <!-- ======================================================================= -->
  <!--  DisplayForm for HTMLSearchSource:  uses XSLResultRenderer              -->
  <!--  The form itself is rendered by TableDisplayFormRenderer; each result   -->
  <!--  from the CBSNews source is rendered through the XSL file named in the  -->
  <!--  Source element below.                                                  -->
  <!--  NOTE(review): bgcolor1 / bgcolor2 are presumably alternating row       -->
  <!--  colours; confirm against TableDisplayFormRenderer.                     -->
  <!-- ======================================================================= -->
  <DisplayForm name="HTMLSourceDisplay"
               rendererClass="com.raritantechnologies.quickstart.taglibrary.TableDisplayFormRenderer"
               bgcolor1="#ffffff" bgcolor2="#EDF3FE" >

     <!-- defaultString is a no-break space written as a numeric character     -->
     <!-- reference. The named entity &nbsp; is an HTML entity that XML does   -->
     <!-- not predefine, so using it here without a DTD declaring it would     -->
     <!-- make this configuration fail to parse.                               -->
     <ResultRenderer rendererClass="com.raritantechnologies.quickstart.taglibrary.XSLResultRenderer"
                     align="left" valign="top" defaultString="&#160;" width="550">
       <!-- BASE_PATH is kept exactly as written in the example.               -->
       <!-- NOTE(review): presumably a placeholder for the web application     -->
       <!-- root; confirm how it is resolved.                                  -->
       <Source name="CBSNews"      xslFile="BASE_PATH/WEB-INF/conf/CBSNews/CBSNewsDisplayTransform.xsl" />
     </ResultRenderer>

  </DisplayForm>

Source configuration:

  <!-- ======================================================================== -->
  <!--    HTMLSearchSource: CBSNews.com                                         -->
  <!--    Declares the "CBSNews" source: its factory and query processor        -->
  <!--    classes, its searchable field, its security setting, the HTTP         -->
  <!--    request used to run a search, and the files used to scrape and        -->
  <!--    transform the returned HTML.                                          -->
  <!-- ======================================================================== -->
  <SourceType name="CBSNews" type="HTMLSearchSource" displayName="CBSNews.com" 
                 sourceFactoryClass="com.raritantechnologies.federated.html.HTMLSearchSourceFactory" 
                 queryProcessor="com.raritantechnologies.federated.html.HTMLQueryProcessor">
    <!-- The xPath selects the value attribute of the "searchString" param in   -->
    <!-- the SearchProcess below.                                               -->
    <!-- NOTE(review): presumably the query processor writes the user's search  -->
    <!-- terms into that attribute before the request is sent; confirm.         -->
    <Fields>
      <Field ID="key" xPath="/SearchProcess/Step/params/param[@formName='searchString']/@value"/>
    </Fields>

    <!-- NOTE(review): "Public" presumably means searching needs no             -->
    <!-- authentication; confirm against the framework's security model.        -->
    <SecurityModel>
      <search>Public</search>
    </SecurityModel>

    <!-- A single getURL step against the CBSNews search page. Every param      -->
    <!-- except searchString carries a fixed value; searchString is empty here  -->
    <!-- and is the attribute addressed by the Field xPath above.               -->
    <SearchProcess>
      <Step type="getURL" URL="http://www.cbsnews.com/htdocs/search/search.php">
        <params>
          <param formName="num"     value="10" />
          <param formName="offset"  value="0" />
          <param formName="source"  value="cbsnews" />
          <param formName="section" value="" />
          <param formName="type"    value="all" />
          <param formName="sp-s"    value="1" />
          <param formName="searchString" value="" />
          <param formName="x"       value="22" />
          <param formName="y"       value="8" />
        </params>
      </Step>
    </SearchProcess>

    <!-- File names of the HTML scraper configuration and the XSL output        -->
    <!-- transform for this source.                                             -->
    <!-- NOTE(review): no directory is given; confirm what location these       -->
    <!-- names are resolved against.                                            -->
    <ScraperConfigFile>CBSNewsScraperConfig.xml</ScraperConfigFile>
    <OutputTransformer>CBSNewsTransform.xsl</OutputTransformer>
  </SourceType>


XML Scraper Configuration:

  <HTMLScraperConfig>

   <!-- ============================================================================= -->
   <!--  Each StateTransitions tag describes a sequence of HTML events that the       -->
   <!--  scraper should follow. When the parser encounters an HTML event that matches -->
   <!--  one of the events described within an HTMLEvent element, either a state      -->
   <!--  change is triggered, data is extracted from the HTML event, or both.         -->
   <!--  Extracted data is put into named temporary variables that are then           -->
   <!--  transferred to the format described by the ProcessedResult element below.    -->
   <!-- ============================================================================= -->
   <StateTransitions processedResultType="Record" begState="_None_">

     <!-- _None_ to FoundRecord: an opening "a" tag whose href matches the story URL  -->
     <!-- prefix below; the href is stored in LinkData.                               -->
     <!-- NOTE(review): the attribute is spelled "startWith"; confirm that this is    -->
     <!-- the name the scraper expects.                                               -->
     <HTMLEvent eventType="StartTag" tagType="a" fromState="_None_" toState="FoundRecord" >
       <InputAttribute attrName="href" startWith="http://www.cbsnews.com/stories" />
       <OutputAttribute attrName="href" outputTo="LinkData" />
     </HTMLEvent>

     <!-- While in FoundRecord: text is written to TitleData with append="true".      -->
     <HTMLEvent eventType="HandleText" fromState="FoundRecord" toState="FoundRecord" 
                outputTo="TitleData" append="true" />

     <!-- FoundRecord to RecordDone: the closing "a" tag.                             -->
     <HTMLEvent eventType="EndTag" tagType="a" fromState="FoundRecord" toState="RecordDone"  />

     <!-- RecordDone to FoundBody: a span tag with class="body".                      -->
     <HTMLEvent eventType="SimpleTag" tagType="span" fromState="RecordDone" toState="FoundBody" >
       <InputAttribute attrName="class" attrVal="body" />
     </HTMLEvent>

     <!-- While in FoundBody: text is written to SummaryData.                         -->
     <!-- NOTE(review): append is a single space here but "true" for TitleData;       -->
     <!-- presumably the value doubles as a separator. Confirm in the Scraping Guide. -->
     <HTMLEvent eventType="HandleText" fromState="FoundBody" toState="FoundBody" 
                outputTo="SummaryData" append=" " />

     <!-- FoundBody to _None_: a span event carrying endtag="true"; processResult     -->
     <!-- is set to "true" on this transition.                                        -->
     <!-- NOTE(review): presumably this emits the collected Record and returns to     -->
     <!-- the begin state for the next result; confirm.                               -->
     <HTMLEvent eventType="SimpleTag" tagType="span" 
                fromState="FoundBody" toState="_None_" processResult="true" >
       <InputAttribute attrName="endtag" attrVal="true" />
     </HTMLEvent>

   </StateTransitions>

   <!-- ============================================================================ -->
   <!-- The OutputFormat element describes how scraped data from a StateTransitions  -->
   <!-- element is to be formatted as XML output. Here the root tag is Records, and  -->
   <!-- each processed result of type "Record" becomes a Record element whose Title, -->
   <!-- Link and Brief children take their character data from the TitleData,        -->
   <!-- LinkData and SummaryData variables respectively.                             -->
   <!-- ============================================================================ -->
   <OutputFormat>
      <RootTag tagName="Records" />
      <ProcessedResult processedResultType="Record" tagName="Record" >
        <ProcessedElement tagName="Title"   charDataFrom="TitleData" />
        <ProcessedElement tagName="Link"    charDataFrom="LinkData" />
        <ProcessedElement tagName="Brief"   charDataFrom="SummaryData" />
      </ProcessedResult>
   </OutputFormat>

  </HTMLScraperConfig>