| Main | Javadoc | Example |
|
|
The HTMLSearchSource is used to search sites on the World Wide Web. This example shows a search against the CBSNews web site.
<search:SearchForm
    formName="HTMLSearchForm"
    categoryName="CBSNews"
    action="HTMLSearchSource.jsp" />
<hr>
<results:DisplayForm
    formName="HTMLSourceDisplay" />
An HTMLSearchSource consists of several XML components: the <SourceType> section of the framework application's main configuration XML; an XML file that directs the HTML -> XML "scraping" process, which converts the HTML returned from the web search site into XML that can be converted into a Raritan ResultSet; and an XSL style transform that converts the result XML to a standard format. Details on the format and procedures needed to write an HTML Scraper XML are documented in the HTML Scraping Guide.
XML Configuration for this example:
<!-- ====================================================================== -->
<!-- Search form used by the HTMLSearchSource demo page.                    -->
<!-- Declares one text field (width 50) named "Keywords" for the "CBSNews"  -->
<!-- category.                                                              -->
<!-- NOTE(review): this field's ID is "KY", while the CBSNews SourceType    -->
<!-- declares its field with ID "key"; confirm how the two are mapped.      -->
<!-- ====================================================================== -->
<SearchForm name="HTMLSearchForm" category="CBSNews">
  <Field ID="KY"
         type="text"
         width="50"
         name="Keywords" />
</SearchForm>
Display Form:
<!-- ======================================================================= -->
<!-- Display form for the HTMLSearchSource demo.                             -->
<!-- The form's rendererClass is TableDisplayFormRenderer; the nested        -->
<!-- ResultRenderer uses XSLResultRenderer together with the XSL file that   -->
<!-- is configured per source (here: CBSNews).                               -->
<!-- ======================================================================= -->
<DisplayForm name="HTMLSourceDisplay"
             rendererClass="com.raritantechnologies.quickstart.taglibrary.TableDisplayFormRenderer"
             bgcolor1="#ffffff"
             bgcolor2="#EDF3FE">
  <ResultRenderer rendererClass="com.raritantechnologies.quickstart.taglibrary.XSLResultRenderer"
                  align="left"
                  valign="top"
                  defaultString=" "
                  width="550">
    <Source name="CBSNews"
            xslFile="BASE_PATH/WEB-INF/conf/CBSNews/CBSNewsDisplayTransform.xsl" />
  </ResultRenderer>
</DisplayForm>
Source configuration:
<!-- ======================================================================== -->
<!-- HTMLSearchSource definition for CBSNews.com.                             -->
<!--   Fields:        the "key" field's xPath addresses the value attribute   -->
<!--                  of the searchString param inside the SearchProcess.     -->
<!--   SecurityModel: search is marked Public.                                -->
<!--   SearchProcess: a single getURL step against the CBSNews search page    -->
<!--                  with the listed form params. searchString is left       -->
<!--                  empty here; presumably it is filled in from the "key"   -->
<!--                  field at query time (confirm in HTMLQueryProcessor).    -->
<!--   ScraperConfigFile / OutputTransformer: file names of the HTML scraper  -->
<!--                  configuration and of the XSL transform for its output.  -->
<!-- ======================================================================== -->
<SourceType name="CBSNews"
            type="HTMLSearchSource"
            displayName="CBSNews.com"
            sourceFactoryClass="com.raritantechnologies.federated.html.HTMLSearchSourceFactory"
            queryProcessor="com.raritantechnologies.federated.html.HTMLQueryProcessor">
  <Fields>
    <Field ID="key"
           xPath="/SearchProcess/Step/params/param[@formName='searchString']/@value" />
  </Fields>
  <SecurityModel>
    <search>Public</search>
  </SecurityModel>
  <SearchProcess>
    <Step type="getURL"
          URL="http://www.cbsnews.com/htdocs/search/search.php">
      <params>
        <param formName="num"          value="10" />
        <param formName="offset"       value="0" />
        <param formName="source"       value="cbsnews" />
        <param formName="section"      value="" />
        <param formName="type"         value="all" />
        <param formName="sp-s"         value="1" />
        <param formName="searchString" value="" />
        <param formName="x"            value="22" />
        <param formName="y"            value="8" />
      </params>
    </Step>
  </SearchProcess>
  <ScraperConfigFile>CBSNewsScraperConfig.xml</ScraperConfigFile>
  <OutputTransformer>CBSNewsTransform.xsl</OutputTransformer>
</SourceType>
<HTMLScraperConfig>
  <!-- ============================================================================= -->
  <!-- Each StateTransitions tag describes a sequence of HTML events that the        -->
  <!-- scraper should follow. When the parser encounters an HTML event that matches  -->
  <!-- one of the events described by an HTMLEvent element, a state change is        -->
  <!-- triggered, data is extracted from the HTML event, or both. Extracted data is  -->
  <!-- put into named temporary variables that are then transferred to the format    -->
  <!-- described by the ProcessedResult element below.                               -->
  <!--                                                                               -->
  <!-- Transitions configured for the "Record" result type (initial state _None_):   -->
  <!--   _None_ to FoundRecord     : start of an <a> tag whose href starts with      -->
  <!--                               http://www.cbsnews.com/stories; the href is     -->
  <!--                               output to LinkData.                             -->
  <!--   FoundRecord (stays)       : text is output to TitleData (append="true").    -->
  <!--   FoundRecord to RecordDone : end of the <a> tag.                             -->
  <!--   RecordDone to FoundBody   : <span> tag whose class attribute is "body".     -->
  <!--   FoundBody (stays)         : text is output to SummaryData (append=" ").     -->
  <!--   FoundBody to _None_       : <span> event with endtag="true"; carries        -->
  <!--                               processResult="true".                           -->
  <!-- ============================================================================= -->
  <StateTransitions processedResultType="Record" begState="_None_">
    <HTMLEvent eventType="StartTag"
               tagType="a"
               fromState="_None_"
               toState="FoundRecord">
      <InputAttribute attrName="href" startWith="http://www.cbsnews.com/stories" />
      <OutputAttribute attrName="href" outputTo="LinkData" />
    </HTMLEvent>
    <HTMLEvent eventType="HandleText"
               fromState="FoundRecord"
               toState="FoundRecord"
               outputTo="TitleData"
               append="true" />
    <HTMLEvent eventType="EndTag"
               tagType="a"
               fromState="FoundRecord"
               toState="RecordDone" />
    <HTMLEvent eventType="SimpleTag"
               tagType="span"
               fromState="RecordDone"
               toState="FoundBody">
      <InputAttribute attrName="class" attrVal="body" />
    </HTMLEvent>
    <HTMLEvent eventType="HandleText"
               fromState="FoundBody"
               toState="FoundBody"
               outputTo="SummaryData"
               append=" " />
    <HTMLEvent eventType="SimpleTag"
               tagType="span"
               fromState="FoundBody"
               toState="_None_"
               processResult="true">
      <InputAttribute attrName="endtag" attrVal="true" />
    </HTMLEvent>
  </StateTransitions>
  <!-- ============================================================================ -->
  <!-- The OutputFormat element describes how the data scraped by a                 -->
  <!-- StateTransitions element is formatted as XML output. Here each "Record"      -->
  <!-- result becomes a Record element under the Records root tag, with Title,      -->
  <!-- Link and Brief children taking their character data from TitleData,          -->
  <!-- LinkData and SummaryData respectively.                                       -->
  <!-- ============================================================================ -->
  <OutputFormat>
    <RootTag tagName="Records" />
    <ProcessedResult processedResultType="Record" tagName="Record">
      <ProcessedElement tagName="Title" charDataFrom="TitleData" />
      <ProcessedElement tagName="Link"  charDataFrom="LinkData" />
      <ProcessedElement tagName="Brief" charDataFrom="SummaryData" />
    </ProcessedResult>
  </OutputFormat>
</HTMLScraperConfig>