|
HTMLScraperPageImportRenderer Examples
|
This demo shows how the Raritan HTMLScraper can be used to streamline multi-step processes. The example
executes a chemical structure search against the NIH PubChem site, automatically following the steps
that a user would otherwise have to perform manually to find chemical structures similar to that of a drug identified by its trade name (such as 'Vioxx').
JSP Custom Tags used in this demo:
<search:SearchForm formName="PubChemSearch"
                   action="HTMLScraperPageImportRenderer.jsp" />
<hr>
<!-- The CustomIFrame tag passes the request parameters on to the StructSearch.jsp page. -->
<!-- The demo runs in its own frame context because the final output from PubChem is a   -->
<!-- wait/refresh cycle, which is best left to the browser to negotiate.                 -->
<results:CustomIFrame action="StructSearch.jsp" />
StructSearch.jsp contains the PageImport tag, which references the PubChemStructureScrape PageImportRenderer:
<!-- importRenderer names the PageImportRenderer ("PubChemStructureScrape") defined in the XML configuration. -->
<results:PageImport importRenderer="PubChemStructureScrape" />
XML Configuration for this demo:
<!-- Search form "PubChemSearch": one text field, "drug", labelled "Drug Name". -->
<SearchForm name="PubChemSearch">
  <Field ID="drug"
         type="text"
         width="50"
         name="Drug Name" />
</SearchForm>
<!-- ======================================================================= -->
<!-- PubChem Scraper PageImportRenderer -->
<!-- ======================================================================= -->
<!-- ======================================================================= -->
<!-- DynamicPageImportRenderer is used to ensure that a valid query term -->
<!-- is present before launching the HTMLScraperPageImportRenderer. -->
<!-- ====================================================================== -->
<!-- PageImportRenderer "PubChemStructureScrape".                              -->
<!-- The outer DynamicPageImportRenderer checks the "drug" request parameter   -->
<!-- before the nested HTMLScraperPageImportRenderer is launched.              -->
<SystemObject type="PageImportRenderer" name="PubChemStructureScrape"
              configurableClass="com.raritantechnologies.searchApp.taglibrary.DynamicPageImportRenderer">
  <ProxyRenderer>
    <!-- Request parameter that is tested: "drug".                             -->
    <!-- NOTE(review): AnythingComparator presumably accepts any supplied      -->
    <!-- value; confirm against the comparator implementation.                 -->
    <RequestParams>
      <Param name="drug">
        <Comparator class="com.raritantechnologies.utils.comparators.AnythingComparator" />
      </Param>
    </RequestParams>
    <PageContextRenderer class="com.raritantechnologies.HTML.HTMLScraperPageImportRenderer"
                         scraperConfig="BASE_PATH/WEB-INF/conf/PubChem/PubChemNullScraper.xml">
      <Fields>
        <!-- =========================================================== -->
        <!-- LookupStringFilter is used to convert the drug name to a    -->
        <!-- SMILES string by executing a search against the PubChem     -->
        <!-- site (search source "PubChemSmilesFinder"). The xPath       -->
        <!-- targets the "simple_searchdata" param of the SearchProcess  -->
        <!-- below.                                                      -->
        <!-- =========================================================== -->
        <Field ID="drug"
               xPath="/SearchProcess/Step/params/param[@formName='simple_searchdata']/@value">
          <StringFilter class="com.raritantechnologies.utils.filter.LookupStringFilter"
                        searchSource="PubChemSmilesFinder"
                        queryField="drug"
                        outputField="Smiles" />
        </Field>
      </Fields>
      <SearchProcess>
        <!-- Step 1: GET of the PubChem search page, no parameters. -->
        <Step type="getURLSocket" URL="http://pubchem.ncbi.nlm.nih.gov/search/">
          <params>
          </params>
        </Step>
        <!-- Step 2: multipart POST of the structure query form. -->
        <Step type="postURLSocketMultipart" URL="http://pubchem.ncbi.nlm.nih.gov/search/PreQSrv.cgi">
          <params>
            <param formName="mode" value="simplequery" />
            <param formName="check" value="remote" />
            <param formName="execution" value="remote" />
            <param formName="queue" value="ssquery" />
            <param formName="simple_searchdata" value="" />
            <param formName="simple_cid" value="" />
            <param formName="simple_sid" value="" />
            <param formName="file" value=""
                   type="file" filename="" contentType="application/octet-stream" />
            <param formName="simple_searchtype" value="sim90" />
            <param formName="maxhits" value="200" />
          </params>
        </Step>
      </SearchProcess>
      <!-- =============================================================== -->
      <!-- StringFilters are used to clean up the HTML output and to       -->
      <!-- convert relative HTML paths to absolute paths.                  -->
      <!-- =============================================================== -->
      <OutputStringFilter class="com.raritantechnologies.utils.filter.SequentialStringFilter">
        <!-- The "<" must be written as &lt; inside an attribute value;    -->
        <!-- the parsed value is still the literal text <html.             -->
        <StringFilter class="com.raritantechnologies.utils.filter.BasicStringFilter"
                      command="START_AT" indexOf="&lt;html" />
        <StringFilter class="com.raritantechnologies.utils.filter.RegExprStringFilter"
                      inPattern="NbrQSrv.cgi(.*)"
                      outPattern="http://pubchem.ncbi.nlm.nih.gov/search/NbrQSrv.cgi$1" />
      </OutputStringFilter>
    </PageContextRenderer>
  </ProxyRenderer>
</SystemObject>
<!-- ======================================================================= -->
<!-- PubChem SMILES Finder -->
<!-- ======================================================================= -->
<!-- HTML search source "PubChemSmilesFinder": posts the drug name as the -->
<!-- "term" parameter and post-processes the scraped "Smiles" field.      -->
<SourceType name="PubChemSmilesFinder" type="HTMLSearchSource" displayName="PubChem Smiles Lookup"
            sourceFactoryClass="com.raritantechnologies.federated.html.HTMLSearchSourceFactory"
            queryProcessor="com.raritantechnologies.federated.html.HTMLQueryProcessor">
  <Fields>
    <!-- The "drug" query field is mapped onto the "term" param of the Step below. -->
    <Field ID="drug" xPath="/SearchProcess/Step/params/param[@formName='term']/@value" />
  </Fields>
  <SecurityModel>
    <search>Public</search>
  </SecurityModel>
  <SearchProcess>
    <Step type="postURLSocket" URL="http://www.ncbi.nlm.nih.gov/coreutils/dispatch.cgi">
      <params>
        <param formName="db" value="26" />
        <param formName="term" value="" />
        <param formName="SITE" value="NcbiHome" />
      </params>
    </Step>
  </SearchProcess>
  <ScraperConfigFile>PubChemStructureScraper.xml</ScraperConfigFile>
  <OutputTransformer>PubChemOutputTransform.xsl</OutputTransformer>
  <!-- Post filtering of data:
       1) URLDecode SMILES string
       2) Extract the cid= attribute into a separate result field (CID)
       3) Remove the cid= suffix from the SMILES value
       The ampersand in the patterns below is written as &amp; because a raw
       ampersand is not allowed in an XML attribute value; the parsed pattern
       still contains a single literal ampersand.
  -->
  <FieldFormatters>
    <Formatter formatterClass="com.raritantechnologies.searchApp.formatters.StringFilterFormatter"
               fieldID="Smiles">
      <StringFilter class="com.raritantechnologies.utils.filter.URLDecoderStringFilter" />
    </Formatter>
    <Formatter formatterClass="com.raritantechnologies.searchApp.formatters.PatternExtractor"
               inputFields="Smiles"
               outputField="CID"
               inPattern=".*\&amp;cid=(.*)" outPattern="$1" />
    <Formatter formatterClass="com.raritantechnologies.searchApp.formatters.RegExprFieldFormatter"
               fieldID="Smiles"
               inPattern="(.*)\&amp;cid=(.*)" outPattern="$1" />
  </FieldFormatters>
</SourceType>
HTML Scraper configuration:
<!-- PubChemStructureScraper -->
<HTMLScraperConfig>
  <!-- State transitions, starting in state "_None_"; each completed pass yields a "Record". -->
  <StateTransitions processedResultType="Record" begState="_None_">
    <!-- _None_ to FoundResult: an input tag whose name attribute is "uid"; -->
    <!-- its value attribute is output to CIDValue.                         -->
    <HTMLEvent eventType="SimpleTag" tagType="input"
               fromState="_None_" toState="FoundResult">
      <InputAttribute attrName="name" attrVal="uid"/>
      <OutputAttribute attrName="value" outputTo="CIDValue"/>
    </HTMLEvent>
    <!-- FoundResult to FoundLink: start of an anchor tag; its href is output to LinkData. -->
    <HTMLEvent eventType="StartTag" tagType="a"
               fromState="FoundResult" toState="FoundLink">
      <OutputAttribute attrName="href" outputTo="LinkData" />
    </HTMLEvent>
    <!-- FoundLink to DONE: end of the anchor tag; processResults is set to true here. -->
    <HTMLEvent eventType="EndTag" tagType="a"
               fromState="FoundLink" toState="DONE"
               processResults="true" />
  </StateTransitions>
  <!-- Output layout: a Records root holding one Record element per processed result. -->
  <OutputFormat>
    <RootTag tagName="Records" />
    <ProcessedResult processedResultType="Record" tagName="Record">
      <ProcessedElement tagName="Link" charDataFrom="LinkData" />
      <!-- NOTE(review): presumably the LinkData href is fetched and scraped -->
      <!-- with the second scraper definition named in XMLDef; confirm.     -->
      <ChildElement tagName="Records"
                    elementDataFromHref="LinkData"
                    XMLDef="BASE_PATH/WEB-INF/conf/PubChem/PubChemStructureScraper_2.xml" />
    </ProcessedResult>
  </OutputFormat>
</HTMLScraperConfig>