MainJavadocExample
HTMLScraperPageImportRenderer Examples

This demo shows how the Raritan HTMLScraper can be used to streamline multi-step processes. The example runs a chemical structure search against the NIH PubChem site, automatically performing the steps that a user would otherwise carry out by hand to find chemical structures similar to that of a drug identified by its trade name (such as 'Vioxx').

Drug Name:  


JSP Custom Tags used in this demo:
  <!-- formName refers to the SearchForm named "PubChemSearch" in the XML configuration. -->
  <!-- action presumably names the JSP that receives the submitted form (confirm).       -->
  <search:SearchForm
     formName="PubChemSearch" 
     action="HTMLScraperPageImportRenderer.jsp"
   />
<hr>

   <!-- A CustomIFrame tag passes the request parameters on to the StructSearch.jsp page.   -->
   <!-- This demo needs its own frame context because the final output of PubChem is a     -->
   <!-- wait / refresh cycle that is best left to the browser to negotiate.                -->

   <results:CustomIFrame action="StructSearch.jsp" />
StructSearch.jsp contains the PageImport tag, which names the PageImportRenderer to use:
   <!-- importRenderer matches the name of the PageImportRenderer SystemObject in the XML configuration. -->
   <results:PageImport importRenderer="PubChemStructureScrape" />

XML Configuration for this demo:

  <!-- Search form "PubChemSearch": a single text field labelled "Drug Name".     -->
  <!-- Its ID, "drug", is the name used for the query term elsewhere in this      -->
  <!-- configuration (request parameter check and Field definitions).             -->
  <SearchForm name="PubChemSearch">
    <Field ID="drug"
           name="Drug Name"
           type="text"
           width="50"/>
  </SearchForm>

  <!-- ======================================================================= -->
  <!--       PubChem Scraper PageImportRenderer                                -->
  <!-- ======================================================================= -->

  <!-- ======================================================================= -->
  <!--   DynamicPageImportRenderer is used to ensure that a valid query term   -->
  <!--   is present before launching the HTMLScraperPageImportRenderer.        -->
  <!--  ====================================================================== -->
  <SystemObject type="PageImportRenderer" name="PubChemStructureScrape"
          configurableClass="com.raritantechnologies.searchApp.taglibrary.DynamicPageImportRenderer" >

    <ProxyRenderer>
      <!-- Request parameter "drug" is checked with AnythingComparator before the -->
      <!-- PageContextRenderer below is used.                                     -->
      <RequestParams>
        <Param name="drug" >
          <Comparator class="com.raritantechnologies.utils.comparators.AnythingComparator" />
        </Param>
      </RequestParams>

      <PageContextRenderer
          class="com.raritantechnologies.HTML.HTMLScraperPageImportRenderer"
          scraperConfig="BASE_PATH/WEB-INF/conf/PubChem/PubChemNullScraper.xml" >

        <Fields>
          <!-- =========================================================== -->
          <!-- LookupStringFilter is used to Convert drug to SMILES String -->
          <!-- by executing a search against the PubChem site.             -->
          <!-- The xPath addresses the (initially empty) value of the      -->
          <!-- "simple_searchdata" param in the SearchProcess below.       -->
          <!-- =========================================================== -->
          <Field ID="drug"    xPath="/SearchProcess/Step/params/param[@formName='simple_searchdata']/@value" >
            <StringFilter class="com.raritantechnologies.utils.filter.LookupStringFilter"
                          searchSource="PubChemSmilesFinder"
                          queryField="drug"
                          outputField="Smiles" />
           </Field>
        </Fields>

        <SearchProcess>

          <!-- Step 1: request the PubChem search page (no parameters). -->
          <Step type="getURLSocket" URL="http://pubchem.ncbi.nlm.nih.gov/search/" >
            <params>
            </params>
          </Step>

          <!-- Step 2: multipart POST of the search form to PreQSrv.cgi. -->
          <Step type="postURLSocketMultipart" URL="http://pubchem.ncbi.nlm.nih.gov/search/PreQSrv.cgi">
            <params>
               <param formName="mode"               value="simplequery" />
               <param formName="check"              value="remote" />
               <param formName="execution"          value="remote" />
               <param formName="queue"              value="ssquery" />
               <param formName="simple_searchdata"  value="" />
               <param formName="simple_cid"         value=""  />
               <param formName="simple_sid"         value=""  />
               <param formName="file"               value="" 
                      type="file" filename="" contentType="application/octet-stream" />

               <param formName="simple_searchtype"  value="sim90" />
               <param formName="maxhits"            value="200" />

            </params>
          </Step>

        </SearchProcess>

        <!-- =============================================================== -->
        <!--  StringFilters are used to clean up the HTML output and to      -->
        <!--  convert relative HTML paths to absolute paths.                 -->
        <!-- =============================================================== -->
        <OutputStringFilter class="com.raritantechnologies.utils.filter.SequentialStringFilter" >

          <!-- A literal "<" is not allowed inside an XML attribute value, so it is -->
          <!-- written as the entity &lt; here; the parsed value is still "<html".  -->
          <StringFilter class="com.raritantechnologies.utils.filter.BasicStringFilter"
                        command="START_AT" indexOf="&lt;html" />

          <!-- Prefixes NbrQSrv.cgi references with the PubChem search base URL. -->
          <StringFilter class="com.raritantechnologies.utils.filter.RegExprStringFilter"
                        inPattern="NbrQSrv.cgi(.*)" outPattern="http://pubchem.ncbi.nlm.nih.gov/search/NbrQSrv.cgi$1" />

         </OutputStringFilter>

      </PageContextRenderer>
    </ProxyRenderer>
  </SystemObject>


  <!-- ======================================================================= -->
  <!--                           PubChem SMILES Finder                         -->
  <!-- ======================================================================= -->
  <SourceType name="PubChemSmilesFinder" type="HTMLSearchSource" displayName="PubChem Smiles Lookup"
              sourceFactoryClass="com.raritantechnologies.federated.html.HTMLSearchSourceFactory" 
              queryProcessor="com.raritantechnologies.federated.html.HTMLQueryProcessor" >

    <!-- The "drug" query field is mapped, via its xPath, onto the value of the -->
    <!-- "term" param in the SearchProcess step below.                          -->
    <Fields>
      <Field ID="drug"    xPath="/SearchProcess/Step/params/param[@formName='term']/@value" />
    </Fields>

    <SecurityModel>
       <search>Public</search>
    </SecurityModel>

    <SearchProcess>

      <!-- Single POST to the NCBI dispatch CGI; "term" is filled in from the "drug" field. -->
      <Step type="postURLSocket" URL="http://www.ncbi.nlm.nih.gov/coreutils/dispatch.cgi">
        <params>
          <param formName="db" value="26" />

          <param formName="term" value="" />
          <param formName="SITE" value="NcbiHome" />
        </params>
      </Step>

    </SearchProcess>

    <ScraperConfigFile>PubChemStructureScraper.xml</ScraperConfigFile>
    <OutputTransformer>PubChemOutputTransform.xsl</OutputTransformer>

    <!-- Post filtering of data:
         1) URLDecode SMILES string
         2) Extract &cid= attribute into separate result field
         3) Remove &cid= from SMILES value

         In the attribute values below the ampersand is written as the entity
         &amp; because a bare ampersand is not well-formed inside an XML attribute.
         After parsing, the patterns read  .*\&cid=(.*)  and  (.*)\&cid=(.*)
      -->
    <FieldFormatters>

      <Formatter formatterClass="com.raritantechnologies.searchApp.formatters.StringFilterFormatter"
                   fieldID="Smiles" >
         <StringFilter class="com.raritantechnologies.utils.filter.URLDecoderStringFilter" />
      </Formatter>

      <Formatter  formatterClass="com.raritantechnologies.searchApp.formatters.PatternExtractor" 
                  inputFields="Smiles" 
                  outputField="CID"
                  inPattern=".*\&amp;cid=(.*)" outPattern="$1" />

      <Formatter  formatterClass="com.raritantechnologies.searchApp.formatters.RegExprFieldFormatter" 
                  fieldID="Smiles" 
                  inPattern="(.*)\&amp;cid=(.*)" outPattern="$1" />

    </FieldFormatters>

  </SourceType>

HTML Scraper configuration:
<!--   PubChemStructureScraper -->
<HTMLScraperConfig>

  <!-- Transitions are keyed on HTML events, starting in state "_None_".      -->
  <!-- The result type declared here ("Record") is the one formatted by the  -->
  <!-- ProcessedResult element in OutputFormat.                               -->
  <StateTransitions begState="_None_"
                    processedResultType="Record">

    <!-- An input tag whose name attribute is "uid": store its value attribute -->
    <!-- as CIDValue and move to FoundResult.                                  -->
    <!-- NOTE(review): CIDValue is not referenced by OutputFormat in this      -->
    <!-- file; confirm whether it is consumed elsewhere.                       -->
    <HTMLEvent eventType="SimpleTag"
               tagType="input"
               fromState="_None_"
               toState="FoundResult">
      <InputAttribute attrName="name" attrVal="uid"/>
      <OutputAttribute attrName="value" outputTo="CIDValue"/>
    </HTMLEvent>

    <!-- Start of an anchor while in FoundResult: store its href as LinkData. -->
    <HTMLEvent eventType="StartTag"
               tagType="a"
               fromState="FoundResult"
               toState="FoundLink">
      <OutputAttribute attrName="href" outputTo="LinkData"/>
    </HTMLEvent>

    <!-- End of that anchor: move to DONE; processResults is set on this transition. -->
    <HTMLEvent eventType="EndTag"
               tagType="a"
               fromState="FoundLink"
               toState="DONE"
               processResults="true"/>

  </StateTransitions>

  <!-- Output is rooted at "Records"; each "Record" carries the captured link   -->
  <!-- in a "Link" element plus a child "Records" element whose data comes from -->
  <!-- the LinkData href, using the scraper definition named by XMLDef.         -->
  <OutputFormat>
    <RootTag tagName="Records"/>
    <ProcessedResult processedResultType="Record"
                     tagName="Record">
      <ProcessedElement tagName="Link" charDataFrom="LinkData"/>
      <ChildElement tagName="Records"
                    elementDataFromHref="LinkData"
                    XMLDef="BASE_PATH/WEB-INF/conf/PubChem/PubChemStructureScraper_2.xml"/>
    </ProcessedResult>
  </OutputFormat>

</HTMLScraperConfig>