com.raritantechnologies.HTML
Class HTMLScraper

java.lang.Object
  extended by javax.swing.text.html.HTMLEditorKit.ParserCallback
      extended by com.raritantechnologies.HTML.HTMLScraper

public class HTMLScraper
extends javax.swing.text.html.HTMLEditorKit.ParserCallback

Converts an HTML file to an XML DOM object. Uses a configuration XML which defines how state transitions map to an XML tree. The resulting XML object can be further transformed, e.g. with XSL, or post-processed using a Java class that implements IHTMLScraperProcessor.

Annotated Scraper Configuration Example:
    <HTMLScraperConfig>

       <!-- One or more StateTransitions tags: Each tracks a particular  -->
       <!-- state transition "thread" through the HTML page -->
         <StateTransitions 
            processedResultType="[ root name of XML output tag - e.g. 'Record'; links to the processedResultType of a ProcessedResult tag ]"  
            begState="[ starting state name - e.g.'_None_']">

           <!-- One or more HTMLEvent tags. Each HTMLEvent tag describes an HTML event and how -->
           <!-- the HTMLScraper should react to it. Some events cause state transitions, others -->
           <!-- direct data to be stored in temporary variables, some do both. -->

           <!-- StartTag HTMLEvent type: if a 'table' tag is found and we are in state '_None_' -->
           <!-- then go to state 'FoundATable' -->
           <HTMLEvent eventType="StartTag" tagType="table" fromState="_None_" toState="FoundATable" />

           <!-- StartTag HTMLEvent that detects value of tag attributes  -->
           <!-- to make a state change decision: if a 'td' tag is found and we are in state 'FoundATable' -->
           <!-- then go to state 'FoundARow' -->
           <HTMLEvent eventType="StartTag" tagType="td" fromState="FoundATable" toState="FoundARow" >
               <!-- checks td attribute 'class' for value = 'data' -->
               <InputAttribute attrName="class" attrVal="data" />
           </HTMLEvent>

           <!-- StartTag HTMLEvent that detects a starting value of a tag attribute to make a -->
           <!-- state change decision and then extracts data from another attribute -->
           <HTMLEvent eventType="StartTag" tagType="a" fromState="FoundARow" toState="FoundALink" >
               <!-- checks href attribute if it starts with '/data/showData?' -->
               <InputAttribute attrName="href" startWith="/data/showData?" />

               <!-- If InputAttribute check succeeds, stores the 'href' attribute value in a -->
               <!-- temporary variable named 'LinkData'  -->
               <OutputAttribute attrName="href" outputTo="LinkData" />  
           </HTMLEvent>

           <!-- Text event that is captured into a temporary variable 'TitleData'. append="true" attribute  -->
           <!-- causes separate sequential text events to be appended - useful if there are other           -->
           <!-- markup tags such as 'b' or 'font' for example within the text block we are scraping that we -->
           <!-- want to ignore. -->
           <HTMLEvent eventType="HandleText" fromState="FoundALink" toState="FoundALink" 
                         outputTo="TitleData" append="true" />        

           <!-- EndTag event that detects the closing anchor tag. If the current state is 'FoundALink' -->
           <!-- then the state is changed to 'LinkDone' -->
           <HTMLEvent eventType="EndTag" tagType="a" fromState="FoundALink" toState="LinkDone" />

         
           <!-- EndTag that detects the end of the current row. If the current state is 'LinkDone' then -->
           <!-- the state is changed to 'FoundATable'. This is done so that the state-machine can -->
           <!-- respond correctly to the next row of data. -->
           <HTMLEvent eventType="EndTag" tagType="td" fromState="LinkDone" toState="FoundATable" />

           <!-- EndTag that detects the end of the table. This means that all of the rows have been scraped -->
           <!-- and that the accumulated data is ready for output to XML -->
           <!-- Note that we give it a toState='_NO_MORE_DATA_' that hasn't been used yet. This is so -->
           <!-- that any tables that we see after this one won't be scraped. The processResult="true" -->
           <!-- tells the HTMLScraper to take the temporary data arrays and export them to XML. (See  -->
           <!-- the OutputFormat tag below) -->
           <HTMLEvent eventType="EndTag" tagType="table" fromState="FoundATable" toState="_NO_MORE_DATA_" 
                         processResult="true" />

         </StateTransitions>

         <!-- Another set of state transitions - will be processed in parallel and will generate a separate -->
         <!-- section of the final XML output document -->
         <StateTransitions processedResultType="AnotherTag" begState="different_start_state_name">
             <!-- HTMLEvents ... -->
         </StateTransitions>
 
         <!-- The OutputFormat tags describe how the accumulated scraped data from the StateTransitions elements -->
         <!-- should be formatted as XML. -->

         <OutputFormat>
            <RootTag tagName="[ root tag name of output XML ]" />

            <!-- One or more ProcessedResult tags. Each ProcessedResult tag maps to one of the StateTransitions tags -->
            <ProcessedResult 
                     processedResultType="[the 'processedResultType' value of one of the StateTransitions tags]"
                     tagName="[ the tag name of the XML record for this ProcessedResult ]" >

               <!-- ProcessedElement creates a tag with data from a temporary variable scraped above. -->
               <!-- This one takes data stored in the temporary variable 'LinkData' defined above and -->
               <!-- puts it in a tag named 'URL' --> 
               <ProcessedElement tagName="URL"     charDataFrom="LinkData" />

               <!-- Take the data from the variable named 'TitleData' and put it in a tag named 'Title' -->
               <ProcessedElement tagName="Title"         charDataFrom="TitleData" />

               <!-- Child Scraper Element: does recursive scraping -->
               <ChildElement tagName="[ root tag name of inner data set ]"
                                elementDataFromHref="[ temporary variable to get href (URL) from ]"
                                XMLDef="[ XML Scraper configuration for inner page scrape ]"
                                replaceString="[ simple replace value: format from | to ]"
                                inPattern="[ regular expression input pattern ]"
                                outPattern="[ regular expression output pattern ]"
                                method="[ get(default)|post ]" >

                 <!-- Optional StringFilter to modify href value prior to getting child page -->
                 <StringFilter class="[ class of com.raritantechnologies.utils.filter.IStringFilter ]" >

                 </StringFilter>

                 <HREFPattern>[ optional href pattern with {ProcessedResultType/DataName} placeholders ]</HREFPattern>

                 <!-- Alternatively can have separate patterns for host and params -->
                 <HOSTPattern>[ pattern for host portion ]</HOSTPattern>
                 <PARAMSPattern>[ pattern for request params ]</PARAMSPattern>

               </ChildElement>

            </ProcessedResult>

            <!-- This ProcessedResult tag processes data from the second StateTransitions tag. -->
            <ProcessedResult processedResultType="AnotherTag" tagName="AuxilliaryData" >

            </ProcessedResult>

         </OutputFormat>

    </HTMLScraperConfig>
  
Test Properties: C:\dev\HTMLScraper\EdgarScrap.xml C:\dev\HTMLScraper\sample.html C:\dev\HTMLScraper\GoogleFormat.xml C:\dev\HTMLScraper\GoogleSearch_1.html C:\dev\HTMLScraper\ClinicalTrails.xml "http://www.clinicaltrials.gov/ct/screen/BrowseAny?path=%2Fbrowse%2Fby-condition%2Fhier&recruiting=true" C:\dev\HTMLScraper\ChildLevel3.xml c:\dev\HTMLScraper\ChildLevel4.html C:\dev\HTMLFilter\CTTop.xml "http://www.clinicaltrials.gov/ct/screen/BrowseAny?path=/browse/by-condition/hier&recruiting=false" c:\dev\HTMLFilter\MonsterStep2.xml c:\dev\HTMLFilter\Monster.html
Developed by Raritan Technologies Inc.

Author:
Chris Peterson, Ted Sullivan, Kepler Gelotte

Field Summary
 
Fields inherited from class javax.swing.text.html.HTMLEditorKit.ParserCallback
IMPLIED
 
Constructor Summary
HTMLScraper(org.w3c.dom.Document scraperConfig, org.w3c.dom.Document loginProcess, ILoginInfo siteLogin)
          Constructor used to pass separate scraper config and securityProcess.
HTMLScraper(org.w3c.dom.Document scraperConfig, org.w3c.dom.Document loginProcess, ILoginInfo siteLogin, ISecurityManager secMan)
           
HTMLScraper(org.w3c.dom.Document scraperConfig, org.w3c.dom.Document loginProcess, ILoginInfo siteLogin, ISecurityManager secMan, java.lang.String proxyHost, java.lang.String proxyPort, java.lang.String proxyUser, java.lang.String proxyPassword)
           
HTMLScraper(org.w3c.dom.Document scraperConfig, org.w3c.dom.Document loginProcess, ILoginInfo siteLogin, java.lang.String proxyHost, java.lang.String proxyPort, java.lang.String proxyUser, java.lang.String proxyPassword)
           
HTMLScraper(java.io.Reader xmlSource)
          Basic constructor - takes an xmlSource for the XML Scraper Configuration.
HTMLScraper(java.io.Reader xmlSource, ILoginInfo user, ISecurityManager secMan)
           
 
Method Summary
 OrderedMap getCookies()
           
 java.lang.String getLastHTMLPage()
           
 org.w3c.dom.Document getOutputDoc()
           
 void getPage(ILoginInfo user, org.w3c.dom.Document pageProcess)
           
 void getPage(ILoginInfo user, org.w3c.dom.Document pageProcess, boolean parseResults, boolean searchResultIsXML)
          Gets a page using a PageProcess configuration.
 int GetStateTransitions()
           
 XMLURLDriller getXMLURLDriller()
           
 void handleComment(char[] data, int pos)
           
 void handleEndTag(javax.swing.text.html.HTML.Tag tag, int pos)
           
 void handleError(java.lang.String errorMsg, int pos)
           
 void handleSimpleTag(javax.swing.text.html.HTML.Tag tag, javax.swing.text.MutableAttributeSet mutableAttrSet, int pos)
           
 void handleStartTag(javax.swing.text.html.HTML.Tag tag, javax.swing.text.MutableAttributeSet mutableAttrSet, int pos)
           
 void handleText(char[] data, int pos)
           
static void main(java.lang.String[] args)
           
 void outputXML(java.io.Writer writer)
           
 void outputXML(java.io.Writer writer, javax.xml.transform.Transformer transformer)
           
 void parse(java.io.Reader htmlReader)
           
 void parseSecure(java.lang.String urlString, java.lang.String theParams)
           
 void parseSecure(java.lang.String urlString, java.lang.String theParams, java.lang.String requestMethod)
           
 void PutToXML()
           
 void search(ILoginInfo user, org.w3c.dom.Document searchProcess)
          Executes a search using the searchProcess, then scrapes the resulting HTML page.
 void search(ILoginInfo user, org.w3c.dom.Document searchProcess, boolean parseResults, boolean searchResultIsXML)
           
 void setBasePath(java.lang.String basePath)
          Set the base file path for XML Scrapers.
 void setDEBUG(boolean debug)
           
 void setProxyHost(java.lang.String proxyHost, java.lang.String proxyPort, java.lang.String proxyUserName, java.lang.String proxyPassword)
           
 
Methods inherited from class javax.swing.text.html.HTMLEditorKit.ParserCallback
flush, handleEndOfLineString
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

HTMLScraper

public HTMLScraper(java.io.Reader xmlSource)
Basic constructor - takes an xmlSource for the XML Scraper Configuration.


HTMLScraper

public HTMLScraper(java.io.Reader xmlSource,
                   ILoginInfo user,
                   ISecurityManager secMan)

HTMLScraper

public HTMLScraper(org.w3c.dom.Document scraperConfig,
                   org.w3c.dom.Document loginProcess,
                   ILoginInfo siteLogin)
Constructor used to pass separate scraper config and securityProcess (the loginProcess argument). In this version, there is no ISecurityManager, so either the login is set up for the site specified in the securityProcess or no login is needed.


HTMLScraper

public HTMLScraper(org.w3c.dom.Document scraperConfig,
                   org.w3c.dom.Document loginProcess,
                   ILoginInfo siteLogin,
                   ISecurityManager secMan)

HTMLScraper

public HTMLScraper(org.w3c.dom.Document scraperConfig,
                   org.w3c.dom.Document loginProcess,
                   ILoginInfo siteLogin,
                   java.lang.String proxyHost,
                   java.lang.String proxyPort,
                   java.lang.String proxyUser,
                   java.lang.String proxyPassword)

HTMLScraper

public HTMLScraper(org.w3c.dom.Document scraperConfig,
                   org.w3c.dom.Document loginProcess,
                   ILoginInfo siteLogin,
                   ISecurityManager secMan,
                   java.lang.String proxyHost,
                   java.lang.String proxyPort,
                   java.lang.String proxyUser,
                   java.lang.String proxyPassword)
Method Detail

search

public void search(ILoginInfo user,
                   org.w3c.dom.Document searchProcess)
Executes a search using the searchProcess, then scrapes the resulting HTML page.


search

public void search(ILoginInfo user,
                   org.w3c.dom.Document searchProcess,
                   boolean parseResults,
                   boolean searchResultIsXML)

getPage

public void getPage(ILoginInfo user,
                    org.w3c.dom.Document pageProcess)

getPage

public void getPage(ILoginInfo user,
                    org.w3c.dom.Document pageProcess,
                    boolean parseResults,
                    boolean searchResultIsXML)
Gets a page using a PageProcess configuration.


parse

public void parse(java.io.Reader htmlReader)

PutToXML

public void PutToXML()

GetStateTransitions

public int GetStateTransitions()

handleError

public void handleError(java.lang.String errorMsg,
                        int pos)

handleSimpleTag

public void handleSimpleTag(javax.swing.text.html.HTML.Tag tag,
                            javax.swing.text.MutableAttributeSet mutableAttrSet,
                            int pos)

handleComment

public void handleComment(char[] data,
                          int pos)

handleStartTag

public void handleStartTag(javax.swing.text.html.HTML.Tag tag,
                           javax.swing.text.MutableAttributeSet mutableAttrSet,
                           int pos)

handleEndTag

public void handleEndTag(javax.swing.text.html.HTML.Tag tag,
                         int pos)

handleText

public void handleText(char[] data,
                       int pos)

getOutputDoc

public org.w3c.dom.Document getOutputDoc()

outputXML

public void outputXML(java.io.Writer writer)

outputXML

public void outputXML(java.io.Writer writer,
                      javax.xml.transform.Transformer transformer)

setBasePath

public void setBasePath(java.lang.String basePath)
Set the base file path for XML Scrapers. Used in the XMLDefs of Child Element processing.


parseSecure

public void parseSecure(java.lang.String urlString,
                        java.lang.String theParams)

parseSecure

public void parseSecure(java.lang.String urlString,
                        java.lang.String theParams,
                        java.lang.String requestMethod)

getXMLURLDriller

public XMLURLDriller getXMLURLDriller()

getCookies

public OrderedMap getCookies()

getLastHTMLPage

public java.lang.String getLastHTMLPage()

setProxyHost

public void setProxyHost(java.lang.String proxyHost,
                         java.lang.String proxyPort,
                         java.lang.String proxyUserName,
                         java.lang.String proxyPassword)

setDEBUG

public void setDEBUG(boolean debug)

main

public static void main(java.lang.String[] args)