fr.inria.edelweiss.extractor.webpage
Class WebPageExtractor

java.lang.Object
  extended by fr.inria.edelweiss.extractor.webpage.WebPageExtractor

public class WebPageExtractor
extends java.lang.Object

Class providing methods to extract and access metadata (HTTP header and HTML meta and link tags) and text content of web pages.

Version:
1.1
Author:
Fabien Gandon

Constructor Summary
WebPageExtractor()
           
WebPageExtractor(java.lang.String p_URL)
           
WebPageExtractor(java.net.URL p_URL)
           
 
Method Summary
 void extract()
           
 java.lang.String fullText()
           
 java.util.Date getConnectionDate()
           
 java.util.LinkedList<ContentBlock> getContent()
           
 java.lang.String getContentType()
           
 java.lang.String getDefaultContentEncoding()
           
 java.lang.String getHost()
           
 java.util.Map getHttpHeaderFields()
           
 java.lang.String getIPAddress()
           
 java.util.Date getLastModified()
           
 java.util.LinkedList<Link> getLinks()
           
 java.util.LinkedList<Meta> getMeta()
           
 org.w3c.dom.Document getRDFXML()
           
 java.lang.String getSerializedRDFXML()
          Returns the RDF/XML representation converted to a string (from http://www.raben.com/articles/XsltEditor/part_1.html).
 java.lang.String getTitle()
           
 java.lang.String getTopDomain()
           
 java.net.URL getURL()
           
static void main(java.lang.String[] args)
          For testing purposes; parameters are not used.
 void setDefaultContentEncoding(java.lang.String p_default_content_encoding)
           
 void setURL(java.lang.String p_URL)
           
 void setURL(java.net.URL p_URL)
           
 java.lang.String toString()
           
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Constructor Detail

WebPageExtractor

public WebPageExtractor()

WebPageExtractor

public WebPageExtractor(java.lang.String p_URL)
                 throws java.net.MalformedURLException
Throws:
java.net.MalformedURLException

WebPageExtractor

public WebPageExtractor(java.net.URL p_URL)
Parameters:
p_URL - URL of the web page to be parsed.
Method Detail

getConnectionDate

public final java.util.Date getConnectionDate()
Returns:
connection date of the extraction
See Also:
getLastModified()

getContentType

public final java.lang.String getContentType()
Returns:
content type from HTTP header e.g. "text/html; charset=UTF-8"

getIPAddress

public final java.lang.String getIPAddress()
Returns:
IP address of the server serving the page

getLastModified

public final java.util.Date getLastModified()
Returns:
date of the last modification of this page
See Also:
getConnectionDate()

getTitle

public final java.lang.String getTitle()
Returns:
value of the TITLE html element.

setURL

public void setURL(java.lang.String p_URL)
            throws java.net.MalformedURLException
Parameters:
p_URL - string representation of the URL of the page to be parsed
Throws:
java.net.MalformedURLException

setURL

public void setURL(java.net.URL p_URL)
Parameters:
p_URL - URL of the page to be parsed

getDefaultContentEncoding

public final java.lang.String getDefaultContentEncoding()
Returns:
the encoding used by default if none is found at connection time

setDefaultContentEncoding

public final void setDefaultContentEncoding(java.lang.String p_default_content_encoding)
Parameters:
p_default_content_encoding - the encoding used by default if none is found at connection time

getContent

public final java.util.LinkedList<ContentBlock> getContent()
Returns:
the LinkedList of the content blocks of the document
See Also:
Anchor, Embedded, Header, Image, Link, Paragraph, ContentBlock

getLinks

public final java.util.LinkedList<Link> getLinks()
Returns:
the LinkedList of the <link> elements of the document
See Also:
Link

getMeta

public final java.util.LinkedList<Meta> getMeta()
Returns:
the LinkedList of the <meta> elements of the document
See Also:
Meta

extract

public void extract()
             throws java.io.IOException,
                    java.net.MalformedURLException
Throws:
java.io.IOException
java.net.MalformedURLException

getHost

public final java.lang.String getHost()
Returns:
the host name of the server serving the page

getHttpHeaderFields

public final java.util.Map getHttpHeaderFields()
Returns:
map of all the HTTP header fields (see Header Field Definitions)

getTopDomain

public final java.lang.String getTopDomain()
Returns:
top-level domain of the host name, e.g. "www.inria.fr" gives "fr"

getURL

public final java.net.URL getURL()
Returns:
URL of the page being parsed

toString

public java.lang.String toString()
Overrides:
toString in class java.lang.Object

fullText

public java.lang.String fullText()
Returns:
the full raw text of the document

getRDFXML

public org.w3c.dom.Document getRDFXML()
                               throws javax.xml.parsers.ParserConfigurationException
Returns:
the DOM of the XML/RDF representation of the result of the extraction.
Throws:
javax.xml.parsers.ParserConfigurationException

getSerializedRDFXML

public java.lang.String getSerializedRDFXML()
                                     throws javax.xml.transform.TransformerException,
                                            javax.xml.parsers.ParserConfigurationException
Returns the RDF/XML representation converted to a string (from http://www.raben.com/articles/XsltEditor/part_1.html).

Returns:
the RDF/XML representation converted to a string
Throws:
javax.xml.parsers.ParserConfigurationException
javax.xml.transform.TransformerException

main

public static void main(java.lang.String[] args)
For testing purposes; parameters are not used.

Parameters:
args - command-line arguments (not used)