Mypal/parser/html/java/htmlparser/gwt-src/nu/validator/htmlparser/gwt/HtmlParser.java

266 lines
9.3 KiB
Java

/*
* Copyright (c) 2007 Henri Sivonen
* Copyright (c) 2007-2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.htmlparser.gwt;
import java.util.LinkedList;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
import nu.validator.htmlparser.impl.Tokenizer;
import nu.validator.htmlparser.impl.UTF16Buffer;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import com.google.gwt.core.client.JavaScriptObject;
import com.google.gwt.user.client.Timer;
/**
* This class implements an HTML5 parser that exposes data through the DOM
* interface.
*
* <p>The constructor of this class sets every XML violation policy it
* configures to <code>ALTER_INFOSET</code>. Per the policy descriptions,
* this handles all input without fatal errors and without
* violating the DOM API contract. <em>This
* makes the parser non-conforming</em> but is probably the most useful
* setting for most applications. For comparison, <code>FATAL</code> treats
* XML 1.0-incompatible infosets as fatal errors, and <code>ALLOW</code>
* supports non-conforming HTML fully per the HTML 5 spec while on the other
* hand potentially violating the DOM API contract, which does not work with
* a standard DOM implementation.
*
* <p>The doctype is not represented in the tree.
*
* <p>The document mode is represented as user data <code>DocumentMode</code>
* object with the key <code>nu.validator.document-mode</code> on the document
* node.
*
* <p>The form pointer is also stored as user data with the key
* <code>nu.validator.form-pointer</code>.
*
* @version $Id: HtmlDocumentBuilder.java 255 2008-05-29 08:57:38Z hsivonen $
* @author hsivonen
*/
public class HtmlParser {

    /**
     * Number of UTF-16 code units by which the window over the source text
     * is opened initially and then widened each time it has been consumed.
     */
    private static final int CHUNK_SIZE = 512;

    private final Tokenizer tokenizer;

    private final BrowserTreeBuilder domTreeBuilder;

    /**
     * Pending text that {@link #pump()} pushes onto the buffer stack when it
     * is non-empty. Nothing in the visible code of this class appends to it;
     * it is only cleared.
     */
    private final StringBuilder documentWriteBuffer = new StringBuilder();

    private ErrorHandler errorHandler;

    /** Buffer over the source text; pushed first, so it is the bottom of the stack. */
    private UTF16Buffer stream;

    /** Length of the source text in UTF-16 code units. */
    private int streamLength;

    /**
     * Value returned by the most recent <code>tokenizeBuffer</code> call;
     * handed to <code>UTF16Buffer.adjust</code> before the next call.
     */
    private boolean lastWasCR;

    /** Set once EOF has been signalled; the next pump finishes the parse. */
    private boolean ending;

    private ParseEndListener parseEndListener;

    private final LinkedList<UTF16Buffer> bufferStack = new LinkedList<UTF16Buffer>();

    /**
     * Instantiates the parser and sets the name, comment, non-XML character,
     * content space and xmlns policies to <code>ALTER_INFOSET</code>.
     *
     * @param document
     *            the object handed to the <code>BrowserTreeBuilder</code>
     */
    public HtmlParser(JavaScriptObject document) {
        this.domTreeBuilder = new BrowserTreeBuilder(document);
        this.tokenizer = new ErrorReportingTokenizer(domTreeBuilder);
        this.domTreeBuilder.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setCommentPolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setContentNonXmlCharPolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
    }

    /**
     * Starts parsing a document held in a string.
     *
     * <p>The first step runs synchronously inside this call; every further
     * step runs from a timer scheduled by the previous one.
     *
     * @param source the document text
     * @param callback notified through <code>parseComplete()</code> after the
     *            tokenizer has been ended; may be <code>null</code>, in which
     *            case nobody is notified
     * @throws SAXException if the synchronous first step fails
     */
    public void parse(String source, ParseEndListener callback) throws SAXException {
        parseEndListener = callback;
        domTreeBuilder.setFragmentContext(null);
        tokenize(source, null);
    }

    /**
     * Resets the per-parse state, opens a window of at most
     * {@link #CHUNK_SIZE} code units over the source, starts the tokenizer
     * and runs the first pump step.
     *
     * @param source the text to tokenize
     * @param context the fragment context name, or <code>null</code> for none
     * @throws SAXException if the first pump step fails
     */
    private void tokenize(String source, String context) throws SAXException {
        lastWasCR = false;
        ending = false;
        documentWriteBuffer.setLength(0);
        streamLength = source.length();
        int initialEnd = streamLength < CHUNK_SIZE ? streamLength : CHUNK_SIZE;
        stream = new UTF16Buffer(source.toCharArray(), 0, initialEnd);
        bufferStack.clear();
        push(stream);
        domTreeBuilder.setFragmentContext(context == null ? null : context.intern());
        tokenizer.start();
        pump();
    }

    /**
     * Performs one parsing step and, unless the parse has just been
     * finished, schedules the next step on a timer.
     *
     * @throws SAXException if the tokenizer or tree builder reports one
     */
    private void pump() throws SAXException {
        if (ending) {
            finishParse();
            // Don't schedule timeout
            return;
        }
        flushDocumentWriteBuffer();
        for (;;) {
            UTF16Buffer buffer = peek();
            if (!buffer.hasMore()) {
                if (buffer != stream) {
                    // An exhausted buffer stacked above the source is discarded.
                    pop();
                    continue;
                }
                if (buffer.getEnd() == streamLength) {
                    // Stop parsing; the actual teardown happens on the next pump.
                    tokenizer.eof();
                    ending = true;
                    break;
                }
                // Widen the window over the source by up to one more chunk.
                int newEnd = buffer.getStart() + CHUNK_SIZE;
                buffer.setEnd(newEnd < streamLength ? newEnd : streamLength);
                continue;
            }
            // now we have a non-empty buffer
            buffer.adjust(lastWasCR);
            lastWasCR = false;
            // adjust() may leave the buffer empty, so it is checked again.
            if (buffer.hasMore()) {
                lastWasCR = tokenizer.tokenizeBuffer(buffer);
                domTreeBuilder.maybeRunScript();
                break;
            }
        }
        scheduleNextPump();
    }

    /**
     * Ends the tokenizer and notifies the end listener, if one was supplied.
     *
     * @throws SAXException if ending the tokenizer fails
     */
    private void finishParse() throws SAXException {
        tokenizer.end();
        domTreeBuilder.getDocument(); // drops the internal reference
        // A null listener is tolerated so that a caller not interested in
        // completion does not trigger a NullPointerException here.
        if (parseEndListener != null) {
            parseEndListener.parseComplete();
        }
    }

    /** Moves any text pending in the document.write buffer onto the buffer stack. */
    private void flushDocumentWriteBuffer() {
        int docWriteLen = documentWriteBuffer.length();
        if (docWriteLen > 0) {
            char[] newBuf = new char[docWriteLen];
            documentWriteBuffer.getChars(0, docWriteLen, newBuf, 0);
            push(new UTF16Buffer(newBuf, 0, docWriteLen));
            documentWriteBuffer.setLength(0);
        }
    }

    /** Schedules the next {@link #pump()} call on a timer with a delay of 1. */
    private void scheduleNextPump() {
        Timer timer = new Timer() {
            @Override public void run() {
                try {
                    pump();
                } catch (SAXException e) {
                    reportAsyncFailure(e);
                }
            }
        };
        timer.schedule(1);
    }

    /**
     * Handles a failure of a timer-driven pump step: marks the parse as
     * ending and forwards the exception to the error handler, if any.
     *
     * <p>NOTE(review): no further pump is scheduled after a failure, so the
     * end listener is never notified and the tokenizer is never ended on
     * this path; with no error handler installed the exception is dropped.
     * Confirm that this is intended.
     *
     * @param e the exception thrown by the failed step
     */
    private void reportAsyncFailure(SAXException e) {
        ending = true;
        if (errorHandler != null) {
            try {
                errorHandler.fatalError(new SAXParseException(
                        e.getMessage(), null, null, -1, -1, e));
            } catch (SAXException ignored) {
                // Best effort: this runs inside a timer callback, so there is
                // no caller to propagate the handler's own exception to.
            }
        }
    }

    private void push(UTF16Buffer buffer) {
        bufferStack.addLast(buffer);
    }

    private UTF16Buffer peek() {
        return bufferStack.getLast();
    }

    private void pop() {
        bufferStack.removeLast();
    }

    /**
     * Tokenizes the given text immediately, looping until the buffer created
     * over it has been consumed.
     *
     * @param text the text to tokenize
     * @throws SAXException if the tokenizer or tree builder reports one
     */
    public void documentWrite(String text) throws SAXException {
        UTF16Buffer buffer = new UTF16Buffer(text.toCharArray(), 0, text.length());
        while (buffer.hasMore()) {
            buffer.adjust(lastWasCR);
            lastWasCR = false;
            if (buffer.hasMore()) {
                lastWasCR = tokenizer.tokenizeBuffer(buffer);
                domTreeBuilder.maybeRunScript();
            }
        }
    }

    /**
     * Installs the error handler on this parser, its tree builder and its
     * tokenizer.
     *
     * @param errorHandler the handler; failures of timer-driven steps are
     *            reported to it through <code>fatalError</code>
     * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(ErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
        domTreeBuilder.setErrorHandler(errorHandler);
        tokenizer.setErrorHandler(errorHandler);
    }

    /**
     * Sets whether comment nodes appear in the tree.
     * @param ignoreComments <code>true</code> to ignore comments
     * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
     */
    public void setIgnoringComments(boolean ignoreComments) {
        domTreeBuilder.setIgnoringComments(ignoreComments);
    }

    /**
     * Sets whether the parser considers scripting to be enabled for noscript treatment.
     * @param scriptingEnabled <code>true</code> to enable
     * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
     */
    public void setScriptingEnabled(boolean scriptingEnabled) {
        domTreeBuilder.setScriptingEnabled(scriptingEnabled);
    }
}