View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18  import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19  
20  import java.io.IOException;
21  import java.io.StringReader;
22  import java.net.URL;
23  import java.nio.charset.Charset;
24  import java.util.ArrayDeque;
25  import java.util.Deque;
26  
27  import org.htmlunit.BrowserVersion;
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.WebClient;
30  import org.htmlunit.WebResponse;
31  import org.htmlunit.cyberneko.HTMLConfiguration;
32  import org.htmlunit.cyberneko.HTMLElements;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35  import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36  import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39  import org.htmlunit.cyberneko.xerces.xni.XMLString;
40  import org.htmlunit.cyberneko.xerces.xni.XNIException;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43  import org.htmlunit.html.DomCDataSection;
44  import org.htmlunit.html.DomComment;
45  import org.htmlunit.html.DomDocumentType;
46  import org.htmlunit.html.DomElement;
47  import org.htmlunit.html.DomNode;
48  import org.htmlunit.html.DomText;
49  import org.htmlunit.html.ElementFactory;
50  import org.htmlunit.html.Html;
51  import org.htmlunit.html.HtmlBody;
52  import org.htmlunit.html.HtmlElement;
53  import org.htmlunit.html.HtmlForm;
54  import org.htmlunit.html.HtmlHiddenInput;
55  import org.htmlunit.html.HtmlImage;
56  import org.htmlunit.html.HtmlPage;
57  import org.htmlunit.html.HtmlSvg;
58  import org.htmlunit.html.HtmlTable;
59  import org.htmlunit.html.HtmlTableRow;
60  import org.htmlunit.html.HtmlTemplate;
61  import org.htmlunit.html.ScriptElement;
62  import org.htmlunit.html.SubmittableElement;
63  import org.htmlunit.html.XHtmlPage;
64  import org.htmlunit.html.parser.HTMLParser;
65  import org.htmlunit.html.parser.HTMLParserDOMBuilder;
66  import org.htmlunit.html.parser.HTMLParserListener;
67  import org.htmlunit.javascript.host.html.HTMLBodyElement;
68  import org.htmlunit.util.StringUtils;
69  import org.w3c.dom.Node;
70  import org.xml.sax.Attributes;
71  import org.xml.sax.ContentHandler;
72  import org.xml.sax.Locator;
73  import org.xml.sax.SAXException;
74  import org.xml.sax.ext.LexicalHandler;
75  
76  /**
77   * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
78   *
79   * The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
80   * the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
 * consume SAX events to build the page DOM.
82   *
83   * @author Christian Sell
84   * @author David K. Taylor
85   * @author Chris Erskine
86   * @author Ahmed Ashour
87   * @author Marc Guillemot
88   * @author Ethan Glasser-Camp
89   * @author Sudhan Moghe
90   * @author Ronald Brill
91   * @author Frank Danek
92   * @author Carsten Steul
93   * @author Ronny Shapiro
94   * @author Atsushi Nakagawa
95   */
96  final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
97          implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
98  
99      // cache Neko Elements for performance and memory efficiency
100     private static final HTMLElements HTMLELEMENTS;
101     private static final HTMLElements HTMLELEMENTS_WITH_CMD;
102 
103     static {
104         // continue short code enumeration
105         final short commandShortCode = HTMLElements.UNKNOWN + 1;
106 
107         final HTMLElements.Element command = new HTMLElements.Element(commandShortCode, "COMMAND",
108                 HTMLElements.Element.EMPTY, new short[] {HTMLElements.BODY, HTMLElements.HEAD}, null);
109 
110         HTMLELEMENTS = new HTMLElements();
111 
112         final HTMLElements value = new HTMLElements();
113         value.setElement(command);
114         HTMLELEMENTS_WITH_CMD = value;
115     }
116 
117     private enum HeadParsed { YES, SYNTHESIZED, NO }
118 
119     private final HTMLParser htmlParser_;
120     private final HtmlPage page_;
121 
122     private Locator locator_;
123     private final Deque<DomNode> stack_ = new ArrayDeque<>();
124 
125     /** Did the snippet tried to overwrite the start node? */
126     private boolean snippetStartNodeOverwritten_;
127     private final int initialSize_;
128     private DomNode currentNode_;
129     private final boolean createdByJavascript_;
130     private final XMLString characters_ = new XMLString();
131     private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
132     private HtmlElement body_;
133     private boolean lastTagWasSynthesized_;
134     private HtmlForm consumingForm_;
135     private boolean formEndingIsAdjusting_;
136     private boolean insideSvg_;
137     private boolean insideTemplate_;
138 
139     private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
140     private static final String FEATURE_PARSE_NOSCRIPT
141         = "http://cyberneko.org/html/features/parse-noscript-content";
142 
143     /**
144      * Parses and then inserts the specified HTML content into the HTML content currently being parsed.
145      * @param html the HTML content to push
146      */
147     @Override
148     public void pushInputString(final String html) {
149         page_.registerParsingStart();
150         page_.registerInlineSnippetParsingStart();
151         try {
152             final WebResponse webResponse = page_.getWebResponse();
153             final Charset charset = webResponse.getContentCharset();
154             final String url = webResponse.getWebRequest().getUrl().toString();
155             final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
156             ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
157         }
158         finally {
159             page_.registerParsingEnd();
160             page_.registerInlineSnippetParsingEnd();
161         }
162     }
163 
164     /**
165      * Creates a new builder for parsing the specified response contents.
166      * @param node the location at which to insert the new content
167      * @param url the page's URL
168      * @param createdByJavascript if true the (script) tag was created by javascript
169      */
170     HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
171             final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
172         super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));
173 
174         htmlParser_ = htmlParser;
175         page_ = (HtmlPage) node.getPage();
176 
177         currentNode_ = node;
178         for (final Node ancestor : currentNode_.getAncestors()) {
179             stack_.push((DomNode) ancestor);
180         }
181         createdByJavascript_ = createdByJavascript;
182 
183         final WebClient webClient = page_.getWebClient();
184         final HTMLParserListener listener = webClient.getHTMLParserListener();
185         final boolean reportErrors = listener != null;
186         if (reportErrors) {
187             parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(listener, url, htmlContent));
188         }
189 
190         try {
191             setFeature(FEATURE_AUGMENTATIONS, true);
192             setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
193             setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.isJavaScriptEnabled());
194             setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);
195 
196             setContentHandler(this);
197             setLexicalHandler(this); //comments and CDATA
198         }
199         catch (final SAXException e) {
200             throw new ObjectInstantiationException("unable to create HTML parser", e);
201         }
202         initialSize_ = stack_.size();
203     }
204 
205     /**
206      * Create the configuration depending on the simulated browser
207      * @return the configuration
208      */
209     private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
210         if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
211             return new HTMLConfiguration(HTMLELEMENTS_WITH_CMD);
212         }
213         return new HTMLConfiguration(HTMLELEMENTS);
214     }
215 
216     /**
217      * {@inheritDoc}
218      */
219     @Override
220     public void setDocumentLocator(final Locator locator) {
221         locator_ = locator;
222     }
223 
224     /**
225      * {@inheritDoc}
226      */
227     @Override
228     public void startDocument() throws SAXException {
229         // nothing to do
230     }
231 
232     /** {@inheritDoc} */
233     @Override
234     public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
235         throws XNIException {
236         // augs might change so we store only the interesting part
237         lastTagWasSynthesized_ = augs.isSynthesized();
238         super.startElement(element, attributes, augs);
239     }
240 
241     /**
242      * {@inheritDoc}
243      */
244     @Override
245     public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
246         throws SAXException {
247 
248         if (snippetStartNodeOverwritten_) {
249             snippetStartNodeOverwritten_ = false;
250             return;
251         }
252         handleCharacters();
253 
254         final String tagLower = StringUtils.toRootLowerCase(localName);
255         if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
256             // we have to push the current node on the stack to make sure
257             // the endElement call is able to remove a node from the stack
258             stack_.push(currentNode_);
259             return;
260         }
261 
262         if ("head".equals(tagLower)) {
263             if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
264                 // we have to push the current node on the stack to make sure
265                 // the endElement call is able to remove a node from the stack
266                 stack_.push(currentNode_);
267                 return;
268             }
269 
270             headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
271         }
272 
273         // If we're adding a body element, keep track of any temporary synthetic ones
274         // that we may have had to create earlier (for document.write(), for example).
275         HtmlBody oldBody = null;
276         final boolean isBodyTag = "body".equals(tagLower);
277         if (isBodyTag) {
278             final HtmlBody body = page_.getBody();
279             if (body != null) {
280                 oldBody = body;
281             }
282         }
283 
284         if (namespaceURI != null) {
285             namespaceURI = namespaceURI.trim();
286         }
287         // Add the new node.
288         if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
289             namespaceURI = null;
290         }
291 
292         final ElementFactory factory =
293                 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
294         if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
295             namespaceURI = Html.SVG_NAMESPACE;
296         }
297 
298         final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
299         newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
300 
301         // parse can't replace everything as it does not buffer elements while parsing
302         addNodeToRightParent(currentNode_, newElement);
303 
304         if (newElement instanceof HtmlSvg) {
305             insideSvg_ = true;
306         }
307         else if (newElement instanceof HtmlTemplate) {
308             insideTemplate_ = true;
309         }
310 
311         // Forms own elements simply by enclosing source-wise rather than DOM parent-child relationship
312         // Forms without a </form> will keep consuming forever
313         else if (newElement instanceof HtmlForm) {
314             consumingForm_ = (HtmlForm) newElement;
315             formEndingIsAdjusting_ = false;
316         }
317         else if (consumingForm_ != null) {
318             // If the current form enclosed a suitable element
319             if (newElement instanceof SubmittableElement) {
320                 // Let these be owned by the form
321                 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
322                     ((HtmlElement) newElement).setOwningForm(consumingForm_);
323                 }
324             }
325         }
326 
327         // If we had an old synthetic body and we just added a real body element, quietly
328         // remove the old body and move its children to the real body element we just added.
329         if (oldBody != null) {
330             oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
331         }
332 
333         if (!insideSvg_ && isBodyTag) {
334             body_ = (HtmlElement) newElement;
335         }
336         else if (createdByJavascript_
337                 && newElement instanceof ScriptElement
338                 && (!insideTemplate_
339                         || !page_.getWebClient().getBrowserVersion()
340                                 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
341             final ScriptElement script = (ScriptElement) newElement;
342             script.markAsCreatedByDomParser();
343         }
344 
345         currentNode_ = newElement;
346         stack_.push(currentNode_);
347     }
348 
349     /**
350      * Adds the new node to the right parent that is not necessary the currentNode in case of
351      * malformed HTML code. The method tries to emulate the behavior of Firefox.
352      */
353     private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
354         final String currentNodeName = currentNode.getNodeName();
355         final String newNodeName = newElement.getNodeName();
356 
357         // First ensure table elements are housed correctly
358         if (isTableChild(newNodeName)) {
359             final DomNode parent =
360                     "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
361             appendChild(parent, newElement);
362             return;
363         }
364         if ("tr".equals(newNodeName)) {
365             final DomNode parent =
366                     isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
367             appendChild(parent, newElement);
368             return;
369         }
370         if (isTableCell(newNodeName)) {
371             final DomNode parent =
372                     "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
373             appendChild(parent, newElement);
374             return;
375         }
376 
377         // Next ensure non-table elements don't appear in tables
378         if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
379             if ("template".equals(newNodeName)) {
380                 currentNode.appendChild(newElement);
381             }
382 
383             // Scripts, forms, and styles are exempt
384             else if (!"colgroup".equals(currentNodeName)
385                     && ("script".equals(newNodeName)
386                         || "form".equals(newNodeName)
387                         || "style".equals(newNodeName))) {
388                 currentNode.appendChild(newElement);
389             }
390 
391             // These are good
392             else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
393                 currentNode.appendChild(newElement);
394             }
395             else if ("caption".equals(currentNodeName)) {
396                 currentNode.appendChild(newElement);
397             }
398             else if (newElement instanceof HtmlHiddenInput) {
399                 currentNode.appendChild(newElement);
400             }
401             else {
402                 // Move before the table
403                 final DomNode parent = findElementOnStack("table");
404                 parent.insertBefore(newElement);
405             }
406             return;
407         }
408 
409         if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
410             // We cater to HTMLTagBalancer's shortcomings by moving this node out of the <form>
411             appendChild(currentNode.getParentNode(), newElement);
412             return;
413         }
414 
415         // Everything else
416         appendChild(currentNode, newElement);
417     }
418 
419     private DomNode findElementOnStack(final String searchedElementName) {
420         for (final DomNode node : stack_) {
421             if (searchedElementName.equals(node.getNodeName())) {
422                 return node;
423             }
424         }
425 
426         // this is surely wrong but at least it won't throw a NPE
427         return stack_.peek();
428     }
429 
430     private DomNode findElementOnStack(final String... searchedElementNames) {
431         for (final DomNode node : stack_) {
432             for (final String searchedElementName : searchedElementNames) {
433                 if (searchedElementName.equals(node.getNodeName())) {
434                     return node;
435                 }
436             }
437         }
438 
439         // this is surely wrong but at least it won't throw a NPE
440         return stack_.peek();
441     }
442 
443     private static boolean isTableChild(final String nodeName) {
444         if (nodeName == null || nodeName.length() < 5) {
445             return false;
446         }
447 
448         return "thead".equals(nodeName)
449                 || "tbody".equals(nodeName)
450                 || "tfoot".equals(nodeName)
451                 || "caption".equals(nodeName)
452                 || "colgroup".equals(nodeName);
453     }
454 
455     private static boolean isTableCell(final String nodeName) {
456         if (nodeName == null || nodeName.length() != 2) {
457             return false;
458         }
459         return "td".equals(nodeName) || "th".equals(nodeName);
460     }
461 
462     /** {@inheritDoc} */
463     @Override
464     public void endElement(final QName element, final Augmentations augs)
465         throws XNIException {
466         // augs might change so we store only the interesting part
467         lastTagWasSynthesized_ = augs.isSynthesized();
468         super.endElement(element, augs);
469     }
470 
471     /**
472      * {@inheritDoc}
473      */
474     @Override
475     public void endElement(final String namespaceURI, final String localName, final String qName)
476         throws SAXException {
477 
478         final String tagLower = StringUtils.toRootLowerCase(localName);
479 
480         handleCharacters();
481 
482         if (page_.isParsingHtmlSnippet()) {
483             if ("html".equals(tagLower) || "body".equals(tagLower)) {
484                 return;
485             }
486             if (stack_.size() == initialSize_) {
487                 // a <p> inside a <p> is valid for innerHTML processing
488                 // see HTMLParser2Test for more cases
489                 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
490                 return;
491             }
492         }
493 
494         if ("svg".equals(tagLower)) {
495             insideSvg_ = false;
496         }
497         else if ("template".equals(tagLower)) {
498             insideTemplate_ = false;
499         }
500 
501         // this only avoids a problem when the stack is empty here
502         // but for this case we made the problem before - the balancing
503         // is broken already
504         if (stack_.isEmpty()) {
505             return;
506         }
507 
508         final DomNode previousNode = stack_.pop(); //remove currentElement from stack
509         previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
510 
511         if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
512             // We get here if the </form> was on the same DOM tree depth as the <form> that started it,
513             // otherwise HTMLTagBalancer gives us the end through ignoredEndElement()
514             consumingForm_ = null;
515         }
516 
517         if (!stack_.isEmpty()) {
518             currentNode_ = stack_.peek();
519         }
520 
521         final boolean postponed = page_.isParsingInlineHtmlSnippet();
522         previousNode.onAllChildrenAddedToPage(postponed);
523     }
524 
525     /** {@inheritDoc} */
526     @Override
527     public void characters(final char[] ch, final int start, final int length) throws SAXException {
528         characters_.append(ch, start, length);
529     }
530 
531     /** {@inheritDoc} */
532     @Override
533     public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
534         characters_.append(ch, start, length);
535     }
536 
537     /**
538      * Picks up the character data accumulated so far and add it to the current element as a text node.
539      */
540     private void handleCharacters() {
541         // make the code easier to read because we remove a nesting level
542         if (characters_.length() == 0) {
543             return;
544         }
545 
546         // Use the normal behavior: append a text node for the accumulated text.
547         final String textValue = characters_.toString();
548         characters_.clear();
549 
550         if (StringUtils.isBlank(textValue)) {
551             appendChild(currentNode_, new DomText(page_, textValue));
552             return;
553         }
554 
555         // malformed HTML: </td>some text</tr> => text comes before the table
556         if (currentNode_ instanceof HtmlTableRow) {
557             final HtmlTableRow row = (HtmlTableRow) currentNode_;
558             final HtmlTable enclosingTable = row.getEnclosingTable();
559             if (enclosingTable != null) { // may be null when called from Range.createContextualFragment
560                 if (enclosingTable.getPreviousSibling() instanceof DomText) {
561                     final DomText domText = (DomText) enclosingTable.getPreviousSibling();
562                     domText.setTextContent(domText.getWholeText() + textValue);
563                 }
564                 else {
565                     enclosingTable.insertBefore(new DomText(page_, textValue));
566                 }
567             }
568         }
569         else if (currentNode_ instanceof HtmlTable) {
570             final HtmlTable enclosingTable = (HtmlTable) currentNode_;
571             if (enclosingTable.getPreviousSibling() instanceof DomText) {
572                 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
573                 domText.setTextContent(domText.getWholeText() + textValue);
574             }
575             else {
576                 enclosingTable.insertBefore(new DomText(page_, textValue));
577             }
578         }
579         else if (currentNode_ instanceof HtmlImage) {
580             currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
581         }
582         else {
583             appendChild(currentNode_, new DomText(page_, textValue));
584         }
585     }
586 
587     /** {@inheritDoc} */
588     @Override
589     public void endDocument() throws SAXException {
590         handleCharacters();
591         if (locator_ != null) {
592             page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
593         }
594     }
595 
596     /** {@inheritDoc} */
597     @Override
598     public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
599         // nothing to do
600     }
601 
602     /** {@inheritDoc} */
603     @Override
604     public void endPrefixMapping(final String prefix) throws SAXException {
605         // nothing to do
606     }
607 
608     /** {@inheritDoc} */
609     @Override
610     public void processingInstruction(final String target, final String data) throws SAXException {
611         // nothing to do
612     }
613 
614     /** {@inheritDoc} */
615     @Override
616     public void skippedEntity(final String name) throws SAXException {
617         // nothing to do
618     }
619 
620     // LexicalHandler methods
621 
622     /** {@inheritDoc} */
623     @Override
624     public void comment(final char[] ch, final int start, final int length) {
625         handleCharacters();
626         final String data = new String(ch, start, length);
627         final DomComment comment = new DomComment(page_, data);
628         appendChild(currentNode_, comment);
629     }
630 
631     /** {@inheritDoc} */
632     @Override
633     public void endCDATA() {
634         final String data = characters_.toString();
635         characters_.clear();
636 
637         final DomCDataSection cdataSection = new DomCDataSection(page_, data);
638         appendChild(currentNode_, cdataSection);
639     }
640 
641     /** {@inheritDoc} */
642     @Override
643     public void endDTD() {
644         // nothing to do
645     }
646 
647     /** {@inheritDoc} */
648     @Override
649     public void endEntity(final String name) {
650         // nothing to do
651     }
652 
653     /** {@inheritDoc} */
654     @Override
655     public void startCDATA() {
656         handleCharacters();
657     }
658 
659     /** {@inheritDoc} */
660     @Override
661     public void startDTD(final String name, final String publicId, final String systemId) {
662         final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
663         page_.setDocumentType(type);
664 
665         final Node child;
666         child = type;
667         page_.appendChild(child);
668     }
669 
670     /** {@inheritDoc} */
671     @Override
672     public void startEntity(final String name) {
673         // nothing to do
674     }
675 
676     /**
677      * {@inheritDoc}
678      */
679     @Override
680     public void ignoredEndElement(final QName element, final Augmentations augs) {
681         // HTMLTagBalancer brings us here if </form> was found in the source on a different
682         // DOM tree depth (either above or below) to the <form> that started it
683         if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
684             consumingForm_ = null;
685 
686             if (findElementOnStack("table", "form") instanceof HtmlTable) {
687                 // The </form> just goes missing for these (really? just tables?)
688             }
689             else {
690                 /*
691                  * This </form> was ignored by HTMLTagBalancer as it generates its own
692                  * </form> at the end of the depth with the starting <form>.
693                  * e.g. This:
694                  * | <form>
695                  * |   <div>
696                  * |     </form> <!--ignored by HTMLTagBalancer-->
697                  * |   </div>
698                  * |   <input>
699                  *
700                  * is turned into:
701                  * | <form>
702                  * |   <div>
703                  * |   </div>
704                  * |   <input>
705                  * | </form> <!--synthesized by HTMLTagBalancer-->
706                  *
707                  * but this isn't suitable for us because </form> shouldn't be ignored but
708                  * rather moved directly behind the tree it's in to instead become:
709                  * | <form>
710                  * |   <div>
711                  * |   </div>
712                  * | </form> <!--moved out of div-->
713                  * | <input> <!--proceeding children are not part of form-->
714                  */
715                 // We cater for this by moving out nodes such as the <input> in the above
716                 // diagram out of the form
717                 formEndingIsAdjusting_ = true;
718             }
719         }
720     }
721 
722     /**
723      * {@inheritDoc}
724      */
725     @Override
726     public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
727         // when multiple html/body elements are encountered, the attributes of the discarded
728         // elements are used when not previously defined
729         if (attrs != null && body_ != null) {
730             final String lp = elem.getLocalpart();
731             if (lp != null && lp.length() == 4) {
732                 if ("body".equalsIgnoreCase(lp)) {
733                     copyAttributes(body_, attrs);
734                 }
735                 else if ("html".equalsIgnoreCase(lp)) {
736                     final DomNode parent = body_.getParentNode();
737                     if (parent instanceof DomElement) {
738                         copyAttributes((DomElement) parent, attrs);
739                     }
740                 }
741             }
742         }
743     }
744 
745     private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
746         final int length = attrs.getLength();
747 
748         for (int i = 0; i < length; i++) {
749             final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
750             if (to.getAttributes().getNamedItem(attrName) == null) {
751                 to.setAttribute(attrName, attrs.getValue(i));
752                 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
753                         && to.getScriptableObject() instanceof HTMLBodyElement) {
754                     final HTMLBodyElement jsBody = to.getScriptableObject();
755                     jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
756                 }
757             }
758         }
759     }
760 
761     /**
762      * {@inheritDoc}
763      */
764     @Override
765     public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
766         final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
767         page_.setDOMBuilder(this);
768         try {
769             super.parse(inputSource);
770         }
771         finally {
772             page_.setDOMBuilder(oldBuilder);
773         }
774     }
775 
776     private static void appendChild(final DomNode parent, final DomNode child) {
777         if (parent instanceof HtmlTemplate) {
778             ((HtmlTemplate) parent).getContent().appendChild(child);
779             return;
780         }
781 
782         parent.appendChild(child);
783     }
784 }