1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  package org.htmlunit.html.parser.neko;
16  
17  import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18  import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19  
20  import java.io.IOException;
21  import java.io.StringReader;
22  import java.net.URL;
23  import java.nio.charset.Charset;
24  import java.util.ArrayDeque;
25  import java.util.Deque;
26  
27  import org.htmlunit.BrowserVersion;
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.WebClient;
30  import org.htmlunit.WebResponse;
31  import org.htmlunit.cyberneko.HTMLConfiguration;
32  import org.htmlunit.cyberneko.HTMLElements;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35  import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36  import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39  import org.htmlunit.cyberneko.xerces.xni.XMLString;
40  import org.htmlunit.cyberneko.xerces.xni.XNIException;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43  import org.htmlunit.html.DomCDataSection;
44  import org.htmlunit.html.DomComment;
45  import org.htmlunit.html.DomDocumentType;
46  import org.htmlunit.html.DomElement;
47  import org.htmlunit.html.DomNode;
48  import org.htmlunit.html.DomText;
49  import org.htmlunit.html.ElementFactory;
50  import org.htmlunit.html.Html;
51  import org.htmlunit.html.HtmlBody;
52  import org.htmlunit.html.HtmlElement;
53  import org.htmlunit.html.HtmlForm;
54  import org.htmlunit.html.HtmlHiddenInput;
55  import org.htmlunit.html.HtmlImage;
56  import org.htmlunit.html.HtmlPage;
57  import org.htmlunit.html.HtmlSvg;
58  import org.htmlunit.html.HtmlTable;
59  import org.htmlunit.html.HtmlTableRow;
60  import org.htmlunit.html.HtmlTemplate;
61  import org.htmlunit.html.ScriptElement;
62  import org.htmlunit.html.SubmittableElement;
63  import org.htmlunit.html.XHtmlPage;
64  import org.htmlunit.html.parser.HTMLParser;
65  import org.htmlunit.html.parser.HTMLParserDOMBuilder;
66  import org.htmlunit.html.parser.HTMLParserListener;
67  import org.htmlunit.javascript.host.html.HTMLBodyElement;
68  import org.htmlunit.util.StringUtils;
69  import org.w3c.dom.Node;
70  import org.xml.sax.Attributes;
71  import org.xml.sax.ContentHandler;
72  import org.xml.sax.Locator;
73  import org.xml.sax.SAXException;
74  import org.xml.sax.ext.LexicalHandler;
75  
76  
77  
78  
79  
80  
81  
82  
83  
84  
85  
86  
87  
88  
89  
90  
91  
92  
93  
94  
95  
96  final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
97          implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
98  
99      
100     private static final HTMLElements HTMLELEMENTS;
101     private static final HTMLElements HTMLELEMENTS_WITH_CMD;
102 
103     static {
104         
105         final short commandShortCode = HTMLElements.UNKNOWN + 1;
106 
107         final HTMLElements.Element command = new HTMLElements.Element(commandShortCode, "COMMAND",
108                 HTMLElements.Element.EMPTY, new short[] {HTMLElements.BODY, HTMLElements.HEAD}, null);
109 
110         HTMLELEMENTS = new HTMLElements();
111 
112         final HTMLElements value = new HTMLElements();
113         value.setElement(command);
114         HTMLELEMENTS_WITH_CMD = value;
115     }
116 
    /**
     * Records how far head handling has progressed: {@code NO} is the initial state,
     * {@code SYNTHESIZED} is set when the head start tag was flagged as synthesized by the parser
     * and {@code YES} when it was not.
     */
    private enum HeadParsed { YES, SYNTHESIZED, NO }
118 
    /** Parser used to look up the element factory for every start tag. */
    private final HTMLParser htmlParser_;
    /** Page that owns all nodes created by this builder. */
    private final HtmlPage page_;

    /** Locator supplied by the parser; source of the line/column values stored on nodes. */
    private Locator locator_;
    /** Stack of open nodes; pre-filled in the constructor from the ancestors of the start node. */
    private final Deque<DomNode> stack_ = new ArrayDeque<>();

    /**
     * Set by {@code endElement} while parsing a snippet when an end tag (other than {@code p})
     * arrives with the stack at its initial size; makes the next start tag being skipped once.
     */
    private boolean snippetStartNodeOverwritten_;
    /** Size of {@link #stack_} at the end of the constructor. */
    private final int initialSize_;
    /** Node that currently receives new children. */
    private DomNode currentNode_;
    /** Constructor flag; when set, created script elements may be marked as created by the DOM parser. */
    private final boolean createdByJavascript_;
    /** Buffer collecting character data until the next structural event flushes it. */
    private final XMLString characters_ = new XMLString();
    /** State of head handling, see {@link HeadParsed}. */
    private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
    /** Most recently created body element (outside of svg); target for attributes of ignored tags. */
    private HtmlElement body_;
    /** Synthesized flag of the tag reported by the last XNI start/end element callback. */
    private boolean lastTagWasSynthesized_;
    /** Form that adopts submittable elements created while it is set. */
    private HtmlForm consumingForm_;
    /** When set, elements arriving while a form is the current node are appended to the form's parent. */
    private boolean formEndingIsAdjusting_;
    /** True between the creation of an svg element and the matching end tag. */
    private boolean insideSvg_;
    /** True between the creation of a template element and the matching end tag. */
    private boolean insideTemplate_;

    /** Parser feature id switched on in the constructor. */
    private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
    /** Parser feature id; enabled in the constructor only when JavaScript is disabled for the web client. */
    private static final String FEATURE_PARSE_NOSCRIPT
        = "http://cyberneko.org/html/features/parse-noscript-content";
142 
143     
144 
145 
146 
147     @Override
148     public void pushInputString(final String html) {
149         page_.registerParsingStart();
150         page_.registerInlineSnippetParsingStart();
151         try {
152             final WebResponse webResponse = page_.getWebResponse();
153             final Charset charset = webResponse.getContentCharset();
154             final String url = webResponse.getWebRequest().getUrl().toString();
155             final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
156             ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
157         }
158         finally {
159             page_.registerParsingEnd();
160             page_.registerInlineSnippetParsingEnd();
161         }
162     }
163 
164     
165 
166 
167 
168 
169 
170     HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
171             final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
172         super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));
173 
174         htmlParser_ = htmlParser;
175         page_ = (HtmlPage) node.getPage();
176 
177         currentNode_ = node;
178         for (final Node ancestor : currentNode_.getAncestors()) {
179             stack_.push((DomNode) ancestor);
180         }
181         createdByJavascript_ = createdByJavascript;
182 
183         final WebClient webClient = page_.getWebClient();
184         final HTMLParserListener listener = webClient.getHTMLParserListener();
185         final boolean reportErrors = listener != null;
186         if (reportErrors) {
187             parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(listener, url, htmlContent));
188         }
189 
190         try {
191             setFeature(FEATURE_AUGMENTATIONS, true);
192             setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
193             setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.isJavaScriptEnabled());
194             setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);
195 
196             setContentHandler(this);
197             setLexicalHandler(this); 
198         }
199         catch (final SAXException e) {
200             throw new ObjectInstantiationException("unable to create HTML parser", e);
201         }
202         initialSize_ = stack_.size();
203     }
204 
205     
206 
207 
208 
209     private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
210         
211         
212         
213 
214         if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
215             return new HTMLConfiguration(new HTMLElements.HTMLElementsWithCache(HTMLELEMENTS_WITH_CMD));
216         }
217         return new HTMLConfiguration(new HTMLElements.HTMLElementsWithCache(HTMLELEMENTS));
218     }
219 
220     
221 
222 
    /**
     * Stores the locator; it is used later to read line and column numbers for node locations.
     *
     * @param locator the locator provided by the parser
     */
    @Override
    public void setDocumentLocator(final Locator locator) {
        locator_ = locator;
    }
227 
228     
229 
230 
    /**
     * Start of document notification; nothing has to be prepared here.
     */
    @Override
    public void startDocument() throws SAXException {
        // nothing to do
    }
235 
236     
    /**
     * XNI start element callback. Remembers whether the tag was synthesized before
     * delegating to the super class.
     */
    @Override
    public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
        throws XNIException {
        // has to be stored before the super call; the SAX startElement below reads this flag
        lastTagWasSynthesized_ = augs.isSynthesized();
        super.startElement(element, attributes, augs);
    }
244 
245     
246 
247 
248     @Override
249     public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
250         throws SAXException {
251 
252         if (snippetStartNodeOverwritten_) {
253             snippetStartNodeOverwritten_ = false;
254             return;
255         }
256         handleCharacters();
257 
258         final String tagLower = StringUtils.toRootLowerCase(localName);
259         if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
260             
261             
262             stack_.push(currentNode_);
263             return;
264         }
265 
266         if ("head".equals(tagLower)) {
267             if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
268                 
269                 
270                 stack_.push(currentNode_);
271                 return;
272             }
273 
274             headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
275         }
276 
277         
278         
279         HtmlBody oldBody = null;
280         final boolean isBodyTag = "body".equals(tagLower);
281         if (isBodyTag) {
282             final HtmlBody body = page_.getBody();
283             if (body != null) {
284                 oldBody = body;
285             }
286         }
287 
288         if (namespaceURI != null) {
289             namespaceURI = namespaceURI.trim();
290         }
291         
292         if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
293             namespaceURI = null;
294         }
295 
296         final ElementFactory factory =
297                 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
298         if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
299             namespaceURI = Html.SVG_NAMESPACE;
300         }
301 
302         final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
303         newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
304 
305         
306         addNodeToRightParent(currentNode_, newElement);
307 
308         if (newElement instanceof HtmlSvg) {
309             insideSvg_ = true;
310         }
311         else if (newElement instanceof HtmlTemplate) {
312             insideTemplate_ = true;
313         }
314 
315         
316         
317         else if (newElement instanceof HtmlForm) {
318             consumingForm_ = (HtmlForm) newElement;
319             formEndingIsAdjusting_ = false;
320         }
321         else if (consumingForm_ != null) {
322             
323             if (newElement instanceof SubmittableElement) {
324                 
325                 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
326                     ((HtmlElement) newElement).setOwningForm(consumingForm_);
327                 }
328             }
329         }
330 
331         
332         
333         if (oldBody != null) {
334             oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
335         }
336 
337         if (!insideSvg_ && isBodyTag) {
338             body_ = (HtmlElement) newElement;
339         }
340         else if (createdByJavascript_
341                 && newElement instanceof ScriptElement
342                 && (!insideTemplate_
343                         || !page_.getWebClient().getBrowserVersion()
344                                 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
345             final ScriptElement script = (ScriptElement) newElement;
346             script.markAsCreatedByDomParser();
347         }
348 
349         currentNode_ = newElement;
350         stack_.push(currentNode_);
351     }
352 
353     
354 
355 
356 
357     private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
358         final String currentNodeName = currentNode.getNodeName();
359         final String newNodeName = newElement.getNodeName();
360 
361         
362         if (isTableChild(newNodeName)) {
363             final DomNode parent =
364                     "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
365             appendChild(parent, newElement);
366             return;
367         }
368         if ("tr".equals(newNodeName)) {
369             final DomNode parent =
370                     isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
371             appendChild(parent, newElement);
372             return;
373         }
374         if (isTableCell(newNodeName)) {
375             final DomNode parent =
376                     "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
377             appendChild(parent, newElement);
378             return;
379         }
380 
381         
382         if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
383             if ("template".equals(newNodeName)) {
384                 currentNode.appendChild(newElement);
385             }
386 
387             
388             else if (!"colgroup".equals(currentNodeName)
389                     && ("script".equals(newNodeName)
390                         || "form".equals(newNodeName)
391                         || "style".equals(newNodeName))) {
392                 currentNode.appendChild(newElement);
393             }
394 
395             
396             else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
397                 currentNode.appendChild(newElement);
398             }
399             else if ("caption".equals(currentNodeName)) {
400                 currentNode.appendChild(newElement);
401             }
402             else if (newElement instanceof HtmlHiddenInput) {
403                 currentNode.appendChild(newElement);
404             }
405             else {
406                 
407                 final DomNode parent = findElementOnStack("table");
408                 parent.insertBefore(newElement);
409             }
410             return;
411         }
412 
413         if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
414             
415             appendChild(currentNode.getParentNode(), newElement);
416             return;
417         }
418 
419         
420         appendChild(currentNode, newElement);
421     }
422 
423     private DomNode findElementOnStack(final String searchedElementName) {
424         for (final DomNode node : stack_) {
425             if (searchedElementName.equals(node.getNodeName())) {
426                 return node;
427             }
428         }
429 
430         
431         return stack_.peek();
432     }
433 
434     private DomNode findElementOnStack(final String... searchedElementNames) {
435         for (final DomNode node : stack_) {
436             for (final String searchedElementName : searchedElementNames) {
437                 if (searchedElementName.equals(node.getNodeName())) {
438                     return node;
439                 }
440             }
441         }
442 
443         
444         return stack_.peek();
445     }
446 
447     private static boolean isTableChild(final String nodeName) {
448         if (nodeName == null || nodeName.length() < 5) {
449             return false;
450         }
451 
452         return "thead".equals(nodeName)
453                 || "tbody".equals(nodeName)
454                 || "tfoot".equals(nodeName)
455                 || "caption".equals(nodeName)
456                 || "colgroup".equals(nodeName);
457     }
458 
459     private static boolean isTableCell(final String nodeName) {
460         if (nodeName == null || nodeName.length() != 2) {
461             return false;
462         }
463         return "td".equals(nodeName) || "th".equals(nodeName);
464     }
465 
466     
    /**
     * XNI end element callback. Remembers whether the tag was synthesized before
     * delegating to the super class.
     */
    @Override
    public void endElement(final QName element, final Augmentations augs)
        throws XNIException {
        // has to be stored before the super call; the SAX endElement below reads this flag
        lastTagWasSynthesized_ = augs.isSynthesized();
        super.endElement(element, augs);
    }
474 
475     
476 
477 
478     @Override
479     public void endElement(final String namespaceURI, final String localName, final String qName)
480         throws SAXException {
481 
482         final String tagLower = StringUtils.toRootLowerCase(localName);
483 
484         handleCharacters();
485 
486         if (page_.isParsingHtmlSnippet()) {
487             if ("html".equals(tagLower) || "body".equals(tagLower)) {
488                 return;
489             }
490             if (stack_.size() == initialSize_) {
491                 
492                 
493                 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
494                 return;
495             }
496         }
497 
498         if ("svg".equals(tagLower)) {
499             insideSvg_ = false;
500         }
501         else if ("template".equals(tagLower)) {
502             insideTemplate_ = false;
503         }
504 
505         
506         
507         
508         if (stack_.isEmpty()) {
509             return;
510         }
511 
512         final DomNode previousNode = stack_.pop(); 
513         previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
514 
515         if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
516             
517             
518             consumingForm_ = null;
519         }
520 
521         if (!stack_.isEmpty()) {
522             currentNode_ = stack_.peek();
523         }
524 
525         final boolean postponed = page_.isParsingInlineHtmlSnippet();
526         previousNode.onAllChildrenAddedToPage(postponed);
527     }
528 
529     
    /**
     * Collects the characters in the internal buffer; the text node is created later
     * by {@code handleCharacters()}.
     */
    @Override
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        characters_.append(ch, start, length);
    }
534 
535     
536     @Override
537     public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
538         characters_.append(ch, start, length);
539     }
540 
541     
542 
543 
544     private void handleCharacters() {
545         
546         if (characters_.length() == 0) {
547             return;
548         }
549 
550         
551         final String textValue = characters_.toString();
552         characters_.clear();
553 
554         if (StringUtils.isBlank(textValue)) {
555             appendChild(currentNode_, new DomText(page_, textValue));
556             return;
557         }
558 
559         
560         if (currentNode_ instanceof HtmlTableRow) {
561             final HtmlTableRow row = (HtmlTableRow) currentNode_;
562             final HtmlTable enclosingTable = row.getEnclosingTable();
563             if (enclosingTable != null) { 
564                 if (enclosingTable.getPreviousSibling() instanceof DomText) {
565                     final DomText domText = (DomText) enclosingTable.getPreviousSibling();
566                     domText.setTextContent(domText.getWholeText() + textValue);
567                 }
568                 else {
569                     enclosingTable.insertBefore(new DomText(page_, textValue));
570                 }
571             }
572         }
573         else if (currentNode_ instanceof HtmlTable) {
574             final HtmlTable enclosingTable = (HtmlTable) currentNode_;
575             if (enclosingTable.getPreviousSibling() instanceof DomText) {
576                 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
577                 domText.setTextContent(domText.getWholeText() + textValue);
578             }
579             else {
580                 enclosingTable.insertBefore(new DomText(page_, textValue));
581             }
582         }
583         else if (currentNode_ instanceof HtmlImage) {
584             currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
585         }
586         else {
587             appendChild(currentNode_, new DomText(page_, textValue));
588         }
589     }
590 
591     
592     @Override
593     public void endDocument() throws SAXException {
594         handleCharacters();
595         if (locator_ != null) {
596             page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
597         }
598     }
599 
600     
    /**
     * Prefix mappings are not processed by this builder.
     */
    @Override
    public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
        // nothing to do
    }
605 
606     
    /**
     * Prefix mappings are not processed by this builder.
     */
    @Override
    public void endPrefixMapping(final String prefix) throws SAXException {
        // nothing to do
    }
611 
612     
    /**
     * Processing instructions are not processed by this builder.
     */
    @Override
    public void processingInstruction(final String target, final String data) throws SAXException {
        // nothing to do
    }
617 
618     
    /**
     * Skipped entities are not processed by this builder.
     */
    @Override
    public void skippedEntity(final String name) throws SAXException {
        // nothing to do
    }
623 
624     
625 
626     
627     @Override
628     public void comment(final char[] ch, final int start, final int length) {
629         handleCharacters();
630         final String data = new String(ch, start, length);
631         final DomComment comment = new DomComment(page_, data);
632         appendChild(currentNode_, comment);
633     }
634 
635     
636     @Override
637     public void endCDATA() {
638         final String data = characters_.toString();
639         characters_.clear();
640 
641         final DomCDataSection cdataSection = new DomCDataSection(page_, data);
642         appendChild(currentNode_, cdataSection);
643     }
644 
645     
    /**
     * The end of the DTD requires no action; the doctype is handled in {@code startDTD}.
     */
    @Override
    public void endDTD() {
        // nothing to do
    }
650 
651     
    /**
     * Entity boundaries are not processed by this builder.
     */
    @Override
    public void endEntity(final String name) {
        // nothing to do
    }
656 
657     
    /**
     * Lexical callback for the start of a CDATA section: flushes the text collected so far,
     * so that the buffer only holds the CDATA content when {@code endCDATA()} is called.
     */
    @Override
    public void startCDATA() {
        handleCharacters();
    }
662 
663     
664     @Override
665     public void startDTD(final String name, final String publicId, final String systemId) {
666         final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
667         page_.setDocumentType(type);
668 
669         final Node child;
670         child = type;
671         page_.appendChild(child);
672     }
673 
674     
    /**
     * Entity boundaries are not processed by this builder.
     */
    @Override
    public void startEntity(final String name) {
        // nothing to do
    }
679 
680     
681 
682 
683     @Override
684     public void ignoredEndElement(final QName element, final Augmentations augs) {
685         
686         
687         if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
688             consumingForm_ = null;
689 
690             if (findElementOnStack("table", "form") instanceof HtmlTable) {
691                 
692             }
693             else {
694                 
695 
696 
697 
698 
699 
700 
701 
702 
703 
704 
705 
706 
707 
708 
709 
710 
711 
712 
713 
714 
715 
716 
717 
718 
719                 
720                 
721                 formEndingIsAdjusting_ = true;
722             }
723         }
724     }
725 
726     
727 
728 
729     @Override
730     public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
731         
732         
733         if (attrs != null && body_ != null) {
734             final String lp = elem.getLocalpart();
735             if (lp != null && lp.length() == 4) {
736                 if ("body".equalsIgnoreCase(lp)) {
737                     copyAttributes(body_, attrs);
738                 }
739                 else if ("html".equalsIgnoreCase(lp)) {
740                     final DomNode parent = body_.getParentNode();
741                     if (parent instanceof DomElement) {
742                         copyAttributes((DomElement) parent, attrs);
743                     }
744                 }
745             }
746         }
747     }
748 
749     private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
750         final int length = attrs.getLength();
751 
752         for (int i = 0; i < length; i++) {
753             final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
754             if (to.getAttributes().getNamedItem(attrName) == null) {
755                 to.setAttribute(attrName, attrs.getValue(i));
756                 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
757                         && to.getScriptableObject() instanceof HTMLBodyElement) {
758                     final HTMLBodyElement jsBody = to.getScriptableObject();
759                     jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
760                 }
761             }
762         }
763     }
764 
765     
766 
767 
768     @Override
769     public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
770         final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
771         page_.setDOMBuilder(this);
772         try {
773             super.parse(inputSource);
774         }
775         finally {
776             page_.setDOMBuilder(oldBuilder);
777         }
778     }
779 
780     private static void appendChild(final DomNode parent, final DomNode child) {
781         if (parent instanceof HtmlTemplate) {
782             ((HtmlTemplate) parent).getContent().appendChild(child);
783             return;
784         }
785 
786         parent.appendChild(child);
787     }
788 }