1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18 import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.net.URL;
23 import java.nio.charset.Charset;
24 import java.util.ArrayDeque;
25 import java.util.Deque;
26
27 import org.htmlunit.BrowserVersion;
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.WebClient;
30 import org.htmlunit.WebResponse;
31 import org.htmlunit.cyberneko.HTMLConfiguration;
32 import org.htmlunit.cyberneko.HTMLElements;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35 import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36 import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37 import org.htmlunit.cyberneko.xerces.xni.QName;
38 import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39 import org.htmlunit.cyberneko.xerces.xni.XMLString;
40 import org.htmlunit.cyberneko.xerces.xni.XNIException;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43 import org.htmlunit.html.DomCDataSection;
44 import org.htmlunit.html.DomComment;
45 import org.htmlunit.html.DomDocumentType;
46 import org.htmlunit.html.DomElement;
47 import org.htmlunit.html.DomNode;
48 import org.htmlunit.html.DomText;
49 import org.htmlunit.html.ElementFactory;
50 import org.htmlunit.html.Html;
51 import org.htmlunit.html.HtmlBody;
52 import org.htmlunit.html.HtmlElement;
53 import org.htmlunit.html.HtmlForm;
54 import org.htmlunit.html.HtmlHiddenInput;
55 import org.htmlunit.html.HtmlImage;
56 import org.htmlunit.html.HtmlPage;
57 import org.htmlunit.html.HtmlSvg;
58 import org.htmlunit.html.HtmlTable;
59 import org.htmlunit.html.HtmlTableRow;
60 import org.htmlunit.html.HtmlTemplate;
61 import org.htmlunit.html.ScriptElement;
62 import org.htmlunit.html.SubmittableElement;
63 import org.htmlunit.html.XHtmlPage;
64 import org.htmlunit.html.parser.HTMLParser;
65 import org.htmlunit.html.parser.HTMLParserDOMBuilder;
66 import org.htmlunit.html.parser.HTMLParserListener;
67 import org.htmlunit.javascript.host.html.HTMLBodyElement;
68 import org.htmlunit.util.StringUtils;
69 import org.w3c.dom.Node;
70 import org.xml.sax.Attributes;
71 import org.xml.sax.ContentHandler;
72 import org.xml.sax.Locator;
73 import org.xml.sax.SAXException;
74 import org.xml.sax.ext.LexicalHandler;
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96 final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
97 implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
98
99
/** Plain element table; handed to the parser when the browser lacks {@code HTML_COMMAND_TAG}. */
private static final HTMLElements HTMLELEMENTS;

/** Element table that additionally registers the {@code COMMAND} element. */
private static final HTMLElements HTMLELEMENTS_WITH_CMD;

static {
    // The extra element gets the first code following HTMLElements.UNKNOWN.
    final short commandCode = HTMLElements.UNKNOWN + 1;
    final HTMLElements.Element commandElement = new HTMLElements.Element(
            commandCode,
            "COMMAND",
            HTMLElements.Element.EMPTY,
            new short[] {HTMLElements.BODY, HTMLElements.HEAD},
            null);

    HTMLELEMENTS = new HTMLElements();

    final HTMLElements elementsWithCommand = new HTMLElements();
    elementsWithCommand.setElement(commandElement);
    HTMLELEMENTS_WITH_CMD = elementsWithCommand;
}
116
/**
 * Tracks whether a {@code head} start tag has been processed: {@code YES} for a tag that was
 * not synthesized, {@code SYNTHESIZED} for one flagged as synthesized, {@code NO} otherwise.
 */
private enum HeadParsed { YES, SYNTHESIZED, NO }

/** Parser facade; used to look up the element factory for each start tag. */
private final HTMLParser htmlParser_;
/** The page that owns every node created by this builder. */
private final HtmlPage page_;

/** Position source installed through {@link #setDocumentLocator(Locator)}; may stay {@code null}. */
private Locator locator_;
/** Stack of the nodes that are currently open; the head of the deque is the innermost one. */
private final Deque<DomNode> stack_ = new ArrayDeque<>();


/** When set, the next SAX start tag is swallowed once (see the snippet handling in endElement). */
private boolean snippetStartNodeOverwritten_;
/** Size of {@link #stack_} at the end of construction. */
private final int initialSize_;
/** Node that receives newly created children. */
private DomNode currentNode_;
/** Constructor flag; controls whether script elements get marked as created by the DOM parser. */
private final boolean createdByJavascript_;
/** Buffer collecting character data until it is flushed by handleCharacters(). */
private final XMLString characters_ = new XMLString();
/** State of {@code head} processing. */
private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
/** The body element created by this builder (only recorded outside of svg). */
private HtmlElement body_;
/** Whether the most recent XNI start/end tag was reported as synthesized. */
private boolean lastTagWasSynthesized_;
/** The form whose start tag was seen last and which has not been ended yet, if any. */
private HtmlForm consumingForm_;
/** Set once an ignored form end tag was seen without a table ahead of the form on the stack. */
private boolean formEndingIsAdjusting_;
/** True between an svg start tag and the next svg end tag. */
private boolean insideSvg_;
/** True between a template start tag and the next template end tag. */
private boolean insideTemplate_;

/** Parser feature id switched on in the constructor. */
private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
/** Parser feature id; enabled only when JavaScript is disabled for the web client. */
private static final String FEATURE_PARSE_NOSCRIPT
    = "http://cyberneko.org/html/features/parse-noscript-content";
142
143
144
145
146
147 @Override
148 public void pushInputString(final String html) {
149 page_.registerParsingStart();
150 page_.registerInlineSnippetParsingStart();
151 try {
152 final WebResponse webResponse = page_.getWebResponse();
153 final Charset charset = webResponse.getContentCharset();
154 final String url = webResponse.getWebRequest().getUrl().toString();
155 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
156 ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
157 }
158 finally {
159 page_.registerParsingEnd();
160 page_.registerInlineSnippetParsingEnd();
161 }
162 }
163
164
165
166
167
168
169
/**
 * Creates a builder that adds the parsed nodes below the given node.
 *
 * @param htmlParser the parser used to look up element factories
 * @param node the node that becomes the initial current node; its page must be an {@link HtmlPage}
 * @param url passed on to the error handler (only when a parser listener is registered)
 * @param htmlContent passed on to the error handler (only when a parser listener is registered)
 * @param createdByJavascript stored and later used when script elements are created
 * @throws ObjectInstantiationException if configuring the parser features or handlers fails
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));

    htmlParser_ = htmlParser;
    page_ = (HtmlPage) node.getPage();
    createdByJavascript_ = createdByJavascript;

    // Seed the stack with the ancestors of the start node and remember its depth.
    currentNode_ = node;
    for (final Node ancestor : node.getAncestors()) {
        stack_.push((DomNode) ancestor);
    }
    initialSize_ = stack_.size();

    // Errors are only reported when somebody listens for them.
    final WebClient client = page_.getWebClient();
    final HTMLParserListener parserListener = client.getHTMLParserListener();
    final boolean reportErrors = parserListener != null;
    if (reportErrors) {
        parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent));
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);
        setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
        setFeature(FEATURE_PARSE_NOSCRIPT, !client.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        // this builder consumes its own SAX events
        setContentHandler(this);
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }
}
204
205
206
207
208
209 private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
210 if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
211 return new HTMLConfiguration(HTMLELEMENTS_WITH_CMD);
212 }
213 return new HTMLConfiguration(HTMLELEMENTS);
214 }
215
216
217
218
/**
 * Stores the locator; it is read later to record start/end positions on nodes and on the page.
 *
 * @param locator the locator supplied by the parser
 */
@Override
public void setDocumentLocator(final Locator locator) {
    locator_ = locator;
}
223
224
225
226
/**
 * Intentionally empty; nothing is done at document start.
 *
 * @throws SAXException never thrown by this implementation
 */
@Override
public void startDocument() throws SAXException {
    // nothing to do
}
231
232
/**
 * XNI start tag callback. Remembers whether the tag is flagged as synthesized
 * and then hands the event to the superclass.
 *
 * @param element the element name
 * @param attributes the element attributes
 * @param augs the augmentations carrying the synthesized flag
 * @throws XNIException if the superclass processing fails
 */
@Override
public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
    throws XNIException {

    // captured here because the SAX variant of startElement has no access to the augmentations
    lastTagWasSynthesized_ = augs.isSynthesized();
    super.startElement(element, attributes, augs);
}
240
241
242
243
244 @Override
245 public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
246 throws SAXException {
247
248 if (snippetStartNodeOverwritten_) {
249 snippetStartNodeOverwritten_ = false;
250 return;
251 }
252 handleCharacters();
253
254 final String tagLower = StringUtils.toRootLowerCase(localName);
255 if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
256
257
258 stack_.push(currentNode_);
259 return;
260 }
261
262 if ("head".equals(tagLower)) {
263 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
264
265
266 stack_.push(currentNode_);
267 return;
268 }
269
270 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
271 }
272
273
274
275 HtmlBody oldBody = null;
276 final boolean isBodyTag = "body".equals(tagLower);
277 if (isBodyTag) {
278 final HtmlBody body = page_.getBody();
279 if (body != null) {
280 oldBody = body;
281 }
282 }
283
284 if (namespaceURI != null) {
285 namespaceURI = namespaceURI.trim();
286 }
287
288 if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
289 namespaceURI = null;
290 }
291
292 final ElementFactory factory =
293 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
294 if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
295 namespaceURI = Html.SVG_NAMESPACE;
296 }
297
298 final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
299 newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
300
301
302 addNodeToRightParent(currentNode_, newElement);
303
304 if (newElement instanceof HtmlSvg) {
305 insideSvg_ = true;
306 }
307 else if (newElement instanceof HtmlTemplate) {
308 insideTemplate_ = true;
309 }
310
311
312
313 else if (newElement instanceof HtmlForm) {
314 consumingForm_ = (HtmlForm) newElement;
315 formEndingIsAdjusting_ = false;
316 }
317 else if (consumingForm_ != null) {
318
319 if (newElement instanceof SubmittableElement) {
320
321 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
322 ((HtmlElement) newElement).setOwningForm(consumingForm_);
323 }
324 }
325 }
326
327
328
329 if (oldBody != null) {
330 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
331 }
332
333 if (!insideSvg_ && isBodyTag) {
334 body_ = (HtmlElement) newElement;
335 }
336 else if (createdByJavascript_
337 && newElement instanceof ScriptElement
338 && (!insideTemplate_
339 || !page_.getWebClient().getBrowserVersion()
340 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
341 final ScriptElement script = (ScriptElement) newElement;
342 script.markAsCreatedByDomParser();
343 }
344
345 currentNode_ = newElement;
346 stack_.push(currentNode_);
347 }
348
349
350
351
352
353 private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
354 final String currentNodeName = currentNode.getNodeName();
355 final String newNodeName = newElement.getNodeName();
356
357
358 if (isTableChild(newNodeName)) {
359 final DomNode parent =
360 "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
361 appendChild(parent, newElement);
362 return;
363 }
364 if ("tr".equals(newNodeName)) {
365 final DomNode parent =
366 isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
367 appendChild(parent, newElement);
368 return;
369 }
370 if (isTableCell(newNodeName)) {
371 final DomNode parent =
372 "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
373 appendChild(parent, newElement);
374 return;
375 }
376
377
378 if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
379 if ("template".equals(newNodeName)) {
380 currentNode.appendChild(newElement);
381 }
382
383
384 else if (!"colgroup".equals(currentNodeName)
385 && ("script".equals(newNodeName)
386 || "form".equals(newNodeName)
387 || "style".equals(newNodeName))) {
388 currentNode.appendChild(newElement);
389 }
390
391
392 else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
393 currentNode.appendChild(newElement);
394 }
395 else if ("caption".equals(currentNodeName)) {
396 currentNode.appendChild(newElement);
397 }
398 else if (newElement instanceof HtmlHiddenInput) {
399 currentNode.appendChild(newElement);
400 }
401 else {
402
403 final DomNode parent = findElementOnStack("table");
404 parent.insertBefore(newElement);
405 }
406 return;
407 }
408
409 if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
410
411 appendChild(currentNode.getParentNode(), newElement);
412 return;
413 }
414
415
416 appendChild(currentNode, newElement);
417 }
418
419 private DomNode findElementOnStack(final String searchedElementName) {
420 for (final DomNode node : stack_) {
421 if (searchedElementName.equals(node.getNodeName())) {
422 return node;
423 }
424 }
425
426
427 return stack_.peek();
428 }
429
430 private DomNode findElementOnStack(final String... searchedElementNames) {
431 for (final DomNode node : stack_) {
432 for (final String searchedElementName : searchedElementNames) {
433 if (searchedElementName.equals(node.getNodeName())) {
434 return node;
435 }
436 }
437 }
438
439
440 return stack_.peek();
441 }
442
443 private static boolean isTableChild(final String nodeName) {
444 if (nodeName == null || nodeName.length() < 5) {
445 return false;
446 }
447
448 return "thead".equals(nodeName)
449 || "tbody".equals(nodeName)
450 || "tfoot".equals(nodeName)
451 || "caption".equals(nodeName)
452 || "colgroup".equals(nodeName);
453 }
454
455 private static boolean isTableCell(final String nodeName) {
456 if (nodeName == null || nodeName.length() != 2) {
457 return false;
458 }
459 return "td".equals(nodeName) || "th".equals(nodeName);
460 }
461
462
/**
 * XNI end tag callback. Remembers whether the tag is flagged as synthesized
 * and then hands the event to the superclass.
 *
 * @param element the element name
 * @param augs the augmentations carrying the synthesized flag
 * @throws XNIException if the superclass processing fails
 */
@Override
public void endElement(final QName element, final Augmentations augs)
    throws XNIException {

    // captured here because the SAX variant of endElement has no access to the augmentations
    lastTagWasSynthesized_ = augs.isSynthesized();
    super.endElement(element, augs);
}
470
471
472
473
474 @Override
475 public void endElement(final String namespaceURI, final String localName, final String qName)
476 throws SAXException {
477
478 final String tagLower = StringUtils.toRootLowerCase(localName);
479
480 handleCharacters();
481
482 if (page_.isParsingHtmlSnippet()) {
483 if ("html".equals(tagLower) || "body".equals(tagLower)) {
484 return;
485 }
486 if (stack_.size() == initialSize_) {
487
488
489 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
490 return;
491 }
492 }
493
494 if ("svg".equals(tagLower)) {
495 insideSvg_ = false;
496 }
497 else if ("template".equals(tagLower)) {
498 insideTemplate_ = false;
499 }
500
501
502
503
504 if (stack_.isEmpty()) {
505 return;
506 }
507
508 final DomNode previousNode = stack_.pop();
509 previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
510
511 if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
512
513
514 consumingForm_ = null;
515 }
516
517 if (!stack_.isEmpty()) {
518 currentNode_ = stack_.peek();
519 }
520
521 final boolean postponed = page_.isParsingInlineHtmlSnippet();
522 previousNode.onAllChildrenAddedToPage(postponed);
523 }
524
525
/**
 * Buffers the character data; the text node is created later when the buffer is flushed.
 *
 * @param ch the character array
 * @param start the offset of the first character to use
 * @param length the number of characters to use
 * @throws SAXException never thrown by this implementation
 */
@Override
public void characters(final char[] ch, final int start, final int length) throws SAXException {
    characters_.append(ch, start, length);
}
530
531
/**
 * Buffers the whitespace exactly like regular character data.
 *
 * @param ch the character array
 * @param start the offset of the first character to use
 * @param length the number of characters to use
 * @throws SAXException never thrown by this implementation
 */
@Override
public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
    characters_.append(ch, start, length);
}
536
537
538
539
540 private void handleCharacters() {
541
542 if (characters_.length() == 0) {
543 return;
544 }
545
546
547 final String textValue = characters_.toString();
548 characters_.clear();
549
550 if (StringUtils.isBlank(textValue)) {
551 appendChild(currentNode_, new DomText(page_, textValue));
552 return;
553 }
554
555
556 if (currentNode_ instanceof HtmlTableRow) {
557 final HtmlTableRow row = (HtmlTableRow) currentNode_;
558 final HtmlTable enclosingTable = row.getEnclosingTable();
559 if (enclosingTable != null) {
560 if (enclosingTable.getPreviousSibling() instanceof DomText) {
561 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
562 domText.setTextContent(domText.getWholeText() + textValue);
563 }
564 else {
565 enclosingTable.insertBefore(new DomText(page_, textValue));
566 }
567 }
568 }
569 else if (currentNode_ instanceof HtmlTable) {
570 final HtmlTable enclosingTable = (HtmlTable) currentNode_;
571 if (enclosingTable.getPreviousSibling() instanceof DomText) {
572 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
573 domText.setTextContent(domText.getWholeText() + textValue);
574 }
575 else {
576 enclosingTable.insertBefore(new DomText(page_, textValue));
577 }
578 }
579 else if (currentNode_ instanceof HtmlImage) {
580 currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
581 }
582 else {
583 appendChild(currentNode_, new DomText(page_, textValue));
584 }
585 }
586
587
588 @Override
589 public void endDocument() throws SAXException {
590 handleCharacters();
591 if (locator_ != null) {
592 page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
593 }
594 }
595
596
/**
 * Intentionally empty; prefix mappings are not tracked by this builder.
 *
 * @param prefix the namespace prefix (ignored)
 * @param uri the namespace uri (ignored)
 * @throws SAXException never thrown by this implementation
 */
@Override
public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
    // nothing to do
}
601
602
/**
 * Intentionally empty; prefix mappings are not tracked by this builder.
 *
 * @param prefix the namespace prefix (ignored)
 * @throws SAXException never thrown by this implementation
 */
@Override
public void endPrefixMapping(final String prefix) throws SAXException {
    // nothing to do
}
607
608
/**
 * Intentionally empty; processing instructions produce no node.
 *
 * @param target the processing instruction target (ignored)
 * @param data the processing instruction data (ignored)
 * @throws SAXException never thrown by this implementation
 */
@Override
public void processingInstruction(final String target, final String data) throws SAXException {
    // nothing to do
}
613
614
/**
 * Intentionally empty; skipped entities are not reflected in the DOM.
 *
 * @param name the name of the skipped entity (ignored)
 * @throws SAXException never thrown by this implementation
 */
@Override
public void skippedEntity(final String name) throws SAXException {
    // nothing to do
}
619
620
621
622
623 @Override
624 public void comment(final char[] ch, final int start, final int length) {
625 handleCharacters();
626 final String data = new String(ch, start, length);
627 final DomComment comment = new DomComment(page_, data);
628 appendChild(currentNode_, comment);
629 }
630
631
632 @Override
633 public void endCDATA() {
634 final String data = characters_.toString();
635 characters_.clear();
636
637 final DomCDataSection cdataSection = new DomCDataSection(page_, data);
638 appendChild(currentNode_, cdataSection);
639 }
640
641
/**
 * Intentionally empty; the doctype node is already created in startDTD.
 */
@Override
public void endDTD() {
    // nothing to do
}
646
647
/**
 * Intentionally empty; entity boundaries are not reflected in the DOM.
 *
 * @param name the name of the entity (ignored)
 */
@Override
public void endEntity(final String name) {
    // nothing to do
}
652
653
/**
 * Flushes the pending text, so that the buffer is empty when the CDATA content
 * starts to arrive and endCDATA picks up only that content.
 */
@Override
public void startCDATA() {
    handleCharacters();
}
658
659
660 @Override
661 public void startDTD(final String name, final String publicId, final String systemId) {
662 final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
663 page_.setDocumentType(type);
664
665 final Node child;
666 child = type;
667 page_.appendChild(child);
668 }
669
670
/**
 * Intentionally empty; entity boundaries are not reflected in the DOM.
 *
 * @param name the name of the entity (ignored)
 */
@Override
public void startEntity(final String name) {
    // nothing to do
}
675
676
677
678
679 @Override
680 public void ignoredEndElement(final QName element, final Augmentations augs) {
681
682
683 if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
684 consumingForm_ = null;
685
686 if (findElementOnStack("table", "form") instanceof HtmlTable) {
687
688 }
689 else {
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717 formEndingIsAdjusting_ = true;
718 }
719 }
720 }
721
722
723
724
725 @Override
726 public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
727
728
729 if (attrs != null && body_ != null) {
730 final String lp = elem.getLocalpart();
731 if (lp != null && lp.length() == 4) {
732 if ("body".equalsIgnoreCase(lp)) {
733 copyAttributes(body_, attrs);
734 }
735 else if ("html".equalsIgnoreCase(lp)) {
736 final DomNode parent = body_.getParentNode();
737 if (parent instanceof DomElement) {
738 copyAttributes((DomElement) parent, attrs);
739 }
740 }
741 }
742 }
743 }
744
745 private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
746 final int length = attrs.getLength();
747
748 for (int i = 0; i < length; i++) {
749 final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
750 if (to.getAttributes().getNamedItem(attrName) == null) {
751 to.setAttribute(attrName, attrs.getValue(i));
752 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
753 && to.getScriptableObject() instanceof HTMLBodyElement) {
754 final HTMLBodyElement jsBody = to.getScriptableObject();
755 jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
756 }
757 }
758 }
759 }
760
761
762
763
764 @Override
765 public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
766 final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
767 page_.setDOMBuilder(this);
768 try {
769 super.parse(inputSource);
770 }
771 finally {
772 page_.setDOMBuilder(oldBuilder);
773 }
774 }
775
776 private static void appendChild(final DomNode parent, final DomNode child) {
777 if (parent instanceof HtmlTemplate) {
778 ((HtmlTemplate) parent).getContent().appendChild(child);
779 return;
780 }
781
782 parent.appendChild(child);
783 }
784 }