View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.net.URL;
21  import java.nio.charset.Charset;
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.concurrent.ConcurrentHashMap;
26  
27  import org.htmlunit.ObjectInstantiationException;
28  import org.htmlunit.Page;
29  import org.htmlunit.SgmlPage;
30  import org.htmlunit.WebAssert;
31  import org.htmlunit.WebClient;
32  import org.htmlunit.WebResponse;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancer;
35  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36  import org.htmlunit.cyberneko.xerces.xni.QName;
37  import org.htmlunit.cyberneko.xerces.xni.XNIException;
38  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41  import org.htmlunit.html.DefaultElementFactory;
42  import org.htmlunit.html.DomNode;
43  import org.htmlunit.html.ElementFactory;
44  import org.htmlunit.html.Html;
45  import org.htmlunit.html.HtmlPage;
46  import org.htmlunit.html.UnknownElementFactory;
47  import org.htmlunit.html.parser.HTMLParser;
48  import org.htmlunit.html.parser.HTMLParserListener;
49  import org.htmlunit.svg.SvgElementFactory;
50  import org.htmlunit.util.StringUtils;
51  import org.w3c.dom.Node;
52  import org.xml.sax.SAXException;
53  
54  /**
55   * <p>SAX parser implementation that uses the NekoHTML {@link org.htmlunit.cyberneko.HTMLConfiguration}
56   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
57   *
58   * @author Christian Sell
59   * @author David K. Taylor
60   * @author Chris Erskine
61   * @author Ahmed Ashour
62   * @author Marc Guillemot
63   * @author Ethan Glasser-Camp
64   * @author Sudhan Moghe
65   * @author Ronald Brill
66   * @author Frank Danek
67   * @author Carsten Steul
68   */
69  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70  
71      /**
72       * The SVG factory.
73       */
74      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75  
76      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
77  
78      static {
79          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82          }
83      }
84  
85      /**
86       * {@inheritDoc}
87       */
88      @Override
89      public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
90              final String source, final boolean createdByJavascript)
91          throws SAXException, IOException {
92          final Page page = parent.getPage();
93          if (!(page instanceof HtmlPage)) {
94              return;
95          }
96          final HtmlPage htmlPage = (HtmlPage) page;
97          final URL url = htmlPage.getUrl();
98  
99          final HtmlUnitNekoDOMBuilder domBuilder =
100                 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
101         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
102         // build fragment context stack
103         DomNode node = context;
104         final List<QName> ancestors = new ArrayList<>();
105         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
106             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
107             node = node.getParentNode();
108         }
109         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
110             ancestors.add(new QName(null, "html", null, null));
111             ancestors.add(new QName(null, "body", null, null));
112         }
113         else if (ancestors.size() == 1
114                 || (!"body".equals(ancestors.get(1).getLocalpart())
115                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
116             ancestors.add(new QName(null, "body", null, null));
117         }
118 
119         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
120         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
121 
122         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
123 
124         htmlPage.registerParsingStart();
125         htmlPage.registerSnippetParsingStart();
126         try {
127             domBuilder.parse(in);
128         }
129         finally {
130             htmlPage.registerParsingEnd();
131             htmlPage.registerSnippetParsingEnd();
132         }
133     }
134 
135     /**
136      * {@inheritDoc}
137      */
138     @Override
139     public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
140             final boolean xhtml, final boolean createdByJavascript) throws IOException {
141         final URL url = webResponse.getWebRequest().getUrl();
142         final HtmlUnitNekoDOMBuilder domBuilder =
143                 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
144 
145         final Charset charset = webResponse.getContentCharset();
146         try {
147             if (!webResponse.wasContentCharsetTentative()) {
148                 // The charset is certain so ignore any others found in the document
149                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
150             }
151 
152             // xml content is different
153             if (xhtml) {
154                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
155                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
156                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
157                 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
158                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
159             }
160 
161             if (webClient != null) {
162                 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
163                 if (bufferSize > 0) {
164                     domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
165                 }
166             }
167         }
168         catch (final Exception e) {
169             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
170         }
171 
172         try (InputStream content = webResponse.getContentAsStream()) {
173             final String encoding = charset.name();
174             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
175 
176             page.registerParsingStart();
177             try {
178                 domBuilder.parse(in);
179             }
180             catch (final XNIException e) {
181                 // extract enclosed exception
182                 final Throwable origin = extractNestedException(e);
183                 throw new RuntimeException("Failed parsing content from " + url, origin);
184             }
185         }
186         finally {
187             page.registerParsingEnd();
188         }
189     }
190 
191     /**
192      * Extract nested exception within an XNIException.
193      *
194      * @param e the original XNIException
195      * @return the cause exception
196      */
197     static Throwable extractNestedException(final Throwable e) {
198         Throwable originalException;
199         Throwable cause = e;
200         do {
201             originalException = cause;
202 
203             if (cause instanceof XNIException) {
204                 cause = cause.getCause();
205             }
206             else {
207                 cause = null;
208             }
209         }
210         while (cause != null);
211 
212         return originalException;
213     }
214 
215     /**
216      * {@inheritDoc}
217      */
218     @Override
219     public ElementFactory getSvgFactory() {
220         return SVG_FACTORY;
221     }
222 
223     /**
224      * {@inheritDoc}
225      */
226     @Override
227     public ElementFactory getFactory(final String tagName) {
228         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
229 
230         if (result != null) {
231             return result;
232         }
233         return UnknownElementFactory.INSTANCE;
234     }
235 
236     /**
237      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
238      *
239      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
240      * @param page the page
241      * @param namespaceURI the namespace URI
242      * @param qualifiedName the qualified name
243      * @param insideSvg is the node inside an SVG node or not
244      * @param svgSupport true if called from javascript createElementNS
245      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
246      */
247     @Override
248     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
249             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
250         if (insideSvg) {
251             return SVG_FACTORY;
252         }
253 
254         if (namespaceURI == null || namespaceURI.isEmpty()
255             || Html.XHTML_NAMESPACE.equals(namespaceURI)
256             || Html.SVG_NAMESPACE.equals(namespaceURI)
257             || !qualifiedName.contains(":")) {
258 
259             String tagName = qualifiedName;
260             final int index = tagName.indexOf(':');
261             if (index == -1) {
262                 tagName = StringUtils.toRootLowerCase(tagName);
263             }
264             else {
265                 tagName = tagName.substring(index + 1);
266             }
267             final ElementFactory factory;
268             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
269                 factory = SVG_FACTORY;
270             }
271             else {
272                 factory = ELEMENT_FACTORIES.get(tagName);
273             }
274 
275             if (factory != null) {
276                 return factory;
277             }
278         }
279         return UnknownElementFactory.INSTANCE;
280     }
281 }
282 
283 /**
284  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
285  */
286 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
287     private final HTMLParserListener listener_;
288     private final URL url_;
289     private final String html_;
290 
291     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
292         WebAssert.notNull("listener", listener);
293         WebAssert.notNull("url", url);
294         listener_ = listener;
295         url_ = url;
296         html_ = htmlContent;
297     }
298 
299     /**
300      * @see DefaultErrorHandler#error(String,String,XMLParseException)
301      */
302     @Override
303     public void error(final String domain, final String key,
304             final XMLParseException exception) throws XNIException {
305         listener_.error(exception.getMessage(),
306                 url_,
307                 html_,
308                 exception.getLineNumber(),
309                 exception.getColumnNumber(),
310                 key);
311     }
312 
313     /**
314      * @see DefaultErrorHandler#warning(String,String,XMLParseException)
315      */
316     @Override
317     public void warning(final String domain, final String key,
318             final XMLParseException exception) throws XNIException {
319         listener_.warning(exception.getMessage(),
320                 url_,
321                 html_,
322                 exception.getLineNumber(),
323                 exception.getColumnNumber(),
324                 key);
325     }
326 
327     @Override
328     public void fatalError(final String domain, final String key,
329             final XMLParseException exception) throws XNIException {
330         listener_.error(exception.getMessage(),
331                 url_,
332                 html_,
333                 exception.getLineNumber(),
334                 exception.getColumnNumber(),
335                 key);
336     }
337 }