View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.io.InputStreamReader;
22  import java.io.Reader;
23  import java.io.StringReader;
24  import java.nio.charset.Charset;
25  import java.util.Locale;
26  import java.util.Map;
27  
28  import javax.xml.parsers.DocumentBuilder;
29  import javax.xml.parsers.DocumentBuilderFactory;
30  import javax.xml.parsers.ParserConfigurationException;
31  
32  import org.apache.commons.io.input.BOMInputStream;
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.htmlunit.SgmlPage;
36  import org.htmlunit.WebResponse;
37  import org.htmlunit.html.DomAttr;
38  import org.htmlunit.html.DomCDataSection;
39  import org.htmlunit.html.DomComment;
40  import org.htmlunit.html.DomDocumentType;
41  import org.htmlunit.html.DomElement;
42  import org.htmlunit.html.DomNode;
43  import org.htmlunit.html.DomProcessingInstruction;
44  import org.htmlunit.html.DomText;
45  import org.htmlunit.html.ElementFactory;
46  import org.htmlunit.html.Html;
47  import org.htmlunit.xml.XmlPage;
48  import org.w3c.dom.Attr;
49  import org.w3c.dom.Document;
50  import org.w3c.dom.DocumentType;
51  import org.w3c.dom.NamedNodeMap;
52  import org.w3c.dom.Node;
53  import org.w3c.dom.NodeList;
54  import org.xml.sax.Attributes;
55  import org.xml.sax.ErrorHandler;
56  import org.xml.sax.InputSource;
57  import org.xml.sax.SAXException;
58  import org.xml.sax.SAXParseException;
59  import org.xml.sax.helpers.AttributesImpl;
60  
61  /**
62   * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
63   *
64   * Provides facility method to work with XML responses.
65   *
66   * @author Marc Guillemot
67   * @author Ahmed Ashour
68   * @author Sudhan Moghe
69   * @author Ronald Brill
70   * @author Chuck Dumont
71   * @author Frank Danek
72   */
73  public final class XmlUtils {
74  
75      private static final Log LOG = LogFactory.getLog(XmlUtils.class);
76  
77      private static final ErrorHandler DISCARD_MESSAGES_HANDLER = new ErrorHandler() {
78          /**
79           * Does nothing as we're not interested in this.
80           */
81          @Override
82          public void error(final SAXParseException exception) {
83              // Does nothing as we're not interested in this.
84          }
85  
86          /**
87           * Does nothing as we're not interested in this.
88           */
89          @Override
90          public void fatalError(final SAXParseException exception) {
91              // Does nothing as we're not interested in this.
92          }
93  
94          /**
95           * Does nothing as we're not interested in this.
96           */
97          @Override
98          public void warning(final SAXParseException exception) {
99              // Does nothing as we're not interested in this.
100         }
101     };
102 
103     /**
104      * Utility class, hide constructor.
105      */
106     private XmlUtils() {
107         // Empty.
108     }
109 
110     /**
111      * Builds a document from the content of the web response.
112      * A warning is logged if an exception is thrown while parsing the XML content
113      * (for instance when the content is not a valid XML and can't be parsed).
114      *
115      * @param webResponse the response from the server
116      * @throws IOException if the page could not be created
117      * @return the parse result
118      * @throws SAXException if the parsing fails
119      * @throws ParserConfigurationException if a DocumentBuilder cannot be created
120      */
121     public static Document buildDocument(final WebResponse webResponse)
122         throws IOException, SAXException, ParserConfigurationException {
123 
124         final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
125 
126         if (webResponse == null) {
127             return factory.newDocumentBuilder().newDocument();
128         }
129 
130         factory.setNamespaceAware(true);
131 
132         Charset charset = webResponse.getContentCharset();
133         try (InputStream is = webResponse.getContentAsStreamWithBomIfApplicable()) {
134             if (is instanceof BOMInputStream stream) {
135                 final String bomCharsetName = stream.getBOMCharsetName();
136                 if (bomCharsetName != null) {
137                     charset = Charset.forName(bomCharsetName);
138                 }
139             }
140 
141             try (InputStreamReader reader = new InputStreamReader(is, charset)) {
142                 // we have to do the blank input check and the parsing in one step
143                 final TrackBlankContentAndSkipLeadingWhitespaceReader tracker
144                         = new TrackBlankContentAndSkipLeadingWhitespaceReader(reader);
145 
146                 final InputSource source = new InputSource(tracker);
147                 final DocumentBuilder builder = factory.newDocumentBuilder();
148                 builder.setErrorHandler(DISCARD_MESSAGES_HANDLER);
149                 builder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
150                 try {
151                     // this closes the input source/stream
152                     return builder.parse(source);
153                 }
154                 catch (final SAXException e) {
155                     if (tracker.wasBlank()) {
156                         return factory.newDocumentBuilder().newDocument();
157                     }
158                     throw e;
159                 }
160             }
161         }
162     }
163 
164     /**
165      * Helper for memory and performance optimization.
166      */
167     private static final class TrackBlankContentAndSkipLeadingWhitespaceReader extends Reader {
168         private final Reader reader_;
169         private boolean wasBlank_ = true;
170 
171         TrackBlankContentAndSkipLeadingWhitespaceReader(final Reader characterStream) {
172             super();
173             reader_ = characterStream;
174         }
175 
176         public boolean wasBlank() {
177             return wasBlank_;
178         }
179 
180         @Override
181         public void close() throws IOException {
182             reader_.close();
183         }
184 
185         @Override
186         public int read(final char[] cbuf, final int off, final int len) throws IOException {
187             int result = reader_.read(cbuf, off, len);
188 
189             if (wasBlank_ && result > -1) {
190                 for (int i = 0; i < result; i++) {
191                     final char ch = cbuf[off + i];
192                     if (!Character.isWhitespace(ch)) {
193                         wasBlank_ = false;
194                         if (i > 0) {
195                             // skipt the leading whitespace
196                             System.arraycopy(cbuf, i, cbuf, off, len - i);
197                             result -= i;
198                         }
199                         break;
200                     }
201                 }
202             }
203             return result;
204         }
205     }
206 
207     /**
208      * Recursively appends a {@link Node} child to {@link DomNode} parent.
209      *
210      * @param page the owner page of {@link DomElement}s to be created
211      * @param parent the parent DomNode
212      * @param child the child Node
213      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
214      *     DOM elements
215      */
216     public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
217         final boolean handleXHTMLAsHTML) {
218         final DocumentType documentType = child.getOwnerDocument().getDoctype();
219         if (documentType != null && page instanceof XmlPage xmlPage) {
220             final DomDocumentType domDoctype = new DomDocumentType(
221                     page, documentType.getName(), documentType.getPublicId(), documentType.getSystemId());
222             xmlPage.setDocumentType(domDoctype);
223         }
224         final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML);
225         parent.appendChild(childXml);
226         copy(page, child, childXml, handleXHTMLAsHTML);
227     }
228 
229     private static DomNode createFrom(final SgmlPage page, final Node source, final boolean handleXHTMLAsHTML) {
230         if (source.getNodeType() == Node.TEXT_NODE) {
231             return new DomText(page, source.getNodeValue());
232         }
233         if (source.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
234             return new DomProcessingInstruction(page, source.getNodeName(), source.getNodeValue());
235         }
236         if (source.getNodeType() == Node.COMMENT_NODE) {
237             return new DomComment(page, source.getNodeValue());
238         }
239         if (source.getNodeType() == Node.DOCUMENT_TYPE_NODE) {
240             final DocumentType documentType = (DocumentType) source;
241             return new DomDocumentType(page, documentType.getName(), documentType.getPublicId(),
242                     documentType.getSystemId());
243         }
244         final String ns = source.getNamespaceURI();
245         String localName = source.getLocalName();
246         if (handleXHTMLAsHTML && Html.XHTML_NAMESPACE.equals(ns)) {
247             final ElementFactory factory = page.getWebClient().getPageCreator().getHtmlParser().getFactory(localName);
248             return factory.createElementNS(page, ns, localName,
249                     namedNodeMapToSaxAttributes(source.getAttributes()));
250         }
251         final NamedNodeMap nodeAttributes = source.getAttributes();
252         if (page != null && page.isHtmlPage()) {
253             localName = localName.toUpperCase(Locale.ROOT);
254         }
255         final String qualifiedName;
256         if (source.getPrefix() == null) {
257             qualifiedName = localName;
258         }
259         else {
260             qualifiedName = source.getPrefix() + ':' + localName;
261         }
262 
263         final String namespaceURI = source.getNamespaceURI();
264         if (Html.SVG_NAMESPACE.equals(namespaceURI)) {
265             return page.getWebClient().getPageCreator().getHtmlParser().getSvgFactory()
266                     .createElementNS(page, namespaceURI, qualifiedName,
267                             namedNodeMapToSaxAttributes(nodeAttributes));
268         }
269 
270         final OrderedFastHashMap<String, DomAttr> attributes = new OrderedFastHashMap<>();
271         for (int i = 0; i < nodeAttributes.getLength(); i++) {
272             final Attr attribute = (Attr) nodeAttributes.item(i);
273             final String attributeNamespaceURI = attribute.getNamespaceURI();
274             final String attributeQualifiedName;
275             if (attribute.getPrefix() == null) {
276                 attributeQualifiedName = attribute.getLocalName();
277             }
278             else {
279                 attributeQualifiedName = attribute.getPrefix() + ':' + attribute.getLocalName();
280             }
281             final String value = attribute.getNodeValue();
282             final boolean specified = attribute.getSpecified();
283             final DomAttr xmlAttribute =
284                     new DomAttr(page, attributeNamespaceURI, attributeQualifiedName, value, specified);
285             attributes.put(attribute.getNodeName(), xmlAttribute);
286         }
287         return new DomElement(namespaceURI, qualifiedName, page, attributes);
288     }
289 
290     private static Attributes namedNodeMapToSaxAttributes(final NamedNodeMap attributesMap) {
291         final AttributesImpl attributes = new AttributesImpl();
292         final int length = attributesMap.getLength();
293         for (int i = 0; i < length; i++) {
294             final Node attr = attributesMap.item(i);
295             attributes.addAttribute(attr.getNamespaceURI(), attr.getLocalName(),
296                 attr.getNodeName(), null, attr.getNodeValue());
297         }
298 
299         return attributes;
300     }
301 
302     /**
303      * Copy all children from 'source' to 'dest', within the context of the specified page.
304      * @param page the page which the nodes belong to
305      * @param source the node to copy from
306      * @param dest the node to copy to
307      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
308      *     DOM elements
309      */
310     private static void copy(final SgmlPage page, final Node source, final DomNode dest,
311         final boolean handleXHTMLAsHTML) {
312         final NodeList nodeChildren = source.getChildNodes();
313         for (int i = 0; i < nodeChildren.getLength(); i++) {
314             final Node child = nodeChildren.item(i);
315             switch (child.getNodeType()) {
316                 case Node.ELEMENT_NODE:
317                     final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML);
318                     dest.appendChild(childXml);
319                     copy(page, child, childXml, handleXHTMLAsHTML);
320                     break;
321 
322                 case Node.TEXT_NODE:
323                     dest.appendChild(new DomText(page, child.getNodeValue()));
324                     break;
325 
326                 case Node.CDATA_SECTION_NODE:
327                     dest.appendChild(new DomCDataSection(page, child.getNodeValue()));
328                     break;
329 
330                 case Node.COMMENT_NODE:
331                     dest.appendChild(new DomComment(page, child.getNodeValue()));
332                     break;
333 
334                 case Node.PROCESSING_INSTRUCTION_NODE:
335                     dest.appendChild(new DomProcessingInstruction(page, child.getNodeName(), child.getNodeValue()));
336                     break;
337 
338                 default:
339                     if (LOG.isWarnEnabled()) {
340                         LOG.warn("NodeType " + child.getNodeType()
341                             + " (" + child.getNodeName() + ") is not yet supported.");
342                     }
343             }
344         }
345     }
346 
347     /**
348      * Search for the namespace URI of the given prefix, starting from the specified element.
349      * The default namespace can be searched for by specifying "" as the prefix.
350      * @param element the element to start searching from
351      * @param prefix the namespace prefix
352      * @return the namespace URI bound to the prefix; or null if there is no such namespace
353      */
354     public static String lookupNamespaceURI(final DomElement element, final String prefix) {
355         String uri;
356         if (prefix.isEmpty()) {
357             uri = element.getAttributeDirect("xmlns");
358         }
359         else {
360             uri = element.getAttribute("xmlns:" + prefix);
361         }
362         if (ATTRIBUTE_NOT_DEFINED == uri) {
363             final DomNode parentNode = element.getParentNode();
364             if (parentNode instanceof DomElement domElement) {
365                 uri = lookupNamespaceURI(domElement, prefix);
366             }
367         }
368         return uri;
369     }
370 
371     /**
372      * Search for the prefix associated with specified namespace URI.
373      * @param element the element to start searching from
374      * @param namespace the namespace prefix
375      * @return the prefix bound to the namespace URI; or null if there is no such namespace
376      */
377     public static String lookupPrefix(final DomElement element, final String namespace) {
378         final Map<String, DomAttr> attributes = element.getAttributesMap();
379         for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
380             final String name = entry.getKey();
381             final DomAttr value = entry.getValue();
382             if (name.startsWith("xmlns:") && value.getValue().equals(namespace)) {
383                 return name.substring(6);
384             }
385         }
386         for (final DomNode child : element.getChildren()) {
387             if (child instanceof DomElement domElement) {
388                 final String prefix = lookupPrefix(domElement, namespace);
389                 if (prefix != null) {
390                     return prefix;
391                 }
392             }
393         }
394         return null;
395     }
396 }