View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.io.InputStreamReader;
22  import java.io.Reader;
23  import java.io.StringReader;
24  import java.nio.charset.Charset;
25  import java.util.List;
26  import java.util.Locale;
27  import java.util.Map;
28  
29  import javax.xml.parsers.DocumentBuilder;
30  import javax.xml.parsers.DocumentBuilderFactory;
31  import javax.xml.parsers.ParserConfigurationException;
32  
33  import org.apache.commons.io.input.BOMInputStream;
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.htmlunit.SgmlPage;
37  import org.htmlunit.WebResponse;
38  import org.htmlunit.html.DomAttr;
39  import org.htmlunit.html.DomCDataSection;
40  import org.htmlunit.html.DomComment;
41  import org.htmlunit.html.DomDocumentType;
42  import org.htmlunit.html.DomElement;
43  import org.htmlunit.html.DomNode;
44  import org.htmlunit.html.DomProcessingInstruction;
45  import org.htmlunit.html.DomText;
46  import org.htmlunit.html.ElementFactory;
47  import org.htmlunit.html.Html;
48  import org.htmlunit.platform.Platform;
49  import org.htmlunit.xml.XmlPage;
50  import org.w3c.dom.Attr;
51  import org.w3c.dom.Document;
52  import org.w3c.dom.DocumentType;
53  import org.w3c.dom.NamedNodeMap;
54  import org.w3c.dom.Node;
55  import org.w3c.dom.NodeList;
56  import org.xml.sax.Attributes;
57  import org.xml.sax.ErrorHandler;
58  import org.xml.sax.InputSource;
59  import org.xml.sax.SAXException;
60  import org.xml.sax.SAXParseException;
61  import org.xml.sax.helpers.AttributesImpl;
62  
63  /**
64   * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
65   *
66   * Provides facility method to work with XML responses.
67   *
68   * @author Marc Guillemot
69   * @author Ahmed Ashour
70   * @author Sudhan Moghe
71   * @author Ronald Brill
72   * @author Chuck Dumont
73   * @author Frank Danek
74   */
75  public final class XmlUtils {
76  
77      private static final Log LOG = LogFactory.getLog(XmlUtils.class);
78  
79      private static final ErrorHandler DISCARD_MESSAGES_HANDLER = new ErrorHandler() {
80          /**
81           * Does nothing as we're not interested in this.
82           */
83          @Override
84          public void error(final SAXParseException exception) {
85              // Does nothing as we're not interested in this.
86          }
87  
88          /**
89           * Does nothing as we're not interested in this.
90           */
91          @Override
92          public void fatalError(final SAXParseException exception) {
93              // Does nothing as we're not interested in this.
94          }
95  
96          /**
97           * Does nothing as we're not interested in this.
98           */
99          @Override
100         public void warning(final SAXParseException exception) {
101             // Does nothing as we're not interested in this.
102         }
103     };
104 
105     /**
106      * Utility class, hide constructor.
107      */
108     private XmlUtils() {
109         // Empty.
110     }
111 
112     /**
113      * Builds a document from the content of the web response.
114      * A warning is logged if an exception is thrown while parsing the XML content
115      * (for instance when the content is not a valid XML and can't be parsed).
116      *
117      * @param webResponse the response from the server
118      * @throws IOException if the page could not be created
119      * @return the parse result
120      * @throws SAXException if the parsing fails
121      * @throws ParserConfigurationException if a DocumentBuilder cannot be created
122      */
123     public static Document buildDocument(final WebResponse webResponse)
124         throws IOException, SAXException, ParserConfigurationException {
125 
126         final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
127 
128         if (webResponse == null) {
129             return factory.newDocumentBuilder().newDocument();
130         }
131 
132         factory.setNamespaceAware(true);
133 
134         Charset charset = webResponse.getContentCharset();
135         try (InputStream is = webResponse.getContentAsStreamWithBomIfApplicable()) {
136             if (is instanceof BOMInputStream) {
137                 final String bomCharsetName = ((BOMInputStream) is).getBOMCharsetName();
138                 if (bomCharsetName != null) {
139                     charset = Charset.forName(bomCharsetName);
140                 }
141             }
142 
143             try (InputStreamReader reader = new InputStreamReader(is, charset)) {
144                 // we have to do the blank input check and the parsing in one step
145                 final TrackBlankContentAndSkipLeadingWhitespaceReader tracker
146                         = new TrackBlankContentAndSkipLeadingWhitespaceReader(reader);
147 
148                 final InputSource source = new InputSource(tracker);
149                 final DocumentBuilder builder = factory.newDocumentBuilder();
150                 builder.setErrorHandler(DISCARD_MESSAGES_HANDLER);
151                 builder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
152                 try {
153                     // this closes the input source/stream
154                     return builder.parse(source);
155                 }
156                 catch (final SAXException e) {
157                     if (tracker.wasBlank()) {
158                         return factory.newDocumentBuilder().newDocument();
159                     }
160                     throw e;
161                 }
162             }
163         }
164     }
165 
166     /**
167      * Helper for memory and performance optimization.
168      */
169     private static final class TrackBlankContentAndSkipLeadingWhitespaceReader extends Reader {
170         private final Reader reader_;
171         private boolean wasBlank_ = true;
172 
173         TrackBlankContentAndSkipLeadingWhitespaceReader(final Reader characterStream) {
174             super();
175             reader_ = characterStream;
176         }
177 
178         public boolean wasBlank() {
179             return wasBlank_;
180         }
181 
182         @Override
183         public void close() throws IOException {
184             reader_.close();
185         }
186 
187         @Override
188         public int read(final char[] cbuf, final int off, final int len) throws IOException {
189             int result = reader_.read(cbuf, off, len);
190 
191             if (wasBlank_ && result > -1) {
192                 for (int i = 0; i < result; i++) {
193                     final char ch = cbuf[off + i];
194                     if (!Character.isWhitespace(ch)) {
195                         wasBlank_ = false;
196                         if (i > 0) {
197                             // skipt the leading whitespace
198                             System.arraycopy(cbuf, i, cbuf, off, len - i);
199                             result -= i;
200                         }
201                         break;
202                     }
203                 }
204             }
205             return result;
206         }
207     }
208 
209     /**
210      * Recursively appends a {@link Node} child to {@link DomNode} parent.
211      *
212      * @param page the owner page of {@link DomElement}s to be created
213      * @param parent the parent DomNode
214      * @param child the child Node
215      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
216      *     DOM elements
217      */
218     public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
219         final boolean handleXHTMLAsHTML) {
220         appendChild(page, parent, child, handleXHTMLAsHTML, null);
221     }
222 
223     /**
224      * Recursively appends a {@link Node} child to {@link DomNode} parent.
225      *
226      * @param page the owner page of {@link DomElement}s to be created
227      * @param parent the parent DomNode
228      * @param child the child Node
229      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
230      *     DOM elements
231      * @param attributesOrderMap (optional) the one returned by {@link #getAttributesOrderMap(Document)}
232      */
233     public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
234         final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
235         final DocumentType documentType = child.getOwnerDocument().getDoctype();
236         if (documentType != null && page instanceof XmlPage) {
237             final DomDocumentType domDoctype = new DomDocumentType(
238                     page, documentType.getName(), documentType.getPublicId(), documentType.getSystemId());
239             ((XmlPage) page).setDocumentType(domDoctype);
240         }
241         final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
242         parent.appendChild(childXml);
243         copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
244     }
245 
246     private static DomNode createFrom(final SgmlPage page, final Node source, final boolean handleXHTMLAsHTML,
247             final Map<Integer, List<String>> attributesOrderMap) {
248         if (source.getNodeType() == Node.TEXT_NODE) {
249             return new DomText(page, source.getNodeValue());
250         }
251         if (source.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
252             return new DomProcessingInstruction(page, source.getNodeName(), source.getNodeValue());
253         }
254         if (source.getNodeType() == Node.COMMENT_NODE) {
255             return new DomComment(page, source.getNodeValue());
256         }
257         if (source.getNodeType() == Node.DOCUMENT_TYPE_NODE) {
258             final DocumentType documentType = (DocumentType) source;
259             return new DomDocumentType(page, documentType.getName(), documentType.getPublicId(),
260                     documentType.getSystemId());
261         }
262         final String ns = source.getNamespaceURI();
263         String localName = source.getLocalName();
264         if (handleXHTMLAsHTML && Html.XHTML_NAMESPACE.equals(ns)) {
265             final ElementFactory factory = page.getWebClient().getPageCreator().getHtmlParser().getFactory(localName);
266             return factory.createElementNS(page, ns, localName,
267                     namedNodeMapToSaxAttributes(source.getAttributes(), attributesOrderMap, source));
268         }
269         final NamedNodeMap nodeAttributes = source.getAttributes();
270         if (page != null && page.isHtmlPage()) {
271             localName = localName.toUpperCase(Locale.ROOT);
272         }
273         final String qualifiedName;
274         if (source.getPrefix() == null) {
275             qualifiedName = localName;
276         }
277         else {
278             qualifiedName = source.getPrefix() + ':' + localName;
279         }
280 
281         final String namespaceURI = source.getNamespaceURI();
282         if (Html.SVG_NAMESPACE.equals(namespaceURI)) {
283             return page.getWebClient().getPageCreator().getHtmlParser().getSvgFactory()
284                     .createElementNS(page, namespaceURI, qualifiedName,
285                             namedNodeMapToSaxAttributes(nodeAttributes, attributesOrderMap, source));
286         }
287 
288         final OrderedFastHashMap<String, DomAttr> attributes = new OrderedFastHashMap<>();
289         for (int i = 0; i < nodeAttributes.getLength(); i++) {
290             final int orderedIndex = Platform.getIndex(nodeAttributes, attributesOrderMap, source, i);
291             final Attr attribute = (Attr) nodeAttributes.item(orderedIndex);
292             final String attributeNamespaceURI = attribute.getNamespaceURI();
293             final String attributeQualifiedName;
294             if (attribute.getPrefix() == null) {
295                 attributeQualifiedName = attribute.getLocalName();
296             }
297             else {
298                 attributeQualifiedName = attribute.getPrefix() + ':' + attribute.getLocalName();
299             }
300             final String value = attribute.getNodeValue();
301             final boolean specified = attribute.getSpecified();
302             final DomAttr xmlAttribute =
303                     new DomAttr(page, attributeNamespaceURI, attributeQualifiedName, value, specified);
304             attributes.put(attribute.getNodeName(), xmlAttribute);
305         }
306         return new DomElement(namespaceURI, qualifiedName, page, attributes);
307     }
308 
309     private static Attributes namedNodeMapToSaxAttributes(final NamedNodeMap attributesMap,
310             final Map<Integer, List<String>> attributesOrderMap, final Node element) {
311         final AttributesImpl attributes = new AttributesImpl();
312         final int length = attributesMap.getLength();
313         for (int i = 0; i < length; i++) {
314             final int orderedIndex = Platform.getIndex(attributesMap, attributesOrderMap, element, i);
315             final Node attr = attributesMap.item(orderedIndex);
316             attributes.addAttribute(attr.getNamespaceURI(), attr.getLocalName(),
317                 attr.getNodeName(), null, attr.getNodeValue());
318         }
319 
320         return attributes;
321     }
322 
323     /**
324      * Copy all children from 'source' to 'dest', within the context of the specified page.
325      * @param page the page which the nodes belong to
326      * @param source the node to copy from
327      * @param dest the node to copy to
328      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
329      *     DOM elements
330      */
331     private static void copy(final SgmlPage page, final Node source, final DomNode dest,
332         final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
333         final NodeList nodeChildren = source.getChildNodes();
334         for (int i = 0; i < nodeChildren.getLength(); i++) {
335             final Node child = nodeChildren.item(i);
336             switch (child.getNodeType()) {
337                 case Node.ELEMENT_NODE:
338                     final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
339                     dest.appendChild(childXml);
340                     copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
341                     break;
342 
343                 case Node.TEXT_NODE:
344                     dest.appendChild(new DomText(page, child.getNodeValue()));
345                     break;
346 
347                 case Node.CDATA_SECTION_NODE:
348                     dest.appendChild(new DomCDataSection(page, child.getNodeValue()));
349                     break;
350 
351                 case Node.COMMENT_NODE:
352                     dest.appendChild(new DomComment(page, child.getNodeValue()));
353                     break;
354 
355                 case Node.PROCESSING_INSTRUCTION_NODE:
356                     dest.appendChild(new DomProcessingInstruction(page, child.getNodeName(), child.getNodeValue()));
357                     break;
358 
359                 default:
360                     if (LOG.isWarnEnabled()) {
361                         LOG.warn("NodeType " + child.getNodeType()
362                             + " (" + child.getNodeName() + ") is not yet supported.");
363                     }
364             }
365         }
366     }
367 
368     /**
369      * Search for the namespace URI of the given prefix, starting from the specified element.
370      * The default namespace can be searched for by specifying "" as the prefix.
371      * @param element the element to start searching from
372      * @param prefix the namespace prefix
373      * @return the namespace URI bound to the prefix; or null if there is no such namespace
374      */
375     public static String lookupNamespaceURI(final DomElement element, final String prefix) {
376         String uri;
377         if (prefix.isEmpty()) {
378             uri = element.getAttributeDirect("xmlns");
379         }
380         else {
381             uri = element.getAttribute("xmlns:" + prefix);
382         }
383         if (ATTRIBUTE_NOT_DEFINED == uri) {
384             final DomNode parentNode = element.getParentNode();
385             if (parentNode instanceof DomElement) {
386                 uri = lookupNamespaceURI((DomElement) parentNode, prefix);
387             }
388         }
389         return uri;
390     }
391 
392     /**
393      * Search for the prefix associated with specified namespace URI.
394      * @param element the element to start searching from
395      * @param namespace the namespace prefix
396      * @return the prefix bound to the namespace URI; or null if there is no such namespace
397      */
398     public static String lookupPrefix(final DomElement element, final String namespace) {
399         final Map<String, DomAttr> attributes = element.getAttributesMap();
400         for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
401             final String name = entry.getKey();
402             final DomAttr value = entry.getValue();
403             if (name.startsWith("xmlns:") && value.getValue().equals(namespace)) {
404                 return name.substring(6);
405             }
406         }
407         for (final DomNode child : element.getChildren()) {
408             if (child instanceof DomElement) {
409                 final String prefix = lookupPrefix((DomElement) child, namespace);
410                 if (prefix != null) {
411                     return prefix;
412                 }
413             }
414         }
415         return null;
416     }
417 
418     /**
419      * Returns internal Xerces details about all elements in the specified document.
420      * The id of the returned {@link Map} is the {@code nodeIndex} of an element, and the list
421      * is the array of ordered attributes names.
422      * @param document the document
423      * @return the map of an element index with its ordered attribute names
424      */
425     public static Map<Integer, List<String>> getAttributesOrderMap(final Document document) {
426         return Platform.getAttributesOrderMap(document);
427     }
428 }