1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.util;
16
17 import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.nio.charset.Charset;
25 import java.util.Locale;
26 import java.util.Map;
27
28 import javax.xml.parsers.DocumentBuilder;
29 import javax.xml.parsers.DocumentBuilderFactory;
30 import javax.xml.parsers.ParserConfigurationException;
31
32 import org.apache.commons.io.input.BOMInputStream;
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.htmlunit.SgmlPage;
36 import org.htmlunit.WebResponse;
37 import org.htmlunit.html.DomAttr;
38 import org.htmlunit.html.DomCDataSection;
39 import org.htmlunit.html.DomComment;
40 import org.htmlunit.html.DomDocumentType;
41 import org.htmlunit.html.DomElement;
42 import org.htmlunit.html.DomNode;
43 import org.htmlunit.html.DomProcessingInstruction;
44 import org.htmlunit.html.DomText;
45 import org.htmlunit.html.ElementFactory;
46 import org.htmlunit.html.Html;
47 import org.htmlunit.xml.XmlPage;
48 import org.w3c.dom.Attr;
49 import org.w3c.dom.Document;
50 import org.w3c.dom.DocumentType;
51 import org.w3c.dom.NamedNodeMap;
52 import org.w3c.dom.Node;
53 import org.w3c.dom.NodeList;
54 import org.xml.sax.Attributes;
55 import org.xml.sax.ErrorHandler;
56 import org.xml.sax.InputSource;
57 import org.xml.sax.SAXException;
58 import org.xml.sax.SAXParseException;
59 import org.xml.sax.helpers.AttributesImpl;
60
61
62
63
64
65
66
67
68
69
70
71
72
73 public final class XmlUtils {
74
75 private static final Log LOG = LogFactory.getLog(XmlUtils.class);
76
77 private static final ErrorHandler DISCARD_MESSAGES_HANDLER = new ErrorHandler() {
78
79
80
81 @Override
82 public void error(final SAXParseException exception) {
83
84 }
85
86
87
88
89 @Override
90 public void fatalError(final SAXParseException exception) {
91
92 }
93
94
95
96
97 @Override
98 public void warning(final SAXParseException exception) {
99
100 }
101 };
102
103
104
105
/**
 * Private on purpose: every member of this class is static, so there is
 * nothing to instantiate.
 */
private XmlUtils() {
    // static helper collection, no instances
}
109
110
111
112
113
114
115
116
117
118
119
120
121 public static Document buildDocument(final WebResponse webResponse)
122 throws IOException, SAXException, ParserConfigurationException {
123
124 final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
125
126 if (webResponse == null) {
127 return factory.newDocumentBuilder().newDocument();
128 }
129
130 factory.setNamespaceAware(true);
131
132 Charset charset = webResponse.getContentCharset();
133 try (InputStream is = webResponse.getContentAsStreamWithBomIfApplicable()) {
134 if (is instanceof BOMInputStream stream) {
135 final String bomCharsetName = stream.getBOMCharsetName();
136 if (bomCharsetName != null) {
137 charset = Charset.forName(bomCharsetName);
138 }
139 }
140
141 try (InputStreamReader reader = new InputStreamReader(is, charset)) {
142
143 final TrackBlankContentAndSkipLeadingWhitespaceReader tracker
144 = new TrackBlankContentAndSkipLeadingWhitespaceReader(reader);
145
146 final InputSource source = new InputSource(tracker);
147 final DocumentBuilder builder = factory.newDocumentBuilder();
148 builder.setErrorHandler(DISCARD_MESSAGES_HANDLER);
149 builder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
150 try {
151
152 return builder.parse(source);
153 }
154 catch (final SAXException e) {
155 if (tracker.wasBlank()) {
156 return factory.newDocumentBuilder().newDocument();
157 }
158 throw e;
159 }
160 }
161 }
162 }
163
164
165
166
167 private static final class TrackBlankContentAndSkipLeadingWhitespaceReader extends Reader {
168 private final Reader reader_;
169 private boolean wasBlank_ = true;
170
171 TrackBlankContentAndSkipLeadingWhitespaceReader(final Reader characterStream) {
172 super();
173 reader_ = characterStream;
174 }
175
176 public boolean wasBlank() {
177 return wasBlank_;
178 }
179
180 @Override
181 public void close() throws IOException {
182 reader_.close();
183 }
184
185 @Override
186 public int read(final char[] cbuf, final int off, final int len) throws IOException {
187 int result = reader_.read(cbuf, off, len);
188
189 if (wasBlank_ && result > -1) {
190 for (int i = 0; i < result; i++) {
191 final char ch = cbuf[off + i];
192 if (!Character.isWhitespace(ch)) {
193 wasBlank_ = false;
194 if (i > 0) {
195
196 System.arraycopy(cbuf, i, cbuf, off, len - i);
197 result -= i;
198 }
199 break;
200 }
201 }
202 }
203 return result;
204 }
205 }
206
207
208
209
210
211
212
213
214
215
216 public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
217 final boolean handleXHTMLAsHTML) {
218 final DocumentType documentType = child.getOwnerDocument().getDoctype();
219 if (documentType != null && page instanceof XmlPage xmlPage) {
220 final DomDocumentType domDoctype = new DomDocumentType(
221 page, documentType.getName(), documentType.getPublicId(), documentType.getSystemId());
222 xmlPage.setDocumentType(domDoctype);
223 }
224 final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML);
225 parent.appendChild(childXml);
226 copy(page, child, childXml, handleXHTMLAsHTML);
227 }
228
229 private static DomNode createFrom(final SgmlPage page, final Node source, final boolean handleXHTMLAsHTML) {
230 if (source.getNodeType() == Node.TEXT_NODE) {
231 return new DomText(page, source.getNodeValue());
232 }
233 if (source.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
234 return new DomProcessingInstruction(page, source.getNodeName(), source.getNodeValue());
235 }
236 if (source.getNodeType() == Node.COMMENT_NODE) {
237 return new DomComment(page, source.getNodeValue());
238 }
239 if (source.getNodeType() == Node.DOCUMENT_TYPE_NODE) {
240 final DocumentType documentType = (DocumentType) source;
241 return new DomDocumentType(page, documentType.getName(), documentType.getPublicId(),
242 documentType.getSystemId());
243 }
244 final String ns = source.getNamespaceURI();
245 String localName = source.getLocalName();
246 if (handleXHTMLAsHTML && Html.XHTML_NAMESPACE.equals(ns)) {
247 final ElementFactory factory = page.getWebClient().getPageCreator().getHtmlParser().getFactory(localName);
248 return factory.createElementNS(page, ns, localName,
249 namedNodeMapToSaxAttributes(source.getAttributes()));
250 }
251 final NamedNodeMap nodeAttributes = source.getAttributes();
252 if (page != null && page.isHtmlPage()) {
253 localName = localName.toUpperCase(Locale.ROOT);
254 }
255 final String qualifiedName;
256 if (source.getPrefix() == null) {
257 qualifiedName = localName;
258 }
259 else {
260 qualifiedName = source.getPrefix() + ':' + localName;
261 }
262
263 final String namespaceURI = source.getNamespaceURI();
264 if (Html.SVG_NAMESPACE.equals(namespaceURI)) {
265 return page.getWebClient().getPageCreator().getHtmlParser().getSvgFactory()
266 .createElementNS(page, namespaceURI, qualifiedName,
267 namedNodeMapToSaxAttributes(nodeAttributes));
268 }
269
270 final OrderedFastHashMap<String, DomAttr> attributes = new OrderedFastHashMap<>();
271 for (int i = 0; i < nodeAttributes.getLength(); i++) {
272 final Attr attribute = (Attr) nodeAttributes.item(i);
273 final String attributeNamespaceURI = attribute.getNamespaceURI();
274 final String attributeQualifiedName;
275 if (attribute.getPrefix() == null) {
276 attributeQualifiedName = attribute.getLocalName();
277 }
278 else {
279 attributeQualifiedName = attribute.getPrefix() + ':' + attribute.getLocalName();
280 }
281 final String value = attribute.getNodeValue();
282 final boolean specified = attribute.getSpecified();
283 final DomAttr xmlAttribute =
284 new DomAttr(page, attributeNamespaceURI, attributeQualifiedName, value, specified);
285 attributes.put(attribute.getNodeName(), xmlAttribute);
286 }
287 return new DomElement(namespaceURI, qualifiedName, page, attributes);
288 }
289
290 private static Attributes namedNodeMapToSaxAttributes(final NamedNodeMap attributesMap) {
291 final AttributesImpl attributes = new AttributesImpl();
292 final int length = attributesMap.getLength();
293 for (int i = 0; i < length; i++) {
294 final Node attr = attributesMap.item(i);
295 attributes.addAttribute(attr.getNamespaceURI(), attr.getLocalName(),
296 attr.getNodeName(), null, attr.getNodeValue());
297 }
298
299 return attributes;
300 }
301
302
303
304
305
306
307
308
309
310 private static void copy(final SgmlPage page, final Node source, final DomNode dest,
311 final boolean handleXHTMLAsHTML) {
312 final NodeList nodeChildren = source.getChildNodes();
313 for (int i = 0; i < nodeChildren.getLength(); i++) {
314 final Node child = nodeChildren.item(i);
315 switch (child.getNodeType()) {
316 case Node.ELEMENT_NODE:
317 final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML);
318 dest.appendChild(childXml);
319 copy(page, child, childXml, handleXHTMLAsHTML);
320 break;
321
322 case Node.TEXT_NODE:
323 dest.appendChild(new DomText(page, child.getNodeValue()));
324 break;
325
326 case Node.CDATA_SECTION_NODE:
327 dest.appendChild(new DomCDataSection(page, child.getNodeValue()));
328 break;
329
330 case Node.COMMENT_NODE:
331 dest.appendChild(new DomComment(page, child.getNodeValue()));
332 break;
333
334 case Node.PROCESSING_INSTRUCTION_NODE:
335 dest.appendChild(new DomProcessingInstruction(page, child.getNodeName(), child.getNodeValue()));
336 break;
337
338 default:
339 if (LOG.isWarnEnabled()) {
340 LOG.warn("NodeType " + child.getNodeType()
341 + " (" + child.getNodeName() + ") is not yet supported.");
342 }
343 }
344 }
345 }
346
347
348
349
350
351
352
353
354 public static String lookupNamespaceURI(final DomElement element, final String prefix) {
355 String uri;
356 if (prefix.isEmpty()) {
357 uri = element.getAttributeDirect("xmlns");
358 }
359 else {
360 uri = element.getAttribute("xmlns:" + prefix);
361 }
362 if (ATTRIBUTE_NOT_DEFINED == uri) {
363 final DomNode parentNode = element.getParentNode();
364 if (parentNode instanceof DomElement domElement) {
365 uri = lookupNamespaceURI(domElement, prefix);
366 }
367 }
368 return uri;
369 }
370
371
372
373
374
375
376
377 public static String lookupPrefix(final DomElement element, final String namespace) {
378 final Map<String, DomAttr> attributes = element.getAttributesMap();
379 for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
380 final String name = entry.getKey();
381 final DomAttr value = entry.getValue();
382 if (name.startsWith("xmlns:") && value.getValue().equals(namespace)) {
383 return name.substring(6);
384 }
385 }
386 for (final DomNode child : element.getChildren()) {
387 if (child instanceof DomElement domElement) {
388 final String prefix = lookupPrefix(domElement, namespace);
389 if (prefix != null) {
390 return prefix;
391 }
392 }
393 }
394 return null;
395 }
396 }