1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.util;
16
17 import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.io.Reader;
23 import java.io.StringReader;
24 import java.nio.charset.Charset;
25 import java.util.List;
26 import java.util.Locale;
27 import java.util.Map;
28
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32
33 import org.apache.commons.io.input.BOMInputStream;
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.htmlunit.SgmlPage;
37 import org.htmlunit.WebResponse;
38 import org.htmlunit.html.DomAttr;
39 import org.htmlunit.html.DomCDataSection;
40 import org.htmlunit.html.DomComment;
41 import org.htmlunit.html.DomDocumentType;
42 import org.htmlunit.html.DomElement;
43 import org.htmlunit.html.DomNode;
44 import org.htmlunit.html.DomProcessingInstruction;
45 import org.htmlunit.html.DomText;
46 import org.htmlunit.html.ElementFactory;
47 import org.htmlunit.html.Html;
48 import org.htmlunit.platform.Platform;
49 import org.htmlunit.xml.XmlPage;
50 import org.w3c.dom.Attr;
51 import org.w3c.dom.Document;
52 import org.w3c.dom.DocumentType;
53 import org.w3c.dom.NamedNodeMap;
54 import org.w3c.dom.Node;
55 import org.w3c.dom.NodeList;
56 import org.xml.sax.Attributes;
57 import org.xml.sax.ErrorHandler;
58 import org.xml.sax.InputSource;
59 import org.xml.sax.SAXException;
60 import org.xml.sax.SAXParseException;
61 import org.xml.sax.helpers.AttributesImpl;
62
63
64
65
66
67
68
69
70
71
72
73
74
75 public final class XmlUtils {
76
77 private static final Log LOG = LogFactory.getLog(XmlUtils.class);
78
79 private static final ErrorHandler DISCARD_MESSAGES_HANDLER = new ErrorHandler() {
80
81
82
83 @Override
84 public void error(final SAXParseException exception) {
85
86 }
87
88
89
90
91 @Override
92 public void fatalError(final SAXParseException exception) {
93
94 }
95
96
97
98
99 @Override
100 public void warning(final SAXParseException exception) {
101
102 }
103 };
104
105
106
107
/**
 * Not meant to be instantiated: the class only offers static members.
 */
private XmlUtils() {
    // nothing to initialize
}
111
112
113
114
115
116
117
118
119
120
121
122
123 public static Document buildDocument(final WebResponse webResponse)
124 throws IOException, SAXException, ParserConfigurationException {
125
126 final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
127
128 if (webResponse == null) {
129 return factory.newDocumentBuilder().newDocument();
130 }
131
132 factory.setNamespaceAware(true);
133
134 Charset charset = webResponse.getContentCharset();
135 try (InputStream is = webResponse.getContentAsStreamWithBomIfApplicable()) {
136 if (is instanceof BOMInputStream) {
137 final String bomCharsetName = ((BOMInputStream) is).getBOMCharsetName();
138 if (bomCharsetName != null) {
139 charset = Charset.forName(bomCharsetName);
140 }
141 }
142
143 try (InputStreamReader reader = new InputStreamReader(is, charset)) {
144
145 final TrackBlankContentAndSkipLeadingWhitespaceReader tracker
146 = new TrackBlankContentAndSkipLeadingWhitespaceReader(reader);
147
148 final InputSource source = new InputSource(tracker);
149 final DocumentBuilder builder = factory.newDocumentBuilder();
150 builder.setErrorHandler(DISCARD_MESSAGES_HANDLER);
151 builder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
152 try {
153
154 return builder.parse(source);
155 }
156 catch (final SAXException e) {
157 if (tracker.wasBlank()) {
158 return factory.newDocumentBuilder().newDocument();
159 }
160 throw e;
161 }
162 }
163 }
164 }
165
166
167
168
169 private static final class TrackBlankContentAndSkipLeadingWhitespaceReader extends Reader {
170 private final Reader reader_;
171 private boolean wasBlank_ = true;
172
173 TrackBlankContentAndSkipLeadingWhitespaceReader(final Reader characterStream) {
174 super();
175 reader_ = characterStream;
176 }
177
178 public boolean wasBlank() {
179 return wasBlank_;
180 }
181
182 @Override
183 public void close() throws IOException {
184 reader_.close();
185 }
186
187 @Override
188 public int read(final char[] cbuf, final int off, final int len) throws IOException {
189 int result = reader_.read(cbuf, off, len);
190
191 if (wasBlank_ && result > -1) {
192 for (int i = 0; i < result; i++) {
193 final char ch = cbuf[off + i];
194 if (!Character.isWhitespace(ch)) {
195 wasBlank_ = false;
196 if (i > 0) {
197
198 System.arraycopy(cbuf, i, cbuf, off, len - i);
199 result -= i;
200 }
201 break;
202 }
203 }
204 }
205 return result;
206 }
207 }
208
209
210
211
212
213
214
215
216
217
218 public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
219 final boolean handleXHTMLAsHTML) {
220 appendChild(page, parent, child, handleXHTMLAsHTML, null);
221 }
222
223
224
225
226
227
228
229
230
231
232
233 public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
234 final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
235 final DocumentType documentType = child.getOwnerDocument().getDoctype();
236 if (documentType != null && page instanceof XmlPage) {
237 final DomDocumentType domDoctype = new DomDocumentType(
238 page, documentType.getName(), documentType.getPublicId(), documentType.getSystemId());
239 ((XmlPage) page).setDocumentType(domDoctype);
240 }
241 final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
242 parent.appendChild(childXml);
243 copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
244 }
245
246 private static DomNode createFrom(final SgmlPage page, final Node source, final boolean handleXHTMLAsHTML,
247 final Map<Integer, List<String>> attributesOrderMap) {
248 if (source.getNodeType() == Node.TEXT_NODE) {
249 return new DomText(page, source.getNodeValue());
250 }
251 if (source.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
252 return new DomProcessingInstruction(page, source.getNodeName(), source.getNodeValue());
253 }
254 if (source.getNodeType() == Node.COMMENT_NODE) {
255 return new DomComment(page, source.getNodeValue());
256 }
257 if (source.getNodeType() == Node.DOCUMENT_TYPE_NODE) {
258 final DocumentType documentType = (DocumentType) source;
259 return new DomDocumentType(page, documentType.getName(), documentType.getPublicId(),
260 documentType.getSystemId());
261 }
262 final String ns = source.getNamespaceURI();
263 String localName = source.getLocalName();
264 if (handleXHTMLAsHTML && Html.XHTML_NAMESPACE.equals(ns)) {
265 final ElementFactory factory = page.getWebClient().getPageCreator().getHtmlParser().getFactory(localName);
266 return factory.createElementNS(page, ns, localName,
267 namedNodeMapToSaxAttributes(source.getAttributes(), attributesOrderMap, source));
268 }
269 final NamedNodeMap nodeAttributes = source.getAttributes();
270 if (page != null && page.isHtmlPage()) {
271 localName = localName.toUpperCase(Locale.ROOT);
272 }
273 final String qualifiedName;
274 if (source.getPrefix() == null) {
275 qualifiedName = localName;
276 }
277 else {
278 qualifiedName = source.getPrefix() + ':' + localName;
279 }
280
281 final String namespaceURI = source.getNamespaceURI();
282 if (Html.SVG_NAMESPACE.equals(namespaceURI)) {
283 return page.getWebClient().getPageCreator().getHtmlParser().getSvgFactory()
284 .createElementNS(page, namespaceURI, qualifiedName,
285 namedNodeMapToSaxAttributes(nodeAttributes, attributesOrderMap, source));
286 }
287
288 final OrderedFastHashMap<String, DomAttr> attributes = new OrderedFastHashMap<>();
289 for (int i = 0; i < nodeAttributes.getLength(); i++) {
290 final int orderedIndex = Platform.getIndex(nodeAttributes, attributesOrderMap, source, i);
291 final Attr attribute = (Attr) nodeAttributes.item(orderedIndex);
292 final String attributeNamespaceURI = attribute.getNamespaceURI();
293 final String attributeQualifiedName;
294 if (attribute.getPrefix() == null) {
295 attributeQualifiedName = attribute.getLocalName();
296 }
297 else {
298 attributeQualifiedName = attribute.getPrefix() + ':' + attribute.getLocalName();
299 }
300 final String value = attribute.getNodeValue();
301 final boolean specified = attribute.getSpecified();
302 final DomAttr xmlAttribute =
303 new DomAttr(page, attributeNamespaceURI, attributeQualifiedName, value, specified);
304 attributes.put(attribute.getNodeName(), xmlAttribute);
305 }
306 return new DomElement(namespaceURI, qualifiedName, page, attributes);
307 }
308
309 private static Attributes namedNodeMapToSaxAttributes(final NamedNodeMap attributesMap,
310 final Map<Integer, List<String>> attributesOrderMap, final Node element) {
311 final AttributesImpl attributes = new AttributesImpl();
312 final int length = attributesMap.getLength();
313 for (int i = 0; i < length; i++) {
314 final int orderedIndex = Platform.getIndex(attributesMap, attributesOrderMap, element, i);
315 final Node attr = attributesMap.item(orderedIndex);
316 attributes.addAttribute(attr.getNamespaceURI(), attr.getLocalName(),
317 attr.getNodeName(), null, attr.getNodeValue());
318 }
319
320 return attributes;
321 }
322
323
324
325
326
327
328
329
330
331 private static void copy(final SgmlPage page, final Node source, final DomNode dest,
332 final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
333 final NodeList nodeChildren = source.getChildNodes();
334 for (int i = 0; i < nodeChildren.getLength(); i++) {
335 final Node child = nodeChildren.item(i);
336 switch (child.getNodeType()) {
337 case Node.ELEMENT_NODE:
338 final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
339 dest.appendChild(childXml);
340 copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
341 break;
342
343 case Node.TEXT_NODE:
344 dest.appendChild(new DomText(page, child.getNodeValue()));
345 break;
346
347 case Node.CDATA_SECTION_NODE:
348 dest.appendChild(new DomCDataSection(page, child.getNodeValue()));
349 break;
350
351 case Node.COMMENT_NODE:
352 dest.appendChild(new DomComment(page, child.getNodeValue()));
353 break;
354
355 case Node.PROCESSING_INSTRUCTION_NODE:
356 dest.appendChild(new DomProcessingInstruction(page, child.getNodeName(), child.getNodeValue()));
357 break;
358
359 default:
360 if (LOG.isWarnEnabled()) {
361 LOG.warn("NodeType " + child.getNodeType()
362 + " (" + child.getNodeName() + ") is not yet supported.");
363 }
364 }
365 }
366 }
367
368
369
370
371
372
373
374
375 public static String lookupNamespaceURI(final DomElement element, final String prefix) {
376 String uri;
377 if (prefix.isEmpty()) {
378 uri = element.getAttributeDirect("xmlns");
379 }
380 else {
381 uri = element.getAttribute("xmlns:" + prefix);
382 }
383 if (ATTRIBUTE_NOT_DEFINED == uri) {
384 final DomNode parentNode = element.getParentNode();
385 if (parentNode instanceof DomElement) {
386 uri = lookupNamespaceURI((DomElement) parentNode, prefix);
387 }
388 }
389 return uri;
390 }
391
392
393
394
395
396
397
398 public static String lookupPrefix(final DomElement element, final String namespace) {
399 final Map<String, DomAttr> attributes = element.getAttributesMap();
400 for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
401 final String name = entry.getKey();
402 final DomAttr value = entry.getValue();
403 if (name.startsWith("xmlns:") && value.getValue().equals(namespace)) {
404 return name.substring(6);
405 }
406 }
407 for (final DomNode child : element.getChildren()) {
408 if (child instanceof DomElement) {
409 final String prefix = lookupPrefix((DomElement) child, namespace);
410 if (prefix != null) {
411 return prefix;
412 }
413 }
414 }
415 return null;
416 }
417
418
419
420
421
422
423
424
425 public static Map<Integer, List<String>> getAttributesOrderMap(final Document document) {
426 return Platform.getAttributesOrderMap(document);
427 }
428 }