View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.javascript.host.xml;
16  
17  import java.util.Arrays;
18  import java.util.HashSet;
19  import java.util.Set;
20  
21  import org.htmlunit.SgmlPage;
22  import org.htmlunit.html.DomAttr;
23  import org.htmlunit.html.DomElement;
24  import org.htmlunit.html.DomNode;
25  import org.htmlunit.html.HtmlAbbreviated;
26  import org.htmlunit.html.HtmlAcronym;
27  import org.htmlunit.html.HtmlAddress;
28  import org.htmlunit.html.HtmlAnchor;
29  import org.htmlunit.html.HtmlAudio;
30  import org.htmlunit.html.HtmlBidirectionalOverride;
31  import org.htmlunit.html.HtmlBig;
32  import org.htmlunit.html.HtmlBlockQuote;
33  import org.htmlunit.html.HtmlBody;
34  import org.htmlunit.html.HtmlBold;
35  import org.htmlunit.html.HtmlButton;
36  import org.htmlunit.html.HtmlCanvas;
37  import org.htmlunit.html.HtmlCaption;
38  import org.htmlunit.html.HtmlCenter;
39  import org.htmlunit.html.HtmlCitation;
40  import org.htmlunit.html.HtmlCode;
41  import org.htmlunit.html.HtmlDefinition;
42  import org.htmlunit.html.HtmlDefinitionDescription;
43  import org.htmlunit.html.HtmlDefinitionList;
44  import org.htmlunit.html.HtmlDefinitionTerm;
45  import org.htmlunit.html.HtmlDeletedText;
46  import org.htmlunit.html.HtmlDirectory;
47  import org.htmlunit.html.HtmlDivision;
48  import org.htmlunit.html.HtmlEmbed;
49  import org.htmlunit.html.HtmlEmphasis;
50  import org.htmlunit.html.HtmlExample;
51  import org.htmlunit.html.HtmlFieldSet;
52  import org.htmlunit.html.HtmlFont;
53  import org.htmlunit.html.HtmlForm;
54  import org.htmlunit.html.HtmlFrame;
55  import org.htmlunit.html.HtmlFrameSet;
56  import org.htmlunit.html.HtmlHead;
57  import org.htmlunit.html.HtmlHeading1;
58  import org.htmlunit.html.HtmlHeading2;
59  import org.htmlunit.html.HtmlHeading3;
60  import org.htmlunit.html.HtmlHeading4;
61  import org.htmlunit.html.HtmlHeading5;
62  import org.htmlunit.html.HtmlHeading6;
63  import org.htmlunit.html.HtmlHeadingGroup;
64  import org.htmlunit.html.HtmlHtml;
65  import org.htmlunit.html.HtmlInlineFrame;
66  import org.htmlunit.html.HtmlInlineQuotation;
67  import org.htmlunit.html.HtmlInsertedText;
68  import org.htmlunit.html.HtmlItalic;
69  import org.htmlunit.html.HtmlKeyboard;
70  import org.htmlunit.html.HtmlLabel;
71  import org.htmlunit.html.HtmlLegend;
72  import org.htmlunit.html.HtmlListItem;
73  import org.htmlunit.html.HtmlListing;
74  import org.htmlunit.html.HtmlMap;
75  import org.htmlunit.html.HtmlMarquee;
76  import org.htmlunit.html.HtmlMenu;
77  import org.htmlunit.html.HtmlNoBreak;
78  import org.htmlunit.html.HtmlNoEmbed;
79  import org.htmlunit.html.HtmlNoFrames;
80  import org.htmlunit.html.HtmlNoScript;
81  import org.htmlunit.html.HtmlObject;
82  import org.htmlunit.html.HtmlOption;
83  import org.htmlunit.html.HtmlOptionGroup;
84  import org.htmlunit.html.HtmlOrderedList;
85  import org.htmlunit.html.HtmlParagraph;
86  import org.htmlunit.html.HtmlPlainText;
87  import org.htmlunit.html.HtmlPreformattedText;
88  import org.htmlunit.html.HtmlS;
89  import org.htmlunit.html.HtmlSample;
90  import org.htmlunit.html.HtmlScript;
91  import org.htmlunit.html.HtmlSelect;
92  import org.htmlunit.html.HtmlSmall;
93  import org.htmlunit.html.HtmlSource;
94  import org.htmlunit.html.HtmlSpan;
95  import org.htmlunit.html.HtmlStrike;
96  import org.htmlunit.html.HtmlStrong;
97  import org.htmlunit.html.HtmlStyle;
98  import org.htmlunit.html.HtmlSubscript;
99  import org.htmlunit.html.HtmlSuperscript;
100 import org.htmlunit.html.HtmlTable;
101 import org.htmlunit.html.HtmlTableBody;
102 import org.htmlunit.html.HtmlTableColumn;
103 import org.htmlunit.html.HtmlTableColumnGroup;
104 import org.htmlunit.html.HtmlTableDataCell;
105 import org.htmlunit.html.HtmlTableFooter;
106 import org.htmlunit.html.HtmlTableHeader;
107 import org.htmlunit.html.HtmlTableHeaderCell;
108 import org.htmlunit.html.HtmlTableRow;
109 import org.htmlunit.html.HtmlTeletype;
110 import org.htmlunit.html.HtmlTextArea;
111 import org.htmlunit.html.HtmlTitle;
112 import org.htmlunit.html.HtmlUnderlined;
113 import org.htmlunit.html.HtmlUnorderedList;
114 import org.htmlunit.html.HtmlVariable;
115 import org.htmlunit.html.HtmlVideo;
116 import org.htmlunit.html.HtmlWordBreak;
117 import org.htmlunit.javascript.HtmlUnitScriptable;
118 import org.htmlunit.javascript.configuration.JsxClass;
119 import org.htmlunit.javascript.configuration.JsxConstructor;
120 import org.htmlunit.javascript.configuration.JsxFunction;
121 import org.htmlunit.javascript.host.Element;
122 import org.htmlunit.javascript.host.dom.Document;
123 import org.htmlunit.javascript.host.dom.DocumentFragment;
124 import org.htmlunit.javascript.host.dom.Node;
125 import org.htmlunit.util.StringUtils;
126 import org.w3c.dom.NamedNodeMap;
127 
128 /**
129  * A JavaScript object for {@code XMLSerializer}.
130  * see https://w3c.github.io/DOM-Parsing/#the-xmlserializer-interface
131  *
132  * @author Ahmed Ashour
133  * @author Darrell DeBoer
134  * @author Ronald Brill
135  * @author Frank Danek
136  */
137 @JsxClass
138 public class XMLSerializer extends HtmlUnitScriptable {
139 
140     // this is a bit strange but it is the way FF works
141     // output of empty tags are not allowed for these HTML tags
142     private static final Set<String> NON_EMPTY_TAGS = new HashSet<>(Arrays.asList(
143             HtmlAbbreviated.TAG_NAME, HtmlAcronym.TAG_NAME,
144             HtmlAnchor.TAG_NAME, HtmlAddress.TAG_NAME, HtmlAudio.TAG_NAME,
145             HtmlBidirectionalOverride.TAG_NAME, HtmlBig.TAG_NAME,
146             HtmlBlockQuote.TAG_NAME, HtmlBody.TAG_NAME, HtmlBold.TAG_NAME,
147             HtmlButton.TAG_NAME, HtmlCanvas.TAG_NAME, HtmlCaption.TAG_NAME,
148             HtmlCenter.TAG_NAME, HtmlCitation.TAG_NAME, HtmlCode.TAG_NAME,
149             HtmlDefinition.TAG_NAME, HtmlDefinitionDescription.TAG_NAME,
150             HtmlDeletedText.TAG_NAME, HtmlDirectory.TAG_NAME,
151             HtmlDivision.TAG_NAME,
152             HtmlDefinitionList.TAG_NAME,
153             HtmlDefinitionTerm.TAG_NAME, HtmlEmbed.TAG_NAME,
154             HtmlEmphasis.TAG_NAME, HtmlFieldSet.TAG_NAME,
155             HtmlFont.TAG_NAME, HtmlForm.TAG_NAME,
156             HtmlFrame.TAG_NAME, HtmlFrameSet.TAG_NAME, HtmlHeading1.TAG_NAME,
157             HtmlHeading2.TAG_NAME, HtmlHeading3.TAG_NAME,
158             HtmlHeading4.TAG_NAME, HtmlHeading5.TAG_NAME,
159             HtmlHeading6.TAG_NAME, HtmlHead.TAG_NAME, HtmlHeadingGroup.TAG_NAME,
160             HtmlHtml.TAG_NAME, HtmlInlineFrame.TAG_NAME,
161             HtmlInsertedText.TAG_NAME,
162             HtmlItalic.TAG_NAME, HtmlKeyboard.TAG_NAME, HtmlLabel.TAG_NAME,
163             HtmlLegend.TAG_NAME, HtmlListing.TAG_NAME, HtmlListItem.TAG_NAME,
164             HtmlMap.TAG_NAME, HtmlMarquee.TAG_NAME,
165             HtmlMenu.TAG_NAME,
166             HtmlNoBreak.TAG_NAME, HtmlNoEmbed.TAG_NAME, HtmlNoFrames.TAG_NAME,
167             HtmlNoScript.TAG_NAME, HtmlObject.TAG_NAME, HtmlOrderedList.TAG_NAME,
168             HtmlOptionGroup.TAG_NAME, HtmlOption.TAG_NAME, HtmlParagraph.TAG_NAME,
169             HtmlPlainText.TAG_NAME, HtmlPreformattedText.TAG_NAME,
170             HtmlInlineQuotation.TAG_NAME, HtmlS.TAG_NAME, HtmlSample.TAG_NAME,
171             HtmlScript.TAG_NAME, HtmlSelect.TAG_NAME, HtmlSmall.TAG_NAME,
172             HtmlSource.TAG_NAME, HtmlSpan.TAG_NAME,
173             HtmlStrike.TAG_NAME, HtmlStrong.TAG_NAME, HtmlStyle.TAG_NAME,
174             HtmlSubscript.TAG_NAME, HtmlSuperscript.TAG_NAME, HtmlTitle.TAG_NAME,
175             HtmlTable.TAG_NAME, HtmlTableColumn.TAG_NAME, HtmlTableColumnGroup.TAG_NAME,
176             HtmlTableBody.TAG_NAME, HtmlTableDataCell.TAG_NAME, HtmlTableHeaderCell.TAG_NAME,
177             HtmlTableRow.TAG_NAME, HtmlTextArea.TAG_NAME, HtmlTableFooter.TAG_NAME,
178             HtmlTableHeader.TAG_NAME, HtmlTeletype.TAG_NAME, HtmlUnderlined.TAG_NAME,
179             HtmlUnorderedList.TAG_NAME, HtmlVariable.TAG_NAME, HtmlVideo.TAG_NAME,
180             HtmlWordBreak.TAG_NAME, HtmlExample.TAG_NAME
181     ));
182 
183     /**
184      * JavaScript constructor.
185      */
186     @JsxConstructor
187     public void jsConstructor() {
188         // nothing to do
189     }
190 
191     /**
192      * The subtree rooted by the specified element is serialized to a string.
193      * @param root the root of the subtree to be serialized (this may be any node, even a document)
194      * @return the serialized string
195      */
196     @JsxFunction
197     public String serializeToString(Node root) {
198         if (root == null) {
199             return "";
200         }
201 
202         if (root instanceof DocumentFragment) {
203             Node node = root.getFirstChild();
204             if (node == null) {
205                 return "";
206             }
207 
208             final StringBuilder builder = new StringBuilder();
209             while (node != null) {
210                 builder.append(serializeToString(node));
211                 node = node.getNextSibling();
212             }
213             return builder.toString().trim();
214         }
215 
216         final boolean rootIsDocument = root instanceof Document;
217         if (rootIsDocument) {
218             root = ((Document) root).getDocumentElement();
219         }
220 
221         if (root instanceof Element) {
222             final StringBuilder builder = new StringBuilder();
223             final DomNode node = root.getDomNodeOrDie();
224             final SgmlPage page = node.getPage();
225             final boolean isHtmlPage = page != null && page.isHtmlPage();
226 
227             String forcedNamespace = null;
228             if (!rootIsDocument && isHtmlPage) {
229                 forcedNamespace = "http://www.w3.org/1999/xhtml";
230             }
231             toXml(1, node, builder, forcedNamespace);
232 
233             return builder.toString();
234         }
235 
236         return root.getDomNodeOrDie().asXml();
237     }
238 
239     private void toXml(final int indent,
240             final DomNode node, final StringBuilder builder, final String foredNamespace) {
241         final String nodeName = node.getNodeName();
242         builder.append('<').append(nodeName);
243 
244         String optionalPrefix = "";
245         final String namespaceURI = node.getNamespaceURI();
246         final String prefix = node.getPrefix();
247         if (namespaceURI != null && prefix != null) {
248             boolean sameNamespace = false;
249             for (DomNode parentNode = node.getParentNode(); parentNode instanceof DomElement;
250                     parentNode = parentNode.getParentNode()) {
251                 if (namespaceURI.equals(parentNode.getNamespaceURI())) {
252                     sameNamespace = true;
253                     break;
254                 }
255             }
256             if (node.getParentNode() == null || !sameNamespace) {
257                 ((DomElement) node).setAttribute("xmlns:" + prefix, namespaceURI);
258             }
259         }
260         else if (foredNamespace != null) {
261             builder.append(" xmlns=\"").append(foredNamespace).append('"');
262             optionalPrefix = " ";
263         }
264 
265         final NamedNodeMap attributesMap = node.getAttributes();
266         final int length = attributesMap.getLength();
267         for (int i = 0; i < length; i++) {
268             final DomAttr attrib = (DomAttr) attributesMap.item(i);
269             builder.append(' ').append(attrib.getQualifiedName())
270                    .append("=\"").append(attrib.getValue()).append('"');
271         }
272         boolean startTagClosed = false;
273         for (final DomNode child : node.getChildren()) {
274             if (!startTagClosed) {
275                 builder.append(optionalPrefix).append('>');
276                 startTagClosed = true;
277             }
278             switch (child.getNodeType()) {
279                 case Node.ELEMENT_NODE:
280                     toXml(indent + 1, child, builder, null);
281                     break;
282 
283                 case Node.TEXT_NODE:
284                     String value = child.getNodeValue();
285                     value = StringUtils.escapeXmlChars(value);
286                     builder.append(value);
287                     break;
288 
289                 case Node.CDATA_SECTION_NODE:
290                 case Node.COMMENT_NODE:
291                     builder.append(child.asXml());
292                     break;
293 
294                 default:
295                     break;
296             }
297         }
298 
299         if (startTagClosed) {
300             builder.append("</").append(nodeName).append('>');
301         }
302         else {
303             final String tagName = StringUtils.toRootLowerCase(nodeName);
304             if (NON_EMPTY_TAGS.contains(tagName)) {
305                 builder.append("></").append(nodeName).append('>');
306             }
307             else {
308                 builder.append(optionalPrefix).append("/>");
309             }
310         }
311     }
312 
313 }