View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html;
16  
17  import static java.nio.charset.StandardCharsets.ISO_8859_1;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.OutputStream;
23  import java.net.URL;
24  import java.nio.charset.Charset;
25  import java.nio.file.Files;
26  import java.util.HashMap;
27  import java.util.Map;
28  import java.util.regex.Pattern;
29  
30  import org.apache.commons.io.FileUtils;
31  import org.apache.commons.io.IOUtils;
32  import org.apache.commons.lang3.StringUtils;
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.htmlunit.Page;
36  import org.htmlunit.SgmlPage;
37  import org.htmlunit.WebResponse;
38  import org.htmlunit.util.MimeType;
39  
40  /**
41   * Utility to handle conversion from HTML code to XML string.
42   * @author Ahmed Ashour
43   * @author Ronald Brill
44   * @author Marc Guillemot
45   */
46  public class XmlSerializer {
47  
    /** Separator used when building the relative paths written into the rewritten attributes. */
    private static final String FILE_SEPARATOR = "/";
    /** Matches everything up to and including the last slash; used to strip the path part of an URL. */
    private static final Pattern CREATE_FILE_PATTERN = Pattern.compile(".*/");

    private static final Log LOG = LogFactory.getLog(XmlSerializer.class);

    /** Collects the output of the running serialization; reset by save, asXml and asText. */
    private final StringBuilder builder_ = new StringBuilder();
    /** The indentation of the current nesting level; two spaces are added per level. */
    private final StringBuilder indent_ = new StringBuilder();
    /** The directory receiving the saved resources; only assigned by save. */
    private File outputDir_;
56  
    /**
     * Saves the given {@link SgmlPage} to the file.
     * If the name of the file does not end with {@code .htm} or {@code .html},
     * {@code .html} is appended to the name. An already existing file is not overwritten.
     * @param page the page to save
     * @param file the destination
     * @throws IOException in case of error or if the destination file already exists
     */
    public void save(final SgmlPage page, final File file) throws IOException {
        // append == false: enforce the html extension and refuse to touch an existing file
        save(page, file, false);
    }
66  
67      private void save(final SgmlPage page, final File file, final boolean append) throws IOException {
68          String fileName = file.getName();
69  
70          if (!append) {
71              if (!fileName.endsWith(".htm") && !fileName.endsWith(".html")) {
72                  fileName += ".html";
73              }
74          }
75          final File outputFile = new File(file.getParentFile(), fileName);
76  
77          if (!append && outputFile.exists()) {
78              throw new IOException("File already exists: " + outputFile);
79          }
80          fileName = fileName.substring(0, fileName.lastIndexOf('.'));
81          outputDir_ = new File(file.getParentFile(), fileName);
82  
83          // don't use asXml here because we have to sync the encoding from the
84          // header with the one used by the writer
85          final DomElement node = page.getDocumentElement();
86          Charset charsetName = ISO_8859_1;
87          builder_.setLength(0);
88          indent_.setLength(0);
89          if (page.isHtmlPage()) {
90              charsetName = page.getCharset();
91              if (charsetName != null && node instanceof HtmlHtml) {
92                  builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
93              }
94          }
95          printXml(node);
96          final String response = builder_.toString();
97          builder_.setLength(0);
98          FileUtils.writeStringToFile(outputFile, response, charsetName, append);
99      }
100 
101     /**
102      * @param node a node
103      * @return the xml representation according to the setting of this serializer
104      * @throws IOException in case of problem saving resources
105      */
106     public String asXml(final DomElement node) throws IOException {
107         builder_.setLength(0);
108         indent_.setLength(0);
109         final SgmlPage page = node.getPage();
110         if (null != page && page.isHtmlPage()) {
111             final Charset charsetName = page.getCharset();
112             if (charsetName != null && node instanceof HtmlHtml) {
113                 builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
114             }
115         }
116         printXml(node);
117         final String response = builder_.toString();
118         builder_.setLength(0);
119         return response;
120     }
121 
122     protected void printXml(final DomElement node) throws IOException {
123         if (!isExcluded(node)) {
124             final boolean hasChildren = node.getFirstChild() != null;
125             builder_.append(indent_).append('<');
126             printOpeningTag(node);
127 
128             if (hasChildren || node.isEmptyXmlTagExpanded()) {
129                 builder_.append(">\n");
130                 for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
131                     indent_.append("  ");
132                     if (child instanceof DomElement element) {
133                         printXml(element);
134                     }
135                     else {
136                         builder_.append(child);
137                     }
138                     indent_.setLength(indent_.length() - 2);
139                 }
140                 builder_.append(indent_).append("</").append(node.getTagName()).append(">\n");
141             }
142             else {
143                 builder_.append("/>\n");
144             }
145         }
146     }
147 
148     /**
149      * @param node a node
150      * @return the text representation according to the setting of this serializer
151      */
152     public String asText(final DomNode node) {
153         builder_.setLength(0);
154 
155         if (node instanceof DomText text) {
156             builder_.append(text.getData());
157         }
158         else {
159             printText(node);
160         }
161 
162         final String response = builder_.toString();
163         builder_.setLength(0);
164         return response;
165     }
166 
167     /**
168      * Prints the text content from this node and all children.
169      * @param node the node
170      */
171     protected void printText(final DomNode node) {
172         for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
173             if (child instanceof DomText text) {
174                 builder_.append(text.getData());
175             }
176             else {
177                 printText(child);
178             }
179         }
180     }
181 
182     /**
183      * Prints the content between "&lt;" and "&gt;" (or "/&gt;") in the output of the tag name
184      * and its attributes in XML format.
185      * @param node the node whose opening tag is to be printed
186      * @throws IOException in case of problem saving resources
187      */
188     protected void printOpeningTag(final DomElement node) throws IOException {
189         builder_.append(node.getTagName());
190         final Map<String, DomAttr> attributes = readAttributes(node);
191 
192         for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
193             builder_.append(' ')
194                 .append(entry.getKey())
195                 .append("=\"");
196             final String value = entry.getValue().getNodeValue();
197             builder_.append(org.htmlunit.util.StringUtils.escapeXmlAttributeValue(value))
198                 .append('"');
199         }
200     }
201 
202     private Map<String, DomAttr> readAttributes(final DomElement node) throws IOException {
203         if (node instanceof HtmlImage image) {
204             return getAttributesFor(image);
205         }
206         else if (node instanceof HtmlLink link) {
207             return getAttributesFor(link);
208         }
209         else if (node instanceof BaseFrameElement element) {
210             return getAttributesFor(element);
211         }
212 
213         Map<String, DomAttr> attributes = node.getAttributesMap();
214         if (node instanceof HtmlOption option) {
215             attributes = new HashMap<>(attributes);
216             if (option.isSelected()) {
217                 if (!attributes.containsKey("selected")) {
218                     attributes.put("selected", new DomAttr(node.getPage(), null, "selected", "selected", false));
219                 }
220             }
221             else {
222                 attributes.remove("selected");
223             }
224         }
225         return attributes;
226     }
227 
228     /**
229      * @param frame the frame to get the attributes from
230      * @return the attribute map
231      */
232     private Map<String, DomAttr> getAttributesFor(final BaseFrameElement frame) throws IOException {
233         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(frame, DomElement.SRC_ATTRIBUTE);
234         final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
235         if (srcAttr == null) {
236             return map;
237         }
238 
239         final Page enclosedPage = frame.getEnclosedPage();
240         final String suffix = getFileExtension(enclosedPage);
241         final File file = createFile(srcAttr.getValue(), "." + suffix);
242 
243         if (enclosedPage != null) {
244             if (enclosedPage.isHtmlPage()) {
245                 new XmlSerializer().save((HtmlPage) enclosedPage, file, true);
246             }
247             else {
248                 try (InputStream is = enclosedPage.getWebResponse().getContentAsStream()) {
249                     try (OutputStream fos = Files.newOutputStream(file.toPath())) {
250                         IOUtils.copyLarge(is, fos);
251                     }
252                 }
253             }
254         }
255 
256         srcAttr.setValue(file.getParentFile().getName() + FILE_SEPARATOR + file.getName());
257         return map;
258     }
259 
260     private static String getFileExtension(final Page enclosedPage) {
261         if (enclosedPage != null) {
262             if (enclosedPage.isHtmlPage()) {
263                 return "html";
264             }
265 
266             final URL url = enclosedPage.getUrl();
267             if (url.getPath().contains(".")) {
268                 return StringUtils.substringAfterLast(url.getPath(), ".");
269             }
270         }
271 
272         return ".unknown";
273     }
274 
275     /**
276      * @param link the link to get the attributes from
277      * @return the attribute map
278      * @throws IOException in case of error
279      */
280     protected Map<String, DomAttr> getAttributesFor(final HtmlLink link) throws IOException {
281         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(link, "href");
282         final DomAttr hrefAttr = map.get("href");
283         if (hrefAttr != null && org.htmlunit.util.StringUtils.isNotBlank(hrefAttr.getValue())) {
284             final String protocol = link.getWebRequest().getUrl().getProtocol();
285             if ("http".equals(protocol) || "https".equals(protocol)) {
286                 try {
287                     final WebResponse response = link.getWebResponse(true, null, false, null);
288 
289                     final File file = createFile(hrefAttr.getValue(), ".css");
290                     FileUtils.writeStringToFile(file, response.getContentAsString(), ISO_8859_1);
291                     hrefAttr.setValue(outputDir_.getName() + FILE_SEPARATOR + file.getName());
292                 }
293                 catch (final IOException e) {
294                     LOG.error("XmlSerializer: IOException while downloading link content from url '"
295                                 + hrefAttr + "'", e);
296                 }
297                 catch (final IllegalStateException e) {
298                     LOG.error("XmlSerializer: IllegalStateException while downloading link content from url '"
299                                 + hrefAttr + "'", e);
300                 }
301             }
302         }
303 
304         return map;
305     }
306 
307     /**
308      * @param image the image to get the attributes from
309      * @return the attribute map
310      */
311     protected Map<String, DomAttr> getAttributesFor(final HtmlImage image) {
312         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(image, DomElement.SRC_ATTRIBUTE);
313         final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
314         if (srcAttr != null && org.htmlunit.util.StringUtils.isNotBlank(srcAttr.getValue())) {
315             try {
316                 final WebResponse response = image.getWebResponse(true);
317 
318                 try (InputStream inputStream = response.getContentAsStream()) {
319                     final File file = createFile(srcAttr.getValue(), "." + getSuffix(response));
320                     FileUtils.copyInputStreamToFile(inputStream, file);
321 
322                     final String valueOnFileSystem = outputDir_.getName() + FILE_SEPARATOR + file.getName();
323                     // this is the clone attribute node, not the original one of the page
324                     srcAttr.setValue(valueOnFileSystem);
325                 }
326             }
327             catch (final IOException e) {
328                 LOG.error("XmlSerializer: IOException while downloading image content from url '" + srcAttr + "'", e);
329             }
330             catch (final IllegalStateException e) {
331                 LOG.error("XmlSerializer: IllegalStateException while downloading image content from url '"
332                             + srcAttr + "'", e);
333             }
334         }
335 
336         return map;
337     }
338 
339     private static String getSuffix(final WebResponse response) {
340         // first try to take the one from the requested file
341         final String url = response.getWebRequest().getUrl().toString();
342         final String fileName =
343                 StringUtils.substringAfterLast(org.htmlunit.util.StringUtils.substringBefore(url, "?"), "/");
344         // if there is a suffix with 2-4 letters, the take it
345         final String suffix = StringUtils.substringAfterLast(fileName, ".");
346         if (suffix.length() > 1 && suffix.length() < 5) {
347             return suffix;
348         }
349 
350         // use content type
351         return MimeType.getFileExtension(response.getContentType());
352     }
353 
354     private static Map<String, DomAttr> createAttributesCopyWithClonedAttribute(final HtmlElement elt,
355             final String attrName) {
356         final Map<String, DomAttr> newMap = new HashMap<>(elt.getAttributesMap());
357 
358         // clone the specified element, if possible
359         final DomAttr attr = newMap.get(attrName);
360         if (null == attr) {
361             return newMap;
362         }
363 
364         final DomAttr clonedAttr = new DomAttr(attr.getPage(), attr.getNamespaceURI(),
365             attr.getQualifiedName(), attr.getValue(), attr.getSpecified());
366 
367         newMap.put(attrName, clonedAttr);
368 
369         return newMap;
370     }
371 
    /**
     * Decides whether the given element has to be left out of the XML output.
     * Excluded elements are skipped by {@link #printXml(DomElement)} together with
     * all their children. This implementation excludes script elements.
     * @param element the element to check
     * @return true if the element is a HtmlScript
     */
    protected boolean isExcluded(final DomElement element) {
        return element instanceof HtmlScript;
    }
379 
380     /**
381      * Computes the best file to save the response to the given URL.
382      * @param url the requested URL
383      * @param extension the preferred extension
384      * @return the file to create
385      * @throws IOException if a problem occurs creating the file
386      */
387     private File createFile(final String url, final String extension) throws IOException {
388         String name = url.replaceFirst("/$", "");
389         name = CREATE_FILE_PATTERN.matcher(name).replaceAll("");
390         name = org.htmlunit.util.StringUtils.substringBefore(name, "?"); // remove query
391         name = org.htmlunit.util.StringUtils.substringBefore(name, ";"); // remove additional info
392         name = StringUtils.substring(name, 0, 30); // many file systems have a limit at 255, let's limit it
393         name = org.htmlunit.util.StringUtils.sanitizeForFileName(name);
394         if (!name.endsWith(extension)) {
395             name += extension;
396         }
397         int counter = 0;
398         while (true) {
399             final String fileName;
400             if (counter == 0) {
401                 fileName = name;
402             }
403             else {
404                 fileName = StringUtils.substringBeforeLast(name, ".")
405                         + "_" + counter + "." + StringUtils.substringAfterLast(name, ".");
406             }
407             FileUtils.forceMkdir(outputDir_);
408             final File f = new File(outputDir_, fileName);
409             if (f.createNewFile()) {
410                 return f;
411             }
412             counter++;
413         }
414     }
415 }