View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.serializer;
16  
17  import static org.htmlunit.BrowserVersionFeatures.JS_INNER_TEXT_SVG_NL;
18  
19  import org.htmlunit.BrowserVersion;
20  import org.htmlunit.SgmlPage;
21  import org.htmlunit.WebWindow;
22  import org.htmlunit.css.ComputedCssStyleDeclaration;
23  import org.htmlunit.css.StyleAttributes.Definition;
24  import org.htmlunit.html.DomElement;
25  import org.htmlunit.html.DomNode;
26  import org.htmlunit.html.DomText;
27  import org.htmlunit.html.HtmlBreak;
28  import org.htmlunit.html.HtmlDefinitionTerm;
29  import org.htmlunit.html.HtmlDetails;
30  import org.htmlunit.html.HtmlHead;
31  import org.htmlunit.html.HtmlListItem;
32  import org.htmlunit.html.HtmlNoFrames;
33  import org.htmlunit.html.HtmlParagraph;
34  import org.htmlunit.html.HtmlScript;
35  import org.htmlunit.html.HtmlStyle;
36  import org.htmlunit.html.HtmlSummary;
37  import org.htmlunit.html.HtmlSvg;
38  import org.htmlunit.html.HtmlTextArea;
39  import org.htmlunit.html.HtmlTitle;
40  import org.htmlunit.html.ScriptElement;
41  import org.htmlunit.html.serializer.HtmlSerializerInnerOuterText.HtmlSerializerTextBuilder.Mode;
42  import org.htmlunit.svg.SvgTitle;
43  import org.htmlunit.util.StringUtils;
44  
45  /**
46   * Special serializer to generate the output we need
47   * for innerText and outerText.
48   *
49   * @author Ronald Brill
50   */
51  public class HtmlSerializerInnerOuterText {
52  
53      private final BrowserVersion browserVersion_;
54  
55      /**
56       * Ctor.
57       *
58       * @param browserVersion the {@link BrowserVersion}
59       */
60      public HtmlSerializerInnerOuterText(final BrowserVersion browserVersion) {
61          super();
62          browserVersion_ = browserVersion;
63      }
64  
65      /**
66       * Converts an HTML node to text.
67       * @param node a node
68       * @return the text representation according to the setting of this serializer
69       */
70      public String asText(final DomNode node) {
71          if (node instanceof HtmlBreak) {
72              return "";
73          }
74  
75          // included scripts are ignored, but if we ask for the script itself....
76          if (node instanceof ScriptElement) {
77              final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
78              appendChildren(builder, node, Mode.WHITE_SPACE_NORMAL, false);
79              return builder.getText();
80          }
81  
82          // when calling on the title itself we have to output
83          final boolean insideHead = node instanceof HtmlTitle;
84  
85          final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
86          appendNode(builder, node, whiteSpaceStyle(node, Mode.WHITE_SPACE_NORMAL), insideHead);
87          return builder.getText();
88      }
89  
90      /**
91       * Iterate over all Children and call appendNode() for every.
92       *
93       * @param builder the StringBuilder to add to
94       * @param node the node to process
95       * @param mode the {@link Mode} to use for processing
96       * @param insideHead true if inside head section
97       */
98      protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node,
99              final Mode mode, final boolean insideHead) {
100         for (final DomNode child : node.getChildren()) {
101             appendNode(builder, child, mode, insideHead);
102         }
103     }
104 
105     /**
106      * The core distribution method call the different appendXXX
107      * methods depending on the type of the given node.
108      *
109      * @param builder the StringBuilder to add to
110      * @param node the node to process
111      * @param mode the {@link Mode} to use for processing
112      * @param insideHead true if inside head section
113      */
114     protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node,
115             final Mode mode, final boolean insideHead) {
116         if (node instanceof DomText text) {
117             appendText(builder, text, mode);
118         }
119         else if (node instanceof HtmlBreak break1) {
120             appendBreak(builder, break1);
121         }
122         else if (node instanceof HtmlParagraph paragraph) {
123             appendParagraph(builder, paragraph, mode, insideHead);
124         }
125         else if (node instanceof HtmlListItem item) {
126             appendListItem(builder, item, mode, insideHead);
127         }
128         else if (node instanceof HtmlDetails details) {
129             appendDetails(builder, details, mode, insideHead);
130         }
131         else if (node instanceof HtmlHead) {
132             appendChildren(builder, node, mode, true);
133         }
134         else if (node instanceof HtmlNoFrames) {
135             appendChildren(builder, node, Mode.PLAIN, insideHead);
136         }
137         else if (node instanceof HtmlTitle && !insideHead) {
138             // nothing to do
139         }
140         else if (node instanceof HtmlTextArea) {
141             // nothing to do
142         }
143         else if (node instanceof ScriptElement) {
144             if (insideHead) {
145                 appendChildren(builder, node, mode, insideHead);
146             }
147         }
148         else if (node instanceof HtmlDefinitionTerm item) {
149             appendDefinitionTerm(builder, item, mode, insideHead);
150         }
151         else if (node instanceof HtmlSvg) {
152             if (browserVersion_.hasFeature(JS_INNER_TEXT_SVG_NL)) {
153                 builder.appendRequiredLineBreak();
154                 appendChildren(builder, node, mode, insideHead);
155                 builder.appendRequiredLineBreak();
156             }
157             else {
158                 appendChildren(builder, node, mode, insideHead);
159             }
160         }
161         else if (node instanceof SvgTitle) {
162             // nothing to do
163         }
164         else {
165             appendChildren(builder, node, mode, insideHead);
166         }
167     }
168 
169     /**
170      * Process {@link DomText}.
171      *
172      * @param builder the StringBuilder to add to
173      * @param domText the target to process
174      * @param mode the {@link Mode} to use for processing
175      */
176     protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText, final Mode mode) {
177         final DomNode parent = domText.getParentNode();
178         if (parent instanceof HtmlTitle
179                 || parent instanceof HtmlStyle
180                 || parent instanceof HtmlScript) {
181             builder.append(domText.getData(), Mode.PLAIN);
182             return;
183         }
184 
185         if (parent == null
186                 || parent instanceof HtmlNoFrames
187                 || parent.isDisplayed()) {
188             builder.append(domText.getData(), mode);
189         }
190     }
191 
192     /**
193      * Process {@link HtmlBreak}.
194      *
195      * @param builder the StringBuilder to add to
196      * @param htmlBreak the target to process
197      */
198     protected void appendBreak(final HtmlSerializerTextBuilder builder,
199             final HtmlBreak htmlBreak) {
200         builder.appendRequiredLineBreak();
201     }
202 
203     /**
204      * Process {@link HtmlListItem}.
205      *
206      * @param builder the StringBuilder to add to
207      * @param htmlParagraph the target to process
208      * @param mode the {@link Mode} to use for processing
209      * @param insideHead true if inside head section
210      */
211     protected void appendParagraph(final HtmlSerializerTextBuilder builder,
212             final HtmlParagraph htmlParagraph, final Mode mode, final boolean insideHead) {
213         builder.appendRequiredLineBreak();
214         appendChildren(builder, htmlParagraph, mode, insideHead);
215         builder.appendRequiredLineBreak();
216     }
217 
218     /**
219      * Process {@link HtmlListItem}.
220      *
221      * @param builder the StringBuilder to add to
222      * @param htmlListItem the target to process
223      * @param mode the {@link Mode} to use for processing
224      * @param insideHead true if inside head section
225      */
226     protected void appendListItem(final HtmlSerializerTextBuilder builder,
227             final HtmlListItem htmlListItem, final Mode mode, final boolean insideHead) {
228         builder.appendRequiredLineBreak();
229         appendChildren(builder, htmlListItem, mode, insideHead);
230         builder.appendRequiredLineBreak();
231     }
232 
233     /**
234      * Process {@link HtmlDetails}.
235      * @param builder the StringBuilder to add to
236      * @param htmlDetails the target to process
237      * @param mode the {@link Mode} to use for processing
238      * @param insideHead true if inside head section
239      */
240     protected void appendDetails(final HtmlSerializerTextBuilder builder,
241                     final HtmlDetails htmlDetails, final Mode mode, final boolean insideHead) {
242         if (htmlDetails.isOpen()) {
243             appendChildren(builder, htmlDetails, mode, insideHead);
244             return;
245         }
246 
247         for (final DomNode child : htmlDetails.getChildren()) {
248             if (child instanceof HtmlSummary) {
249                 appendNode(builder, child, mode, insideHead);
250             }
251         }
252     }
253 
254     /**
255      * Process {@link HtmlDefinitionTerm}.
256      *
257      * @param builder the StringBuilder to add to
258      * @param htmlDefinitionTerm the target to process
259      * @param mode the {@link Mode} to use for processing
260      * @param insideHead true if inside head section
261      */
262     protected void appendDefinitionTerm(final HtmlSerializerTextBuilder builder,
263             final HtmlDefinitionTerm htmlDefinitionTerm, final Mode mode, final boolean insideHead) {
264         builder.appendRequiredLineBreak();
265         appendChildren(builder, htmlDefinitionTerm, mode, insideHead);
266         builder.appendRequiredLineBreak();
267     }
268 
269     private static Mode whiteSpaceStyle(final DomNode domNode, final Mode defaultMode) {
270         if (domNode instanceof DomElement) {
271             final SgmlPage page = domNode.getPage();
272             if (page != null) {
273                 if (page.getWebClient().getOptions().isCssEnabled()) {
274                     DomNode node = domNode;
275                     while (node != null) {
276                         if (node instanceof DomElement) {
277                             final WebWindow window = page.getEnclosingWindow();
278                             if (window != null) {
279                                 final ComputedCssStyleDeclaration style =
280                                         window.getComputedStyle((DomElement) domNode, null);
281                                 final String value = style.getStyleAttribute(Definition.WHITE_SPACE, false);
282 
283                                 if (!StringUtils.isEmptyOrNull(value)) {
284                                     if ("normal".equalsIgnoreCase(value)) {
285                                         return Mode.WHITE_SPACE_NORMAL;
286                                     }
287                                     if ("nowrap".equalsIgnoreCase(value)) {
288                                         return Mode.WHITE_SPACE_NORMAL;
289                                     }
290                                     if ("pre".equalsIgnoreCase(value)) {
291                                         return Mode.WHITE_SPACE_PRE;
292                                     }
293                                     if ("pre-wrap".equalsIgnoreCase(value)) {
294                                         return Mode.WHITE_SPACE_PRE;
295                                     }
296                                     if ("pre-line".equalsIgnoreCase(value)) {
297                                         return Mode.WHITE_SPACE_PRE_LINE;
298                                     }
299                                 }
300                             }
301                         }
302                         node = node.getParentNode();
303                     }
304                 }
305             }
306         }
307         return defaultMode;
308     }
309 
310     /**
311      * Helper to compose the text for the serializer based on several modes.
312      */
313     protected static class HtmlSerializerTextBuilder {
314 
315         /** Mode. */
316         protected enum Mode {
317             /**
318              * The mode for plain.
319              */
320             PLAIN,
321 
322             /**
323              * Sequences of white space are collapsed. Newline characters
324              * in the source are handled the same as other white space.
325              * Lines are broken as necessary to fill line boxes.
326              */
327             WHITE_SPACE_NORMAL,
328 
329             /**
330              * Sequences of white space are preserved. Lines are only broken
331              * at newline characters in the source and at <br> elements.
332              */
333             WHITE_SPACE_PRE,
334 
335             /**
336              * Sequences of white space are collapsed. Lines are broken
337              * at newline characters, at <br> and as necessary
338              * to fill line boxes.
339              */
340             WHITE_SPACE_PRE_LINE
341         }
342 
343         private enum State {
344             DEFAULT,
345             EMPTY,
346             BLANK_AT_END,
347             BLANK_AT_END_AFTER_NEWLINE,
348             NEWLINE_AT_END,
349             BREAK_AT_END,
350             BLOCK_SEPARATOR_AT_END,
351             REQUIRED_LINE_BREAK_AT_END
352         }
353 
354         private State state_;
355         private final StringBuilder builder_;
356         private int trimRightPos_;
357 
358         /**
359          * Ctor.
360          */
361         public HtmlSerializerTextBuilder() {
362             builder_ = new StringBuilder();
363             state_ = State.EMPTY;
364             trimRightPos_ = 0;
365         }
366 
367         /**
368          * Append a line separator.
369          */
370         public void appendRequiredLineBreak() {
371             if (state_ == State.EMPTY) {
372                 return;
373             }
374 
375             // trimRight
376             builder_.setLength(trimRightPos_);
377             if (trimRightPos_ == 0) {
378                 state_ = State.EMPTY;
379             }
380 
381             builder_.append('\n');
382             state_ = State.REQUIRED_LINE_BREAK_AT_END;
383         }
384 
385         /**
386          * Append the provided content.
387          * see https://drafts.csswg.org/css-text-3/#white-space
388          *
389          * @param content the content to add
390          * @param mode the {@link Mode}
391          */
392         public void append(final String content, final Mode mode) {
393             if (content == null) {
394                 return;
395             }
396             int length = content.length();
397             if (length == 0) {
398                 return;
399             }
400 
401             if (mode == Mode.PLAIN) {
402                 builder_.append(content);
403                 state_ = State.DEFAULT;
404                 trimRightPos_ = builder_.length();
405                 return;
406             }
407 
408             length--;
409             final int contentLength = content.length();
410             for (int i = 0; i < contentLength; i++) {
411                 char c = content.charAt(i);
412 
413                 // handle \r
414                 if (c == '\r') {
415                     if (length != i) {
416                         continue;
417                     }
418                     c = '\n';
419                 }
420 
421                 if (c == '\n') {
422                     if (mode == Mode.WHITE_SPACE_PRE) {
423                         switch (state_) {
424                             case EMPTY:
425                             case BLOCK_SEPARATOR_AT_END:
426                                 break;
427                             default:
428                                 builder_.append('\n');
429                                 state_ = State.NEWLINE_AT_END;
430                                 trimRightPos_ = builder_.length();
431                                 break;
432                         }
433                         continue;
434                     }
435 
436                     if (mode == Mode.WHITE_SPACE_PRE_LINE) {
437                         switch (state_) {
438                             case EMPTY:
439                             case BLOCK_SEPARATOR_AT_END:
440                                 break;
441                             case BLANK_AT_END:
442                                 builder_.setLength(trimRightPos_);
443                                 builder_.append('\n');
444                                 state_ = State.NEWLINE_AT_END;
445                                 trimRightPos_ = builder_.length();
446                                 break;
447                             default:
448                                 builder_.append('\n');
449                                 state_ = State.NEWLINE_AT_END;
450                                 trimRightPos_ = builder_.length();
451                                 break;
452                         }
453                         continue;
454                     }
455 
456                     switch (state_) {
457                         case EMPTY:
458                         case BLANK_AT_END:
459                         case BLANK_AT_END_AFTER_NEWLINE:
460                         case BLOCK_SEPARATOR_AT_END:
461                         case NEWLINE_AT_END:
462                         case BREAK_AT_END:
463                         case REQUIRED_LINE_BREAK_AT_END:
464                             break;
465                         default:
466                             builder_.append(' ');
467                             state_ = State.BLANK_AT_END;
468                             break;
469                     }
470                     continue;
471                 }
472 
473                 if (c == ' ' || c == '\t' || c == '\f') {
474                     if (mode == Mode.WHITE_SPACE_PRE) {
475                         if (c == '\t') {
476                             builder_.append('\t');
477                         }
478                         else {
479                             builder_.append(' ');
480                         }
481                         state_ = State.BLANK_AT_END;
482                         trimRightPos_ = builder_.length();
483 
484                         continue;
485                     }
486 
487                     if (mode == Mode.WHITE_SPACE_PRE_LINE) {
488                         switch (state_) {
489                             case EMPTY:
490                             case BLANK_AT_END:
491                             case BLANK_AT_END_AFTER_NEWLINE:
492                             case BREAK_AT_END:
493                             case NEWLINE_AT_END:
494                                 break;
495                             default:
496                                 builder_.append(' ');
497                                 state_ = State.BLANK_AT_END;
498                                 break;
499                         }
500                         continue;
501                     }
502 
503                     switch (state_) {
504                         case EMPTY:
505                         case BLANK_AT_END:
506                         case BLANK_AT_END_AFTER_NEWLINE:
507                         case BLOCK_SEPARATOR_AT_END:
508                         case NEWLINE_AT_END:
509                         case BREAK_AT_END:
510                         case REQUIRED_LINE_BREAK_AT_END:
511                             break;
512                         default:
513                             builder_.append(' ');
514                             state_ = State.BLANK_AT_END;
515                             break;
516                     }
517                     continue;
518                 }
519 
520                 builder_.append(c);
521                 state_ = State.DEFAULT;
522                 trimRightPos_ = builder_.length();
523             }
524         }
525 
526         /**
527          * @return the constructed text.
528          */
529         public String getText() {
530             return builder_.substring(0, trimRightPos_);
531         }
532     }
533 }