View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.serializer;
16  
17  import static org.htmlunit.BrowserVersionFeatures.JS_INNER_TEXT_SVG_NL;
18  
19  import org.apache.commons.lang3.StringUtils;
20  import org.htmlunit.BrowserVersion;
21  import org.htmlunit.SgmlPage;
22  import org.htmlunit.WebWindow;
23  import org.htmlunit.css.ComputedCssStyleDeclaration;
24  import org.htmlunit.css.StyleAttributes.Definition;
25  import org.htmlunit.html.DomElement;
26  import org.htmlunit.html.DomNode;
27  import org.htmlunit.html.DomText;
28  import org.htmlunit.html.HtmlBreak;
29  import org.htmlunit.html.HtmlDetails;
30  import org.htmlunit.html.HtmlHead;
31  import org.htmlunit.html.HtmlListItem;
32  import org.htmlunit.html.HtmlNoFrames;
33  import org.htmlunit.html.HtmlParagraph;
34  import org.htmlunit.html.HtmlScript;
35  import org.htmlunit.html.HtmlStyle;
36  import org.htmlunit.html.HtmlSummary;
37  import org.htmlunit.html.HtmlSvg;
38  import org.htmlunit.html.HtmlTextArea;
39  import org.htmlunit.html.HtmlTitle;
40  import org.htmlunit.html.ScriptElement;
41  import org.htmlunit.html.serializer.HtmlSerializerInnerOuterText.HtmlSerializerTextBuilder.Mode;
42  import org.htmlunit.svg.SvgTitle;
43  
44  /**
45   * Special serializer to generate the output we need
46   * for innerText and outerText.
47   *
48   * @author Ronald Brill
49   */
50  public class HtmlSerializerInnerOuterText {
51  
52      private final BrowserVersion browserVersion_;
53  
54      /**
55       * Ctor.
56       *
57       * @param browserVersion the {@link BrowserVersion}
58       */
59      public HtmlSerializerInnerOuterText(final BrowserVersion browserVersion) {
60          super();
61          browserVersion_ = browserVersion;
62      }
63  
64      /**
65       * Converts an HTML node to text.
66       * @param node a node
67       * @return the text representation according to the setting of this serializer
68       */
69      public String asText(final DomNode node) {
70          if (node instanceof HtmlBreak) {
71              return "";
72          }
73  
74          // included scripts are ignored, but if we ask for the script itself....
75          if (node instanceof ScriptElement) {
76              final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
77              appendChildren(builder, node, Mode.WHITE_SPACE_NORMAL, false);
78              return builder.getText();
79          }
80  
81          // when calling on the title itself we have to output
82          final boolean insideHead = node instanceof HtmlTitle;
83  
84          final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
85          appendNode(builder, node, whiteSpaceStyle(node, Mode.WHITE_SPACE_NORMAL), insideHead);
86          return builder.getText();
87      }
88  
89      /**
90       * Iterate over all Children and call appendNode() for every.
91       *
92       * @param builder the StringBuilder to add to
93       * @param node the node to process
94       * @param mode the {@link Mode} to use for processing
95       * @param insideHead true if inside head section
96       */
97      protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node,
98              final Mode mode, final boolean insideHead) {
99          for (final DomNode child : node.getChildren()) {
100             appendNode(builder, child, mode, insideHead);
101         }
102     }
103 
104     /**
105      * The core distribution method call the different appendXXX
106      * methods depending on the type of the given node.
107      *
108      * @param builder the StringBuilder to add to
109      * @param node the node to process
110      * @param mode the {@link Mode} to use for processing
111      * @param insideHead true if inside head section
112      */
113     protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node,
114             final Mode mode, final boolean insideHead) {
115         if (node instanceof DomText) {
116             appendText(builder, (DomText) node, mode);
117         }
118         else if (node instanceof HtmlBreak) {
119             appendBreak(builder, (HtmlBreak) node);
120         }
121         else if (node instanceof HtmlParagraph) {
122             appendParagraph(builder, (HtmlParagraph) node, mode, insideHead);
123         }
124         else if (node instanceof HtmlListItem) {
125             appendListItem(builder, (HtmlListItem) node, mode, insideHead);
126         }
127         else if (node instanceof HtmlDetails) {
128             appendDetails(builder, (HtmlDetails) node, mode, insideHead);
129         }
130         else if (node instanceof HtmlHead) {
131             appendChildren(builder, node, mode, true);
132         }
133         else if (node instanceof HtmlNoFrames) {
134             appendChildren(builder, node, Mode.PLAIN, insideHead);
135         }
136         else if (node instanceof HtmlTitle && !insideHead) {
137             // nothing to do
138         }
139         else if (node instanceof HtmlTextArea) {
140             // nothing to do
141         }
142         else if (node instanceof ScriptElement) {
143             if (insideHead) {
144                 appendChildren(builder, node, mode, insideHead);
145             }
146         }
147         else if (node instanceof HtmlSvg) {
148             if (browserVersion_.hasFeature(JS_INNER_TEXT_SVG_NL)) {
149                 builder.appendRequiredLineBreak();
150                 appendChildren(builder, node, mode, insideHead);
151                 builder.appendRequiredLineBreak();
152             }
153             else {
154                 appendChildren(builder, node, mode, insideHead);
155             }
156         }
157         else if (node instanceof SvgTitle) {
158             // nothing to do
159         }
160         else {
161             appendChildren(builder, node, mode, insideHead);
162         }
163     }
164 
165     /**
166      * Process {@link DomText}.
167      *
168      * @param builder the StringBuilder to add to
169      * @param domText the target to process
170      * @param mode the {@link Mode} to use for processing
171      */
172     protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText, final Mode mode) {
173         final DomNode parent = domText.getParentNode();
174         if (parent instanceof HtmlTitle
175                 || parent instanceof HtmlStyle
176                 || parent instanceof HtmlScript) {
177             builder.append(domText.getData(), Mode.PLAIN);
178             return;
179         }
180 
181         if (parent == null
182                 || parent instanceof HtmlNoFrames
183                 || parent.isDisplayed()) {
184             builder.append(domText.getData(), mode);
185         }
186     }
187 
188     /**
189      * Process {@link HtmlBreak}.
190      *
191      * @param builder the StringBuilder to add to
192      * @param htmlBreak the target to process
193      */
194     protected void appendBreak(final HtmlSerializerTextBuilder builder,
195             final HtmlBreak htmlBreak) {
196         builder.appendRequiredLineBreak();
197     }
198 
199     /**
200      * Process {@link HtmlListItem}.
201      *
202      * @param builder the StringBuilder to add to
203      * @param htmlParagraph the target to process
204      * @param mode the {@link Mode} to use for processing
205      * @param insideHead true if inside head section
206      */
207     protected void appendParagraph(final HtmlSerializerTextBuilder builder,
208             final HtmlParagraph htmlParagraph, final Mode mode, final boolean insideHead) {
209         builder.appendRequiredLineBreak();
210         appendChildren(builder, htmlParagraph, mode, insideHead);
211         builder.appendRequiredLineBreak();
212     }
213 
214     /**
215      * Process {@link HtmlListItem}.
216      *
217      * @param builder the StringBuilder to add to
218      * @param htmlListItem the target to process
219      * @param mode the {@link Mode} to use for processing
220      * @param insideHead true if inside head section
221      */
222     protected void appendListItem(final HtmlSerializerTextBuilder builder,
223             final HtmlListItem htmlListItem, final Mode mode, final boolean insideHead) {
224         builder.appendRequiredLineBreak();
225         appendChildren(builder, htmlListItem, mode, insideHead);
226         builder.appendRequiredLineBreak();
227     }
228 
229     /**
230      * Process {@link HtmlDetails}.
231      * @param builder the StringBuilder to add to
232      * @param htmlDetails the target to process
233      * @param mode the {@link Mode} to use for processing
234      * @param insideHead true if inside head section
235      */
236     protected void appendDetails(final HtmlSerializerTextBuilder builder,
237                     final HtmlDetails htmlDetails, final Mode mode, final boolean insideHead) {
238         if (htmlDetails.isOpen()) {
239             appendChildren(builder, htmlDetails, mode, insideHead);
240             return;
241         }
242 
243         for (final DomNode child : htmlDetails.getChildren()) {
244             if (child instanceof HtmlSummary) {
245                 appendNode(builder, child, mode, insideHead);
246             }
247         }
248     }
249 
250     private static Mode whiteSpaceStyle(final DomNode domNode, final Mode defaultMode) {
251         if (domNode instanceof DomElement) {
252             final SgmlPage page = domNode.getPage();
253             if (page != null) {
254                 if (page.getWebClient().getOptions().isCssEnabled()) {
255                     DomNode node = domNode;
256                     while (node != null) {
257                         if (node instanceof DomElement) {
258                             final WebWindow window = page.getEnclosingWindow();
259                             if (window != null) {
260                                 final ComputedCssStyleDeclaration style =
261                                         window.getComputedStyle((DomElement) domNode, null);
262                                 final String value = style.getStyleAttribute(Definition.WHITE_SPACE, false);
263 
264                                 if (StringUtils.isNoneEmpty(value)) {
265                                     if ("normal".equalsIgnoreCase(value)) {
266                                         return Mode.WHITE_SPACE_NORMAL;
267                                     }
268                                     if ("nowrap".equalsIgnoreCase(value)) {
269                                         return Mode.WHITE_SPACE_NORMAL;
270                                     }
271                                     if ("pre".equalsIgnoreCase(value)) {
272                                         return Mode.WHITE_SPACE_PRE;
273                                     }
274                                     if ("pre-wrap".equalsIgnoreCase(value)) {
275                                         return Mode.WHITE_SPACE_PRE;
276                                     }
277                                     if ("pre-line".equalsIgnoreCase(value)) {
278                                         return Mode.WHITE_SPACE_PRE_LINE;
279                                     }
280                                 }
281                             }
282                         }
283                         node = node.getParentNode();
284                     }
285                 }
286             }
287         }
288         return defaultMode;
289     }
290 
291     /**
292      * Helper to compose the text for the serializer based on several modes.
293      */
294     protected static class HtmlSerializerTextBuilder {
295 
296         /** Mode. */
297         protected enum Mode {
298             /**
299              * The mode for plain.
300              */
301             PLAIN,
302 
303             /**
304              * Sequences of white space are collapsed. Newline characters
305              * in the source are handled the same as other white space.
306              * Lines are broken as necessary to fill line boxes.
307              */
308             WHITE_SPACE_NORMAL,
309 
310             /**
311              * Sequences of white space are preserved. Lines are only broken
312              * at newline characters in the source and at <br> elements.
313              */
314             WHITE_SPACE_PRE,
315 
316             /**
317              * Sequences of white space are collapsed. Lines are broken
318              * at newline characters, at <br>, and as necessary
319              * to fill line boxes.
320              */
321             WHITE_SPACE_PRE_LINE
322         }
323 
324         private enum State {
325             DEFAULT,
326             EMPTY,
327             BLANK_AT_END,
328             BLANK_AT_END_AFTER_NEWLINE,
329             NEWLINE_AT_END,
330             BREAK_AT_END,
331             BLOCK_SEPARATOR_AT_END,
332             REQUIRED_LINE_BREAK_AT_END
333         }
334 
335         private State state_;
336         private final StringBuilder builder_;
337         private int trimRightPos_;
338 
339         /**
340          * Ctor.
341          */
342         public HtmlSerializerTextBuilder() {
343             builder_ = new StringBuilder();
344             state_ = State.EMPTY;
345             trimRightPos_ = 0;
346         }
347 
348         /**
349          * Append a line separator.
350          */
351         public void appendRequiredLineBreak() {
352             if (state_ == State.EMPTY) {
353                 return;
354             }
355 
356             // trimRight
357             builder_.setLength(trimRightPos_);
358             if (trimRightPos_ == 0) {
359                 state_ = State.EMPTY;
360             }
361 
362             builder_.append('\n');
363             state_ = State.REQUIRED_LINE_BREAK_AT_END;
364         }
365 
366         /**
367          * Append the provided content.
368          * see https://drafts.csswg.org/css-text-3/#white-space
369          *
370          * @param content the content to add
371          * @param mode the {@link Mode}
372          */
373         public void append(final String content, final Mode mode) {
374             if (content == null) {
375                 return;
376             }
377             int length = content.length();
378             if (length == 0) {
379                 return;
380             }
381 
382             if (mode == Mode.PLAIN) {
383                 builder_.append(content);
384                 state_ = State.DEFAULT;
385                 trimRightPos_ = builder_.length();
386                 return;
387             }
388 
389             length--;
390             int i = -1;
391             for (char c : content.toCharArray()) {
392                 i++;
393 
394                 // handle \r
395                 if (c == '\r') {
396                     if (length != i) {
397                         continue;
398                     }
399                     c = '\n';
400                 }
401 
402                 if (c == '\n') {
403                     if (mode == Mode.WHITE_SPACE_PRE) {
404                         switch (state_) {
405                             case EMPTY:
406                             case BLOCK_SEPARATOR_AT_END:
407                                 break;
408                             default:
409                                 builder_.append('\n');
410                                 state_ = State.NEWLINE_AT_END;
411                                 trimRightPos_ = builder_.length();
412                                 break;
413                         }
414                         continue;
415                     }
416 
417                     if (mode == Mode.WHITE_SPACE_PRE_LINE) {
418                         switch (state_) {
419                             case EMPTY:
420                             case BLOCK_SEPARATOR_AT_END:
421                                 break;
422                             case BLANK_AT_END:
423                                 builder_.setLength(trimRightPos_);
424                                 builder_.append('\n');
425                                 state_ = State.NEWLINE_AT_END;
426                                 trimRightPos_ = builder_.length();
427                                 break;
428                             default:
429                                 builder_.append('\n');
430                                 state_ = State.NEWLINE_AT_END;
431                                 trimRightPos_ = builder_.length();
432                                 break;
433                         }
434                         continue;
435                     }
436 
437                     switch (state_) {
438                         case EMPTY:
439                         case BLANK_AT_END:
440                         case BLANK_AT_END_AFTER_NEWLINE:
441                         case BLOCK_SEPARATOR_AT_END:
442                         case NEWLINE_AT_END:
443                         case BREAK_AT_END:
444                         case REQUIRED_LINE_BREAK_AT_END:
445                             break;
446                         default:
447                             builder_.append(' ');
448                             state_ = State.BLANK_AT_END;
449                             break;
450                     }
451                     continue;
452                 }
453 
454                 if (c == ' ' || c == '\t' || c == '\f') {
455                     if (mode == Mode.WHITE_SPACE_PRE) {
456                         if (c == '\t') {
457                             builder_.append('\t');
458                         }
459                         else {
460                             builder_.append(' ');
461                         }
462                         state_ = State.BLANK_AT_END;
463                         trimRightPos_ = builder_.length();
464 
465                         continue;
466                     }
467 
468                     if (mode == Mode.WHITE_SPACE_PRE_LINE) {
469                         switch (state_) {
470                             case EMPTY:
471                             case BLANK_AT_END:
472                             case BLANK_AT_END_AFTER_NEWLINE:
473                             case BREAK_AT_END:
474                             case NEWLINE_AT_END:
475                                 break;
476                             default:
477                                 builder_.append(' ');
478                                 state_ = State.BLANK_AT_END;
479                                 break;
480                         }
481                         continue;
482                     }
483 
484                     switch (state_) {
485                         case EMPTY:
486                         case BLANK_AT_END:
487                         case BLANK_AT_END_AFTER_NEWLINE:
488                         case BLOCK_SEPARATOR_AT_END:
489                         case NEWLINE_AT_END:
490                         case BREAK_AT_END:
491                         case REQUIRED_LINE_BREAK_AT_END:
492                             break;
493                         default:
494                             builder_.append(' ');
495                             state_ = State.BLANK_AT_END;
496                             break;
497                     }
498                     continue;
499                 }
500 
501                 builder_.append(c);
502                 state_ = State.DEFAULT;
503                 trimRightPos_ = builder_.length();
504             }
505         }
506 
507         /**
508          * @return the constructed text.
509          */
510         public String getText() {
511             return builder_.substring(0, trimRightPos_);
512         }
513     }
514 }