1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.serializer;
16
17 import static org.htmlunit.BrowserVersionFeatures.JS_INNER_TEXT_SVG_NL;
18
19 import org.htmlunit.BrowserVersion;
20 import org.htmlunit.SgmlPage;
21 import org.htmlunit.WebWindow;
22 import org.htmlunit.css.ComputedCssStyleDeclaration;
23 import org.htmlunit.css.StyleAttributes.Definition;
24 import org.htmlunit.html.DomElement;
25 import org.htmlunit.html.DomNode;
26 import org.htmlunit.html.DomText;
27 import org.htmlunit.html.HtmlBreak;
28 import org.htmlunit.html.HtmlDefinitionTerm;
29 import org.htmlunit.html.HtmlDetails;
30 import org.htmlunit.html.HtmlHead;
31 import org.htmlunit.html.HtmlListItem;
32 import org.htmlunit.html.HtmlNoFrames;
33 import org.htmlunit.html.HtmlParagraph;
34 import org.htmlunit.html.HtmlScript;
35 import org.htmlunit.html.HtmlStyle;
36 import org.htmlunit.html.HtmlSummary;
37 import org.htmlunit.html.HtmlSvg;
38 import org.htmlunit.html.HtmlTextArea;
39 import org.htmlunit.html.HtmlTitle;
40 import org.htmlunit.html.ScriptElement;
41 import org.htmlunit.html.serializer.HtmlSerializerInnerOuterText.HtmlSerializerTextBuilder.Mode;
42 import org.htmlunit.svg.SvgTitle;
43 import org.htmlunit.util.StringUtils;
44
45
46
47
48
49
50
51 public class HtmlSerializerInnerOuterText {
52
53 private final BrowserVersion browserVersion_;
54
55
56
57
58
59
/**
 * Creates a serializer bound to the given browser version.
 *
 * @param browserVersion the browser version whose feature flags are
 *        consulted while serializing (see {@code appendNode})
 */
public HtmlSerializerInnerOuterText(final BrowserVersion browserVersion) {
    // the implicit super() call is sufficient here
    browserVersion_ = browserVersion;
}
64
65
66
67
68
69
70 public String asText(final DomNode node) {
71 if (node instanceof HtmlBreak) {
72 return "";
73 }
74
75
76 if (node instanceof ScriptElement) {
77 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
78 appendChildren(builder, node, Mode.WHITE_SPACE_NORMAL, false);
79 return builder.getText();
80 }
81
82
83 final boolean insideHead = node instanceof HtmlTitle;
84
85 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
86 appendNode(builder, node, whiteSpaceStyle(node, Mode.WHITE_SPACE_NORMAL), insideHead);
87 return builder.getText();
88 }
89
90
91
92
93
94
95
96
97
98 protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node,
99 final Mode mode, final boolean insideHead) {
100 for (final DomNode child : node.getChildren()) {
101 appendNode(builder, child, mode, insideHead);
102 }
103 }
104
105
106
107
108
109
110
111
112
113
114 protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node,
115 final Mode mode, final boolean insideHead) {
116 if (node instanceof DomText text) {
117 appendText(builder, text, mode);
118 }
119 else if (node instanceof HtmlBreak break1) {
120 appendBreak(builder, break1);
121 }
122 else if (node instanceof HtmlParagraph paragraph) {
123 appendParagraph(builder, paragraph, mode, insideHead);
124 }
125 else if (node instanceof HtmlListItem item) {
126 appendListItem(builder, item, mode, insideHead);
127 }
128 else if (node instanceof HtmlDetails details) {
129 appendDetails(builder, details, mode, insideHead);
130 }
131 else if (node instanceof HtmlHead) {
132 appendChildren(builder, node, mode, true);
133 }
134 else if (node instanceof HtmlNoFrames) {
135 appendChildren(builder, node, Mode.PLAIN, insideHead);
136 }
137 else if (node instanceof HtmlTitle && !insideHead) {
138
139 }
140 else if (node instanceof HtmlTextArea) {
141
142 }
143 else if (node instanceof ScriptElement) {
144 if (insideHead) {
145 appendChildren(builder, node, mode, insideHead);
146 }
147 }
148 else if (node instanceof HtmlDefinitionTerm item) {
149 appendDefinitionTerm(builder, item, mode, insideHead);
150 }
151 else if (node instanceof HtmlSvg) {
152 if (browserVersion_.hasFeature(JS_INNER_TEXT_SVG_NL)) {
153 builder.appendRequiredLineBreak();
154 appendChildren(builder, node, mode, insideHead);
155 builder.appendRequiredLineBreak();
156 }
157 else {
158 appendChildren(builder, node, mode, insideHead);
159 }
160 }
161 else if (node instanceof SvgTitle) {
162
163 }
164 else {
165 appendChildren(builder, node, mode, insideHead);
166 }
167 }
168
169
170
171
172
173
174
175
176 protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText, final Mode mode) {
177 final DomNode parent = domText.getParentNode();
178 if (parent instanceof HtmlTitle
179 || parent instanceof HtmlStyle
180 || parent instanceof HtmlScript) {
181 builder.append(domText.getData(), Mode.PLAIN);
182 return;
183 }
184
185 if (parent == null
186 || parent instanceof HtmlNoFrames
187 || parent.isDisplayed()) {
188 builder.append(domText.getData(), mode);
189 }
190 }
191
192
193
194
195
196
197
198 protected void appendBreak(final HtmlSerializerTextBuilder builder,
199 final HtmlBreak htmlBreak) {
200 builder.appendRequiredLineBreak();
201 }
202
203
204
205
206
207
208
209
210
211 protected void appendParagraph(final HtmlSerializerTextBuilder builder,
212 final HtmlParagraph htmlParagraph, final Mode mode, final boolean insideHead) {
213 builder.appendRequiredLineBreak();
214 appendChildren(builder, htmlParagraph, mode, insideHead);
215 builder.appendRequiredLineBreak();
216 }
217
218
219
220
221
222
223
224
225
226 protected void appendListItem(final HtmlSerializerTextBuilder builder,
227 final HtmlListItem htmlListItem, final Mode mode, final boolean insideHead) {
228 builder.appendRequiredLineBreak();
229 appendChildren(builder, htmlListItem, mode, insideHead);
230 builder.appendRequiredLineBreak();
231 }
232
233
234
235
236
237
238
239
240 protected void appendDetails(final HtmlSerializerTextBuilder builder,
241 final HtmlDetails htmlDetails, final Mode mode, final boolean insideHead) {
242 if (htmlDetails.isOpen()) {
243 appendChildren(builder, htmlDetails, mode, insideHead);
244 return;
245 }
246
247 for (final DomNode child : htmlDetails.getChildren()) {
248 if (child instanceof HtmlSummary) {
249 appendNode(builder, child, mode, insideHead);
250 }
251 }
252 }
253
254
255
256
257
258
259
260
261
262 protected void appendDefinitionTerm(final HtmlSerializerTextBuilder builder,
263 final HtmlDefinitionTerm htmlDefinitionTerm, final Mode mode, final boolean insideHead) {
264 builder.appendRequiredLineBreak();
265 appendChildren(builder, htmlDefinitionTerm, mode, insideHead);
266 builder.appendRequiredLineBreak();
267 }
268
269 private static Mode whiteSpaceStyle(final DomNode domNode, final Mode defaultMode) {
270 if (domNode instanceof DomElement) {
271 final SgmlPage page = domNode.getPage();
272 if (page != null) {
273 if (page.getWebClient().getOptions().isCssEnabled()) {
274 DomNode node = domNode;
275 while (node != null) {
276 if (node instanceof DomElement) {
277 final WebWindow window = page.getEnclosingWindow();
278 if (window != null) {
279 final ComputedCssStyleDeclaration style =
280 window.getComputedStyle((DomElement) domNode, null);
281 final String value = style.getStyleAttribute(Definition.WHITE_SPACE, false);
282
283 if (!StringUtils.isEmptyOrNull(value)) {
284 if ("normal".equalsIgnoreCase(value)) {
285 return Mode.WHITE_SPACE_NORMAL;
286 }
287 if ("nowrap".equalsIgnoreCase(value)) {
288 return Mode.WHITE_SPACE_NORMAL;
289 }
290 if ("pre".equalsIgnoreCase(value)) {
291 return Mode.WHITE_SPACE_PRE;
292 }
293 if ("pre-wrap".equalsIgnoreCase(value)) {
294 return Mode.WHITE_SPACE_PRE;
295 }
296 if ("pre-line".equalsIgnoreCase(value)) {
297 return Mode.WHITE_SPACE_PRE_LINE;
298 }
299 }
300 }
301 }
302 node = node.getParentNode();
303 }
304 }
305 }
306 }
307 return defaultMode;
308 }
309
310
311
312
313 protected static class HtmlSerializerTextBuilder {
314
315
316 protected enum Mode {
317
318
319
320 PLAIN,
321
322
323
324
325
326
327 WHITE_SPACE_NORMAL,
328
329
330
331
332
333 WHITE_SPACE_PRE,
334
335
336
337
338
339
340 WHITE_SPACE_PRE_LINE
341 }
342
343 private enum State {
344 DEFAULT,
345 EMPTY,
346 BLANK_AT_END,
347 BLANK_AT_END_AFTER_NEWLINE,
348 NEWLINE_AT_END,
349 BREAK_AT_END,
350 BLOCK_SEPARATOR_AT_END,
351 REQUIRED_LINE_BREAK_AT_END
352 }
353
354 private State state_;
355 private final StringBuilder builder_;
356 private int trimRightPos_;
357
358
359
360
/**
 * Creates an empty builder: no content, state {@code EMPTY}, nothing to trim.
 */
public HtmlSerializerTextBuilder() {
    state_ = State.EMPTY;
    trimRightPos_ = 0;
    builder_ = new StringBuilder();
}
366
367
368
369
370 public void appendRequiredLineBreak() {
371 if (state_ == State.EMPTY) {
372 return;
373 }
374
375
376 builder_.setLength(trimRightPos_);
377 if (trimRightPos_ == 0) {
378 state_ = State.EMPTY;
379 }
380
381 builder_.append('\n');
382 state_ = State.REQUIRED_LINE_BREAK_AT_END;
383 }
384
385
386
387
388
389
390
391
392 public void append(final String content, final Mode mode) {
393 if (content == null) {
394 return;
395 }
396 int length = content.length();
397 if (length == 0) {
398 return;
399 }
400
401 if (mode == Mode.PLAIN) {
402 builder_.append(content);
403 state_ = State.DEFAULT;
404 trimRightPos_ = builder_.length();
405 return;
406 }
407
408 length--;
409 final int contentLength = content.length();
410 for (int i = 0; i < contentLength; i++) {
411 char c = content.charAt(i);
412
413
414 if (c == '\r') {
415 if (length != i) {
416 continue;
417 }
418 c = '\n';
419 }
420
421 if (c == '\n') {
422 if (mode == Mode.WHITE_SPACE_PRE) {
423 switch (state_) {
424 case EMPTY:
425 case BLOCK_SEPARATOR_AT_END:
426 break;
427 default:
428 builder_.append('\n');
429 state_ = State.NEWLINE_AT_END;
430 trimRightPos_ = builder_.length();
431 break;
432 }
433 continue;
434 }
435
436 if (mode == Mode.WHITE_SPACE_PRE_LINE) {
437 switch (state_) {
438 case EMPTY:
439 case BLOCK_SEPARATOR_AT_END:
440 break;
441 case BLANK_AT_END:
442 builder_.setLength(trimRightPos_);
443 builder_.append('\n');
444 state_ = State.NEWLINE_AT_END;
445 trimRightPos_ = builder_.length();
446 break;
447 default:
448 builder_.append('\n');
449 state_ = State.NEWLINE_AT_END;
450 trimRightPos_ = builder_.length();
451 break;
452 }
453 continue;
454 }
455
456 switch (state_) {
457 case EMPTY:
458 case BLANK_AT_END:
459 case BLANK_AT_END_AFTER_NEWLINE:
460 case BLOCK_SEPARATOR_AT_END:
461 case NEWLINE_AT_END:
462 case BREAK_AT_END:
463 case REQUIRED_LINE_BREAK_AT_END:
464 break;
465 default:
466 builder_.append(' ');
467 state_ = State.BLANK_AT_END;
468 break;
469 }
470 continue;
471 }
472
473 if (c == ' ' || c == '\t' || c == '\f') {
474 if (mode == Mode.WHITE_SPACE_PRE) {
475 if (c == '\t') {
476 builder_.append('\t');
477 }
478 else {
479 builder_.append(' ');
480 }
481 state_ = State.BLANK_AT_END;
482 trimRightPos_ = builder_.length();
483
484 continue;
485 }
486
487 if (mode == Mode.WHITE_SPACE_PRE_LINE) {
488 switch (state_) {
489 case EMPTY:
490 case BLANK_AT_END:
491 case BLANK_AT_END_AFTER_NEWLINE:
492 case BREAK_AT_END:
493 case NEWLINE_AT_END:
494 break;
495 default:
496 builder_.append(' ');
497 state_ = State.BLANK_AT_END;
498 break;
499 }
500 continue;
501 }
502
503 switch (state_) {
504 case EMPTY:
505 case BLANK_AT_END:
506 case BLANK_AT_END_AFTER_NEWLINE:
507 case BLOCK_SEPARATOR_AT_END:
508 case NEWLINE_AT_END:
509 case BREAK_AT_END:
510 case REQUIRED_LINE_BREAK_AT_END:
511 break;
512 default:
513 builder_.append(' ');
514 state_ = State.BLANK_AT_END;
515 break;
516 }
517 continue;
518 }
519
520 builder_.append(c);
521 state_ = State.DEFAULT;
522 trimRightPos_ = builder_.length();
523 }
524 }
525
526
527
528
529 public String getText() {
530 return builder_.substring(0, trimRightPos_);
531 }
532 }
533 }