1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.serializer;
16
17 import static org.htmlunit.BrowserVersionFeatures.JS_INNER_TEXT_SVG_NL;
18
19 import org.apache.commons.lang3.StringUtils;
20 import org.htmlunit.BrowserVersion;
21 import org.htmlunit.SgmlPage;
22 import org.htmlunit.WebWindow;
23 import org.htmlunit.css.ComputedCssStyleDeclaration;
24 import org.htmlunit.css.StyleAttributes.Definition;
25 import org.htmlunit.html.DomElement;
26 import org.htmlunit.html.DomNode;
27 import org.htmlunit.html.DomText;
28 import org.htmlunit.html.HtmlBreak;
29 import org.htmlunit.html.HtmlDetails;
30 import org.htmlunit.html.HtmlHead;
31 import org.htmlunit.html.HtmlListItem;
32 import org.htmlunit.html.HtmlNoFrames;
33 import org.htmlunit.html.HtmlParagraph;
34 import org.htmlunit.html.HtmlScript;
35 import org.htmlunit.html.HtmlStyle;
36 import org.htmlunit.html.HtmlSummary;
37 import org.htmlunit.html.HtmlSvg;
38 import org.htmlunit.html.HtmlTextArea;
39 import org.htmlunit.html.HtmlTitle;
40 import org.htmlunit.html.ScriptElement;
41 import org.htmlunit.html.serializer.HtmlSerializerInnerOuterText.HtmlSerializerTextBuilder.Mode;
42 import org.htmlunit.svg.SvgTitle;
43
44
45
46
47
48
49
50 public class HtmlSerializerInnerOuterText {
51
52 private final BrowserVersion browserVersion_;
53
54
55
56
57
58
59 public HtmlSerializerInnerOuterText(final BrowserVersion browserVersion) {
60 super();
61 browserVersion_ = browserVersion;
62 }
63
64
65
66
67
68
69 public String asText(final DomNode node) {
70 if (node instanceof HtmlBreak) {
71 return "";
72 }
73
74
75 if (node instanceof ScriptElement) {
76 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
77 appendChildren(builder, node, Mode.WHITE_SPACE_NORMAL, false);
78 return builder.getText();
79 }
80
81
82 final boolean insideHead = node instanceof HtmlTitle;
83
84 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
85 appendNode(builder, node, whiteSpaceStyle(node, Mode.WHITE_SPACE_NORMAL), insideHead);
86 return builder.getText();
87 }
88
89
90
91
92
93
94
95
96
97 protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node,
98 final Mode mode, final boolean insideHead) {
99 for (final DomNode child : node.getChildren()) {
100 appendNode(builder, child, mode, insideHead);
101 }
102 }
103
104
105
106
107
108
109
110
111
112
113 protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node,
114 final Mode mode, final boolean insideHead) {
115 if (node instanceof DomText) {
116 appendText(builder, (DomText) node, mode);
117 }
118 else if (node instanceof HtmlBreak) {
119 appendBreak(builder, (HtmlBreak) node);
120 }
121 else if (node instanceof HtmlParagraph) {
122 appendParagraph(builder, (HtmlParagraph) node, mode, insideHead);
123 }
124 else if (node instanceof HtmlListItem) {
125 appendListItem(builder, (HtmlListItem) node, mode, insideHead);
126 }
127 else if (node instanceof HtmlDetails) {
128 appendDetails(builder, (HtmlDetails) node, mode, insideHead);
129 }
130 else if (node instanceof HtmlHead) {
131 appendChildren(builder, node, mode, true);
132 }
133 else if (node instanceof HtmlNoFrames) {
134 appendChildren(builder, node, Mode.PLAIN, insideHead);
135 }
136 else if (node instanceof HtmlTitle && !insideHead) {
137
138 }
139 else if (node instanceof HtmlTextArea) {
140
141 }
142 else if (node instanceof ScriptElement) {
143 if (insideHead) {
144 appendChildren(builder, node, mode, insideHead);
145 }
146 }
147 else if (node instanceof HtmlSvg) {
148 if (browserVersion_.hasFeature(JS_INNER_TEXT_SVG_NL)) {
149 builder.appendRequiredLineBreak();
150 appendChildren(builder, node, mode, insideHead);
151 builder.appendRequiredLineBreak();
152 }
153 else {
154 appendChildren(builder, node, mode, insideHead);
155 }
156 }
157 else if (node instanceof SvgTitle) {
158
159 }
160 else {
161 appendChildren(builder, node, mode, insideHead);
162 }
163 }
164
165
166
167
168
169
170
171
172 protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText, final Mode mode) {
173 final DomNode parent = domText.getParentNode();
174 if (parent instanceof HtmlTitle
175 || parent instanceof HtmlStyle
176 || parent instanceof HtmlScript) {
177 builder.append(domText.getData(), Mode.PLAIN);
178 return;
179 }
180
181 if (parent == null
182 || parent instanceof HtmlNoFrames
183 || parent.isDisplayed()) {
184 builder.append(domText.getData(), mode);
185 }
186 }
187
188
189
190
191
192
193
194 protected void appendBreak(final HtmlSerializerTextBuilder builder,
195 final HtmlBreak htmlBreak) {
196 builder.appendRequiredLineBreak();
197 }
198
199
200
201
202
203
204
205
206
207 protected void appendParagraph(final HtmlSerializerTextBuilder builder,
208 final HtmlParagraph htmlParagraph, final Mode mode, final boolean insideHead) {
209 builder.appendRequiredLineBreak();
210 appendChildren(builder, htmlParagraph, mode, insideHead);
211 builder.appendRequiredLineBreak();
212 }
213
214
215
216
217
218
219
220
221
222 protected void appendListItem(final HtmlSerializerTextBuilder builder,
223 final HtmlListItem htmlListItem, final Mode mode, final boolean insideHead) {
224 builder.appendRequiredLineBreak();
225 appendChildren(builder, htmlListItem, mode, insideHead);
226 builder.appendRequiredLineBreak();
227 }
228
229
230
231
232
233
234
235
236 protected void appendDetails(final HtmlSerializerTextBuilder builder,
237 final HtmlDetails htmlDetails, final Mode mode, final boolean insideHead) {
238 if (htmlDetails.isOpen()) {
239 appendChildren(builder, htmlDetails, mode, insideHead);
240 return;
241 }
242
243 for (final DomNode child : htmlDetails.getChildren()) {
244 if (child instanceof HtmlSummary) {
245 appendNode(builder, child, mode, insideHead);
246 }
247 }
248 }
249
250 private static Mode whiteSpaceStyle(final DomNode domNode, final Mode defaultMode) {
251 if (domNode instanceof DomElement) {
252 final SgmlPage page = domNode.getPage();
253 if (page != null) {
254 if (page.getWebClient().getOptions().isCssEnabled()) {
255 DomNode node = domNode;
256 while (node != null) {
257 if (node instanceof DomElement) {
258 final WebWindow window = page.getEnclosingWindow();
259 if (window != null) {
260 final ComputedCssStyleDeclaration style =
261 window.getComputedStyle((DomElement) domNode, null);
262 final String value = style.getStyleAttribute(Definition.WHITE_SPACE, false);
263
264 if (StringUtils.isNoneEmpty(value)) {
265 if ("normal".equalsIgnoreCase(value)) {
266 return Mode.WHITE_SPACE_NORMAL;
267 }
268 if ("nowrap".equalsIgnoreCase(value)) {
269 return Mode.WHITE_SPACE_NORMAL;
270 }
271 if ("pre".equalsIgnoreCase(value)) {
272 return Mode.WHITE_SPACE_PRE;
273 }
274 if ("pre-wrap".equalsIgnoreCase(value)) {
275 return Mode.WHITE_SPACE_PRE;
276 }
277 if ("pre-line".equalsIgnoreCase(value)) {
278 return Mode.WHITE_SPACE_PRE_LINE;
279 }
280 }
281 }
282 }
283 node = node.getParentNode();
284 }
285 }
286 }
287 }
288 return defaultMode;
289 }
290
291
292
293
294 protected static class HtmlSerializerTextBuilder {
295
296
297 protected enum Mode {
298
299
300
301 PLAIN,
302
303
304
305
306
307
308 WHITE_SPACE_NORMAL,
309
310
311
312
313
314 WHITE_SPACE_PRE,
315
316
317
318
319
320
321 WHITE_SPACE_PRE_LINE
322 }
323
324 private enum State {
325 DEFAULT,
326 EMPTY,
327 BLANK_AT_END,
328 BLANK_AT_END_AFTER_NEWLINE,
329 NEWLINE_AT_END,
330 BREAK_AT_END,
331 BLOCK_SEPARATOR_AT_END,
332 REQUIRED_LINE_BREAK_AT_END
333 }
334
335 private State state_;
336 private final StringBuilder builder_;
337 private int trimRightPos_;
338
339
340
341
342 public HtmlSerializerTextBuilder() {
343 builder_ = new StringBuilder();
344 state_ = State.EMPTY;
345 trimRightPos_ = 0;
346 }
347
348
349
350
351 public void appendRequiredLineBreak() {
352 if (state_ == State.EMPTY) {
353 return;
354 }
355
356
357 builder_.setLength(trimRightPos_);
358 if (trimRightPos_ == 0) {
359 state_ = State.EMPTY;
360 }
361
362 builder_.append('\n');
363 state_ = State.REQUIRED_LINE_BREAK_AT_END;
364 }
365
366
367
368
369
370
371
372
373 public void append(final String content, final Mode mode) {
374 if (content == null) {
375 return;
376 }
377 int length = content.length();
378 if (length == 0) {
379 return;
380 }
381
382 if (mode == Mode.PLAIN) {
383 builder_.append(content);
384 state_ = State.DEFAULT;
385 trimRightPos_ = builder_.length();
386 return;
387 }
388
389 length--;
390 int i = -1;
391 for (char c : content.toCharArray()) {
392 i++;
393
394
395 if (c == '\r') {
396 if (length != i) {
397 continue;
398 }
399 c = '\n';
400 }
401
402 if (c == '\n') {
403 if (mode == Mode.WHITE_SPACE_PRE) {
404 switch (state_) {
405 case EMPTY:
406 case BLOCK_SEPARATOR_AT_END:
407 break;
408 default:
409 builder_.append('\n');
410 state_ = State.NEWLINE_AT_END;
411 trimRightPos_ = builder_.length();
412 break;
413 }
414 continue;
415 }
416
417 if (mode == Mode.WHITE_SPACE_PRE_LINE) {
418 switch (state_) {
419 case EMPTY:
420 case BLOCK_SEPARATOR_AT_END:
421 break;
422 case BLANK_AT_END:
423 builder_.setLength(trimRightPos_);
424 builder_.append('\n');
425 state_ = State.NEWLINE_AT_END;
426 trimRightPos_ = builder_.length();
427 break;
428 default:
429 builder_.append('\n');
430 state_ = State.NEWLINE_AT_END;
431 trimRightPos_ = builder_.length();
432 break;
433 }
434 continue;
435 }
436
437 switch (state_) {
438 case EMPTY:
439 case BLANK_AT_END:
440 case BLANK_AT_END_AFTER_NEWLINE:
441 case BLOCK_SEPARATOR_AT_END:
442 case NEWLINE_AT_END:
443 case BREAK_AT_END:
444 case REQUIRED_LINE_BREAK_AT_END:
445 break;
446 default:
447 builder_.append(' ');
448 state_ = State.BLANK_AT_END;
449 break;
450 }
451 continue;
452 }
453
454 if (c == ' ' || c == '\t' || c == '\f') {
455 if (mode == Mode.WHITE_SPACE_PRE) {
456 if (c == '\t') {
457 builder_.append('\t');
458 }
459 else {
460 builder_.append(' ');
461 }
462 state_ = State.BLANK_AT_END;
463 trimRightPos_ = builder_.length();
464
465 continue;
466 }
467
468 if (mode == Mode.WHITE_SPACE_PRE_LINE) {
469 switch (state_) {
470 case EMPTY:
471 case BLANK_AT_END:
472 case BLANK_AT_END_AFTER_NEWLINE:
473 case BREAK_AT_END:
474 case NEWLINE_AT_END:
475 break;
476 default:
477 builder_.append(' ');
478 state_ = State.BLANK_AT_END;
479 break;
480 }
481 continue;
482 }
483
484 switch (state_) {
485 case EMPTY:
486 case BLANK_AT_END:
487 case BLANK_AT_END_AFTER_NEWLINE:
488 case BLOCK_SEPARATOR_AT_END:
489 case NEWLINE_AT_END:
490 case BREAK_AT_END:
491 case REQUIRED_LINE_BREAK_AT_END:
492 break;
493 default:
494 builder_.append(' ');
495 state_ = State.BLANK_AT_END;
496 break;
497 }
498 continue;
499 }
500
501 builder_.append(c);
502 state_ = State.DEFAULT;
503 trimRightPos_ = builder_.length();
504 }
505 }
506
507
508
509
510 public String getText() {
511 return builder_.substring(0, trimRightPos_);
512 }
513 }
514 }