1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.Serializable;
20 import java.nio.charset.StandardCharsets;
21 import java.util.Locale;
22
23 import org.apache.commons.lang3.ArrayUtils;
24 import org.htmlunit.html.DomElement;
25 import org.htmlunit.html.Html;
26 import org.htmlunit.html.HtmlPage;
27 import org.htmlunit.html.XHtmlPage;
28 import org.htmlunit.html.parser.HTMLParser;
29 import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser;
30 import org.htmlunit.util.MimeType;
31 import org.htmlunit.util.StringUtils;
32 import org.htmlunit.xml.XmlPage;
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81 public class DefaultPageCreator implements PageCreator, Serializable {
82
83 private static final byte[] MARKER_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
84 private static final byte[] MARKER_UTF16BE = {(byte) 0xfe, (byte) 0xff};
85 private static final byte[] MARKER_UTF16LE = {(byte) 0xff, (byte) 0xfe};
86
87
88
89
90
91 private static final String[] HTML_PATTERNS = {"!DOCTYPE HTML", "HTML", "HEAD", "SCRIPT",
92 "IFRAME", "H1", "DIV", "FONT", "TABLE", "A", "STYLE", "TITLE", "B", "BODY", "BR", "P", "!--" };
93
94 private static final HTMLParser HTML_PARSER = new HtmlUnitNekoHtmlParser();
95
96
97
98
99 public enum PageType {
100
101 HTML,
102
103 JAVASCRIPT,
104
105 XML,
106
107 TEXT,
108
109 UNKNOWN
110 }
111
112
113
114
115
116
117 public static PageType determinePageType(final String contentType) {
118 if (null == contentType) {
119 return PageType.UNKNOWN;
120 }
121
122 final String contentTypeLC = StringUtils.toRootLowerCase(contentType);
123
124 if (MimeType.isJavascriptMimeType(contentTypeLC)) {
125 return PageType.JAVASCRIPT;
126 }
127 switch (contentTypeLC) {
128 case MimeType.TEXT_HTML:
129 case "image/svg+xml":
130 return PageType.HTML;
131
132 case MimeType.TEXT_XML:
133 case MimeType.APPLICATION_XML:
134 case "text/vnd.wap.wml":
135 return PageType.XML;
136
137 default:
138 if (contentTypeLC.endsWith("+xml")) {
139 return PageType.XML;
140 }
141
142 if (contentTypeLC.startsWith("text/")) {
143 return PageType.TEXT;
144 }
145
146 return PageType.UNKNOWN;
147 }
148 }
149
150
151
152
153
154
155
156 public static PageType determinePageType(final WebResponse webResponse) throws IOException {
157 final String contentType = webResponse.getContentType();
158 if (!StringUtils.isEmptyOrNull(contentType)) {
159 return determinePageType(contentType);
160 }
161
162
163 try (InputStream contentAsStream = webResponse.getContentAsStream()) {
164 final byte[] bytes = read(contentAsStream, 512);
165 if (bytes.length == 0) {
166 return determinePageType(MimeType.TEXT_PLAIN);
167 }
168
169
170
171 if (startsWith(bytes, MARKER_UTF8) || startsWith(bytes, MARKER_UTF16BE)
172 || startsWith(bytes, MARKER_UTF16LE)) {
173 return determinePageType(MimeType.TEXT_PLAIN);
174 }
175
176 if (isBinary(bytes)) {
177 return determinePageType(MimeType.APPLICATION_OCTET_STREAM);
178 }
179
180 final String asAsciiString = new String(bytes, StandardCharsets.US_ASCII).trim().toUpperCase(Locale.ROOT);
181
182 if (asAsciiString.startsWith("<?XML")) {
183 return determinePageType(MimeType.TEXT_XML);
184 }
185
186 for (final String htmlPattern : HTML_PATTERNS) {
187 try {
188 if ('<' == asAsciiString.charAt(0)) {
189 if (asAsciiString.startsWith(htmlPattern, 1)) {
190 final char spaceOrBracket = asAsciiString.charAt(htmlPattern.length() + 1);
191 if (' ' == spaceOrBracket || '>' == spaceOrBracket) {
192 return determinePageType(MimeType.TEXT_HTML);
193 }
194 }
195 }
196 }
197 catch (final ArrayIndexOutOfBoundsException ignored) {
198
199 }
200 }
201 }
202 return determinePageType(MimeType.TEXT_PLAIN);
203 }
204
205
206
207
208
209
210
211
212
213 @Override
214 public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
215 final PageType pageType = determinePageType(webResponse);
216 switch (pageType) {
217 case HTML:
218 return createHtmlPage(webResponse, webWindow);
219
220 case JAVASCRIPT:
221 return createHtmlPage(webResponse, webWindow);
222
223 case XML:
224 final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
225 final DomElement doc = sgmlPage.getDocumentElement();
226 if (doc != null && Html.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
227 return createXHtmlPage(webResponse, webWindow);
228 }
229 return sgmlPage;
230
231 case TEXT:
232 return createTextPage(webResponse, webWindow);
233
234 default:
235 return createUnexpectedPage(webResponse, webWindow);
236 }
237 }
238
239
240
241
242 @Override
243 public HTMLParser getHtmlParser() {
244 return HTML_PARSER;
245 }
246
247
248
249
250
251
252 private static boolean isBinary(final byte[] bytes) {
253 for (final byte b : bytes) {
254 if ((b >= 0x00 && b < 0x08)
255 || b == 0x0B
256 || (b >= 0x0E && b <= 0x1A)
257 || (b >= 0x1C && b <= 0x1F)) {
258 return true;
259 }
260 }
261 return false;
262 }
263
264 private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
265 if (bytes.length < lookFor.length) {
266 return false;
267 }
268
269 for (int i = 0; i < lookFor.length; i++) {
270 if (bytes[i] != lookFor[i]) {
271 return false;
272 }
273 }
274
275 return true;
276 }
277
278 private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
279 final byte[] buffer = new byte[maxNb];
280 final int nbRead = stream.read(buffer);
281 if (nbRead == buffer.length) {
282 return buffer;
283 }
284 return ArrayUtils.subarray(buffer, 0, nbRead);
285 }
286
287
288
289
290
291
292
293
294
295 protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
296 final HtmlPage page = new HtmlPage(webResponse, webWindow);
297 webWindow.setEnclosedPage(page);
298
299 HTML_PARSER.parse(webWindow.getWebClient(), webResponse, page, false, false);
300 return page;
301 }
302
303
304
305
306
307
308
309
310
311 protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
312 final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
313 webWindow.setEnclosedPage(page);
314
315 HTML_PARSER.parse(webWindow.getWebClient(), webResponse, page, true, false);
316 return page;
317 }
318
319
320
321
322
323
324
325
326 protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
327 final TextPage newPage = new TextPage(webResponse, webWindow);
328 webWindow.setEnclosedPage(newPage);
329 return newPage;
330 }
331
332
333
334
335
336
337
338
339 protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
340 final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
341 webWindow.setEnclosedPage(newPage);
342 return newPage;
343 }
344
345
346
347
348
349
350
351
352
353 protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
354 final SgmlPage page = new XmlPage(webResponse, webWindow);
355 webWindow.setEnclosedPage(page);
356 return page;
357 }
358
359 }