1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.util;
16
17 import static java.nio.charset.StandardCharsets.US_ASCII;
18 import static java.nio.charset.StandardCharsets.UTF_16BE;
19 import static java.nio.charset.StandardCharsets.UTF_16LE;
20 import static java.nio.charset.StandardCharsets.UTF_8;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.Charset;
25 import java.nio.charset.IllegalCharsetNameException;
26 import java.nio.charset.UnsupportedCharsetException;
27 import java.util.Arrays;
28 import java.util.List;
29 import java.util.Locale;
30
31 import org.apache.commons.io.ByteOrderMark;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.commons.lang3.ArrayUtils;
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.htmlunit.HttpHeader;
37 import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
38
39
40
41
42
43
44
45
46
47
48
49 public final class EncodingSniffer {
50
51
52 private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);
53
54
55 private static final byte[][] COMMENT_START = {
56 new byte[] {'<'},
57 new byte[] {'!'},
58 new byte[] {'-'},
59 new byte[] {'-'}
60 };
61
62
63 private static final byte[][] META_START = {
64 new byte[] {'<'},
65 new byte[] {'m', 'M'},
66 new byte[] {'e', 'E'},
67 new byte[] {'t', 'T'},
68 new byte[] {'a', 'A'},
69 new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
70 };
71
72
73 private static final byte[][] OTHER_START = {
74 new byte[] {'<'},
75 new byte[] {'!', '/', '?'}
76 };
77
78
79 private static final byte[][] CHARSET_START = {
80 new byte[] {'c', 'C'},
81 new byte[] {'h', 'H'},
82 new byte[] {'a', 'A'},
83 new byte[] {'r', 'R'},
84 new byte[] {'s', 'S'},
85 new byte[] {'e', 'E'},
86 new byte[] {'t', 'T'}
87 };
88
89 private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};
90 private static final byte[] COMMENT_END = {'-', '-', '>'};
91
92 private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);
93
94 private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);
95
96
97
98
99 private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;
100
101
102
103
104
105 private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;
106
107 private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;
108
109
110
111
112 private EncodingSniffer() {
113
114 }
115
116
117
118
119
120
121
122
123
124
125 static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
126 for (final NameValuePair pair : headers) {
127 final String name = pair.getName();
128 if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
129 String value = pair.getValue();
130 final int i = value.indexOf(';');
131 if (i != -1) {
132 value = value.substring(0, i);
133 }
134 value = value.trim().toLowerCase(Locale.ROOT);
135 for (final String ending : contentTypeEndings) {
136 if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
137 return true;
138 }
139 }
140 return false;
141 }
142 }
143 return false;
144 }
145
146
147
148
149
150
151
152
153
154 static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
155 if (bytes == null) {
156 return null;
157 }
158
159 Charset encoding = null;
160 if (startsWith(bytes, ByteOrderMark.UTF_8)) {
161 encoding = UTF_8;
162 }
163 else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
164 encoding = UTF_16BE;
165 }
166 else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
167 encoding = UTF_16LE;
168 }
169
170 if (encoding != null && LOG.isDebugEnabled()) {
171 LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
172 }
173 return encoding;
174 }
175
176
177
178
179
180
181
182 private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
183 final byte[] bomBytes = bom.getBytes();
184 final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
185 return Arrays.equals(firstBytes, bomBytes);
186 }
187
188
189
190
191
192
193
194
195
196 public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
197 final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
198 for (int i = 0; i < bytes.length; i++) {
199 if (matches(bytes, i, COMMENT_START)) {
200 i = indexOfSubArray(bytes, COMMENT_END, i);
201 if (i == -1) {
202 break;
203 }
204 i += 2;
205 }
206 else if (matches(bytes, i, META_START)) {
207 i += META_START.length;
208 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
209 i = att.getUpdatedIndex();
210 final String name = att.getName().toLowerCase(Locale.ROOT);
211 final String value = att.getValue().toLowerCase(Locale.ROOT);
212 if ("charset".equals(name) || "content".equals(name)) {
213 Charset charset = null;
214 if ("charset".equals(name)) {
215 charset = toCharset(value);
216
217 if (charset == null && "x-user-defined".equals(value)) {
218 charset = Charset.forName("windows-1252");
219 }
220 }
221 else if ("content".equals(name)) {
222 charset = extractEncodingFromContentType(value);
223
224 if (charset == null && value != null && value.contains("x-user-defined")) {
225 charset = Charset.forName("windows-1252");
226 }
227 if (charset == null) {
228 continue;
229 }
230 }
231 if (UTF_16BE == charset || UTF_16LE == charset) {
232 charset = UTF_8;
233 }
234 if (charset != null) {
235 if (LOG.isDebugEnabled()) {
236 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
237 }
238 return charset;
239 }
240 }
241 }
242 }
243 else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
244 i = skipToAnyOf(bytes, i, WHITESPACE);
245 if (i == -1) {
246 break;
247 }
248 Attribute att = getAttribute(bytes, i);
249 while (att != null) {
250 i = att.getUpdatedIndex();
251 att = getAttribute(bytes, i);
252 }
253 }
254 else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
255 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
256 if (i == -1) {
257 break;
258 }
259 Attribute attribute = getAttribute(bytes, i);
260 while (attribute != null) {
261 i = attribute.getUpdatedIndex();
262 attribute = getAttribute(bytes, i);
263 }
264 }
265 else if (matches(bytes, i, OTHER_START)) {
266 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
267 if (i == -1) {
268 break;
269 }
270 }
271 }
272 return null;
273 }
274
275
276
277
278
279
280
281
282
283
284 static Attribute getAttribute(final byte[] bytes, final int startFrom) {
285 if (startFrom >= bytes.length) {
286 return null;
287 }
288
289 int pos = startFrom;
290 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
291 pos++;
292 if (pos >= bytes.length) {
293 return null;
294 }
295 }
296 if (bytes[pos] == '>') {
297 return null;
298 }
299 final StringBuilder name = new StringBuilder();
300 final StringBuilder value = new StringBuilder();
301 for ( ;; pos++) {
302 if (pos >= bytes.length) {
303 return new Attribute(name.toString(), value.toString(), pos);
304 }
305 if (bytes[pos] == '=' && name.length() != 0) {
306 pos++;
307 break;
308 }
309 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
310 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311 pos++;
312 if (pos >= bytes.length) {
313 return new Attribute(name.toString(), value.toString(), pos);
314 }
315 }
316 if (bytes[pos] != '=') {
317 return new Attribute(name.toString(), value.toString(), pos);
318 }
319 pos++;
320 break;
321 }
322 if (bytes[pos] == '/' || bytes[pos] == '>') {
323 return new Attribute(name.toString(), value.toString(), pos);
324 }
325 name.append((char) bytes[pos]);
326 }
327 if (pos >= bytes.length) {
328 return new Attribute(name.toString(), value.toString(), pos);
329 }
330 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
331 pos++;
332 if (pos >= bytes.length) {
333 return new Attribute(name.toString(), value.toString(), pos);
334 }
335 }
336 if (bytes[pos] == '"' || bytes[pos] == '\'') {
337 final byte b = bytes[pos];
338 for (pos++; pos < bytes.length; pos++) {
339 if (bytes[pos] == b) {
340 pos++;
341 return new Attribute(name.toString(), value.toString(), pos);
342 }
343 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
344 final byte b2 = (byte) (bytes[pos] + 0x20);
345 value.append((char) b2);
346 }
347 else {
348 value.append((char) bytes[pos]);
349 }
350 }
351 return new Attribute(name.toString(), value.toString(), pos);
352 }
353 else if (bytes[pos] == '>') {
354 return new Attribute(name.toString(), value.toString(), pos);
355 }
356 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
357 final byte b = (byte) (bytes[pos] + 0x20);
358 value.append((char) b);
359 pos++;
360 }
361 else {
362 value.append((char) bytes[pos]);
363 pos++;
364 }
365 for ( ; pos < bytes.length; pos++) {
366 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
367 return new Attribute(name.toString(), value.toString(), pos);
368 }
369 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
370 final byte b = (byte) (bytes[pos] + 0x20);
371 value.append((char) b);
372 }
373 else {
374 value.append((char) bytes[pos]);
375 }
376 }
377 return new Attribute(name.toString(), value.toString(), pos);
378 }
379
380
381
382
383
384
385
386
387
388
389 public static Charset extractEncodingFromContentType(final String s) {
390 if (s == null) {
391 return null;
392 }
393 final byte[] bytes = s.getBytes(US_ASCII);
394 int i;
395 for (i = 0; i < bytes.length; i++) {
396 if (matches(bytes, i, CHARSET_START)) {
397 i += CHARSET_START.length;
398 break;
399 }
400 }
401 if (i == bytes.length) {
402 return null;
403 }
404 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
405 i++;
406 if (i == bytes.length) {
407 return null;
408 }
409 }
410 if (bytes[i] != '=') {
411 return null;
412 }
413 do {
414 i++;
415 if (i == bytes.length) {
416 return null;
417 }
418 }
419 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20);
420
421 if (bytes[i] == '"') {
422 if (bytes.length <= i + 1) {
423 return null;
424 }
425 final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
426 if (index == -1) {
427 return null;
428 }
429 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
430 return toCharset(charsetName);
431 }
432 if (bytes[i] == '\'') {
433 if (bytes.length <= i + 1) {
434 return null;
435 }
436 final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
437 if (index == -1) {
438 return null;
439 }
440 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
441 return toCharset(charsetName);
442 }
443 int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
444 if (end == -1) {
445 end = bytes.length;
446 }
447 final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
448 return toCharset(charsetName);
449 }
450
451
452
453
454
455
456
457
458
459 public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
460 final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
461 Charset encoding = null;
462 if (bytes.length > 5
463 && XML_DECLARATION_PREFIX[0] == bytes[0]
464 && XML_DECLARATION_PREFIX[1] == bytes[1]
465 && XML_DECLARATION_PREFIX[2] == bytes[2]
466 && XML_DECLARATION_PREFIX[3] == bytes[3]
467 && XML_DECLARATION_PREFIX[4] == bytes[4]
468 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
469 final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
470 if (index + 1 < bytes.length && bytes[index + 1] == '>') {
471 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
472 int start = declaration.indexOf("encoding");
473 if (start != -1) {
474 start += 8;
475 final char delimiter;
476 outer:
477 while (true) {
478 switch (declaration.charAt(start)) {
479 case '"':
480 case '\'':
481 delimiter = declaration.charAt(start);
482 start = start + 1;
483 break outer;
484
485 default:
486 start++;
487 }
488 }
489 final int end = declaration.indexOf(delimiter, start);
490 encoding = toCharset(declaration.substring(start, end));
491 }
492 }
493 }
494 if (encoding != null && LOG.isDebugEnabled()) {
495 LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
496 }
497 return encoding;
498 }
499
500
501
502
503
504
505
506
507
508 public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
509 final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
510 if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
511 return null;
512 }
513 for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
514 if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
515 return null;
516 }
517 }
518
519 Charset encoding = null;
520 final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
521 if (index + 1 < bytes.length && bytes[index + 1] == ';') {
522 encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
523
524 if (encoding == UTF_16BE || encoding == UTF_16LE) {
525 encoding = UTF_8;
526 }
527 }
528 return encoding;
529 }
530
531
532
533
534
535
536
537 public static Charset toCharset(final String charsetName) {
538 final String nameFromLabel = translateEncodingLabel(charsetName);
539 if (nameFromLabel == null) {
540 return null;
541 }
542 try {
543 return Charset.forName(nameFromLabel);
544 }
545 catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
546 return null;
547 }
548 }
549
550
551
552
553
554
555
556
557
558
559
560 static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
561 if (i + sought.length > bytes.length) {
562 return false;
563 }
564 for (int x = 0; x < sought.length; x++) {
565 final byte[] possibilities = sought[x];
566 boolean match = false;
567 for (final byte possibility : possibilities) {
568 if (bytes[i + x] == possibility) {
569 match = true;
570 break;
571 }
572 }
573 if (!match) {
574 return false;
575 }
576 }
577 return true;
578 }
579
580
581
582
583
584
585
586
587
588
589 static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
590 int i = startFrom;
591 for ( ; i < bytes.length; i++) {
592 if (ArrayUtils.contains(targets, bytes[i])) {
593 break;
594 }
595 }
596 if (i == bytes.length) {
597 i = -1;
598 }
599 return i;
600 }
601
602
603
604
605
606
607
608
609
610
611 static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
612 for (int i = startIndex; i < array.length; i++) {
613 boolean found = true;
614 if (i + subarray.length > array.length) {
615 break;
616 }
617 for (int j = 0; j < subarray.length; j++) {
618 final byte a = array[i + j];
619 final byte b = subarray[j];
620 if (a != b) {
621 found = false;
622 break;
623 }
624 }
625 if (found) {
626 return i;
627 }
628 }
629 return -1;
630 }
631
632
633
634
635
636
637
638
639
640
641
642 static byte[] read(final InputStream content, final int size) throws IOException {
643 byte[] bytes = new byte[size];
644
645
646 final int count = IOUtils.read(content, bytes);
647 if (count < size) {
648 final byte[] smaller = new byte[count];
649 System.arraycopy(bytes, 0, smaller, 0, count);
650 bytes = smaller;
651 }
652 return bytes;
653 }
654
655
656
657
658
659
660
661
662
663
664
665
666
667 static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
668 final int prefixLength = prefix.length;
669 final byte[] joined = new byte[prefixLength + size];
670
671
672
673 final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
674 if (count < size) {
675 final byte[] smaller = new byte[prefixLength + count];
676 System.arraycopy(prefix, 0, smaller, 0, prefix.length);
677 System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
678 return smaller;
679 }
680
681 System.arraycopy(prefix, 0, joined, 0, prefix.length);
682 return joined;
683 }
684
685 static class Attribute {
686 private final String name_;
687 private final String value_;
688 private final int updatedIndex_;
689 Attribute(final String name, final String value, final int updatedIndex) {
690 name_ = name;
691 value_ = value;
692 updatedIndex_ = updatedIndex;
693 }
694 String getName() {
695 return name_;
696 }
697 String getValue() {
698 return value_;
699 }
700 int getUpdatedIndex() {
701 return updatedIndex_;
702 }
703 }
704
705
706
707
708
709
710
711 public static String translateEncodingLabel(final String encodingLabel) {
712 if (StringUtils.isEmptyOrNull(encodingLabel)) {
713 return null;
714 }
715
716 final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
717 final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
718 if (encLC.equals(enc)) {
719 return encodingLabel;
720 }
721 return enc;
722 }
723 }