1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.util;
16
17 import static java.nio.charset.StandardCharsets.US_ASCII;
18 import static java.nio.charset.StandardCharsets.UTF_16BE;
19 import static java.nio.charset.StandardCharsets.UTF_16LE;
20 import static java.nio.charset.StandardCharsets.UTF_8;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.Charset;
25 import java.nio.charset.IllegalCharsetNameException;
26 import java.nio.charset.UnsupportedCharsetException;
27 import java.util.Arrays;
28 import java.util.List;
29 import java.util.Locale;
30
31 import org.apache.commons.io.ByteOrderMark;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.commons.lang3.ArrayUtils;
34 import org.apache.commons.lang3.StringUtils;
35 import org.apache.commons.logging.Log;
36 import org.apache.commons.logging.LogFactory;
37 import org.htmlunit.HttpHeader;
38 import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
39
40
41
42
43
44
45
46
47
48
49
50 public final class EncodingSniffer {
51
52
/** Logger for this class. */
private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);

/**
 * Byte pattern for the start of an HTML comment ({@code <!--}).
 * Each inner array lists the bytes accepted at that position.
 */
private static final byte[][] COMMENT_START = {{'<'}, {'!'}, {'-'}, {'-'}};

/**
 * Byte pattern for the start of a meta tag: {@code <meta} in any letter case,
 * followed by one of tab, LF, FF, CR, space or {@code /}.
 */
private static final byte[][] META_START = {
    {'<'},
    {'m', 'M'},
    {'e', 'E'},
    {'t', 'T'},
    {'a', 'A'},
    {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
};

/** Byte pattern for {@code <} followed by one of {@code !}, {@code /} or {@code ?}. */
private static final byte[][] OTHER_START = {{'<'}, {'!', '/', '?'}};

/** Byte pattern for the word {@code charset} in any letter case. */
private static final byte[][] CHARSET_START = {
    {'c', 'C'},
    {'h', 'H'},
    {'a', 'A'},
    {'r', 'R'},
    {'s', 'S'},
    {'e', 'E'},
    {'t', 'T'}
};

/** Bytes that end a tag name while scanning: tab, LF, FF, CR, space and {@code >}. */
private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};

/** Byte sequence that closes an HTML comment. */
private static final byte[] COMMENT_END = {'-', '-', '>'};

/** ASCII bytes an XML declaration has to start with (including the trailing space). */
private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);

/** ASCII bytes a CSS charset rule has to start with (including the opening quote). */
private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);

/** Maximum number of bytes read from the stream when looking for a meta tag. */
private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;

/** Maximum number of bytes read from the stream when looking for an XML declaration. */
private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;

/** Maximum number of bytes read from the stream when looking for a CSS charset rule. */
private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;

/**
 * Not instantiable: this class only offers static helpers.
 */
private EncodingSniffer() {
    // utility class
}
116
117
118
119
120
121
122
123
124
125
126 static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
127 for (final NameValuePair pair : headers) {
128 final String name = pair.getName();
129 if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
130 String value = pair.getValue();
131 final int i = value.indexOf(';');
132 if (i != -1) {
133 value = value.substring(0, i);
134 }
135 value = value.trim().toLowerCase(Locale.ROOT);
136 for (final String ending : contentTypeEndings) {
137 if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
138 return true;
139 }
140 }
141 return false;
142 }
143 }
144 return false;
145 }
146
147
148
149
150
151
152
153
154
155 static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
156 if (bytes == null) {
157 return null;
158 }
159
160 Charset encoding = null;
161 if (startsWith(bytes, ByteOrderMark.UTF_8)) {
162 encoding = UTF_8;
163 }
164 else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
165 encoding = UTF_16BE;
166 }
167 else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
168 encoding = UTF_16LE;
169 }
170
171 if (encoding != null && LOG.isDebugEnabled()) {
172 LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
173 }
174 return encoding;
175 }
176
177
178
179
180
181
182
183 private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
184 final byte[] bomBytes = bom.getBytes();
185 final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
186 return Arrays.equals(firstBytes, bomBytes);
187 }
188
189
190
191
192
193
194
195
196
197 public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
198 final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
199 for (int i = 0; i < bytes.length; i++) {
200 if (matches(bytes, i, COMMENT_START)) {
201 i = indexOfSubArray(bytes, COMMENT_END, i);
202 if (i == -1) {
203 break;
204 }
205 i += 2;
206 }
207 else if (matches(bytes, i, META_START)) {
208 i += META_START.length;
209 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
210 i = att.getUpdatedIndex();
211 final String name = att.getName().toLowerCase(Locale.ROOT);
212 final String value = att.getValue().toLowerCase(Locale.ROOT);
213 if ("charset".equals(name) || "content".equals(name)) {
214 Charset charset = null;
215 if ("charset".equals(name)) {
216 charset = toCharset(value);
217
218 if (charset == null && "x-user-defined".equals(value)) {
219 charset = Charset.forName("windows-1252");
220 }
221 }
222 else if ("content".equals(name)) {
223 charset = extractEncodingFromContentType(value);
224
225 if (charset == null && value != null && value.contains("x-user-defined")) {
226 charset = Charset.forName("windows-1252");
227 }
228 if (charset == null) {
229 continue;
230 }
231 }
232 if (UTF_16BE == charset || UTF_16LE == charset) {
233 charset = UTF_8;
234 }
235 if (charset != null) {
236 if (LOG.isDebugEnabled()) {
237 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
238 }
239 return charset;
240 }
241 }
242 }
243 }
244 else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
245 i = skipToAnyOf(bytes, i, WHITESPACE);
246 if (i == -1) {
247 break;
248 }
249 Attribute att = getAttribute(bytes, i);
250 while (att != null) {
251 i = att.getUpdatedIndex();
252 att = getAttribute(bytes, i);
253 }
254 }
255 else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
256 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
257 if (i == -1) {
258 break;
259 }
260 Attribute attribute = getAttribute(bytes, i);
261 while (attribute != null) {
262 i = attribute.getUpdatedIndex();
263 attribute = getAttribute(bytes, i);
264 }
265 }
266 else if (matches(bytes, i, OTHER_START)) {
267 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
268 if (i == -1) {
269 break;
270 }
271 }
272 }
273 return null;
274 }
275
276
277
278
279
280
281
282
283
284
285 static Attribute getAttribute(final byte[] bytes, final int startFrom) {
286 if (startFrom >= bytes.length) {
287 return null;
288 }
289
290 int pos = startFrom;
291 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
292 pos++;
293 if (pos >= bytes.length) {
294 return null;
295 }
296 }
297 if (bytes[pos] == '>') {
298 return null;
299 }
300 final StringBuilder name = new StringBuilder();
301 final StringBuilder value = new StringBuilder();
302 for ( ;; pos++) {
303 if (pos >= bytes.length) {
304 return new Attribute(name.toString(), value.toString(), pos);
305 }
306 if (bytes[pos] == '=' && name.length() != 0) {
307 pos++;
308 break;
309 }
310 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
312 pos++;
313 if (pos >= bytes.length) {
314 return new Attribute(name.toString(), value.toString(), pos);
315 }
316 }
317 if (bytes[pos] != '=') {
318 return new Attribute(name.toString(), value.toString(), pos);
319 }
320 pos++;
321 break;
322 }
323 if (bytes[pos] == '/' || bytes[pos] == '>') {
324 return new Attribute(name.toString(), value.toString(), pos);
325 }
326 name.append((char) bytes[pos]);
327 }
328 if (pos >= bytes.length) {
329 return new Attribute(name.toString(), value.toString(), pos);
330 }
331 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
332 pos++;
333 if (pos >= bytes.length) {
334 return new Attribute(name.toString(), value.toString(), pos);
335 }
336 }
337 if (bytes[pos] == '"' || bytes[pos] == '\'') {
338 final byte b = bytes[pos];
339 for (pos++; pos < bytes.length; pos++) {
340 if (bytes[pos] == b) {
341 pos++;
342 return new Attribute(name.toString(), value.toString(), pos);
343 }
344 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
345 final byte b2 = (byte) (bytes[pos] + 0x20);
346 value.append((char) b2);
347 }
348 else {
349 value.append((char) bytes[pos]);
350 }
351 }
352 return new Attribute(name.toString(), value.toString(), pos);
353 }
354 else if (bytes[pos] == '>') {
355 return new Attribute(name.toString(), value.toString(), pos);
356 }
357 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
358 final byte b = (byte) (bytes[pos] + 0x20);
359 value.append((char) b);
360 pos++;
361 }
362 else {
363 value.append((char) bytes[pos]);
364 pos++;
365 }
366 for ( ; pos < bytes.length; pos++) {
367 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
368 return new Attribute(name.toString(), value.toString(), pos);
369 }
370 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
371 final byte b = (byte) (bytes[pos] + 0x20);
372 value.append((char) b);
373 }
374 else {
375 value.append((char) bytes[pos]);
376 }
377 }
378 return new Attribute(name.toString(), value.toString(), pos);
379 }
380
381
382
383
384
385
386
387
388
389
390 public static Charset extractEncodingFromContentType(final String s) {
391 if (s == null) {
392 return null;
393 }
394 final byte[] bytes = s.getBytes(US_ASCII);
395 int i;
396 for (i = 0; i < bytes.length; i++) {
397 if (matches(bytes, i, CHARSET_START)) {
398 i += CHARSET_START.length;
399 break;
400 }
401 }
402 if (i == bytes.length) {
403 return null;
404 }
405 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
406 i++;
407 if (i == bytes.length) {
408 return null;
409 }
410 }
411 if (bytes[i] != '=') {
412 return null;
413 }
414 i++;
415 if (i == bytes.length) {
416 return null;
417 }
418 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
419 i++;
420 if (i == bytes.length) {
421 return null;
422 }
423 }
424 if (bytes[i] == '"') {
425 if (bytes.length <= i + 1) {
426 return null;
427 }
428 final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
429 if (index == -1) {
430 return null;
431 }
432 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
433 return toCharset(charsetName);
434 }
435 if (bytes[i] == '\'') {
436 if (bytes.length <= i + 1) {
437 return null;
438 }
439 final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
440 if (index == -1) {
441 return null;
442 }
443 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
444 return toCharset(charsetName);
445 }
446 int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
447 if (end == -1) {
448 end = bytes.length;
449 }
450 final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
451 return toCharset(charsetName);
452 }
453
454
455
456
457
458
459
460
461
462 public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
463 final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
464 Charset encoding = null;
465 if (bytes.length > 5
466 && XML_DECLARATION_PREFIX[0] == bytes[0]
467 && XML_DECLARATION_PREFIX[1] == bytes[1]
468 && XML_DECLARATION_PREFIX[2] == bytes[2]
469 && XML_DECLARATION_PREFIX[3] == bytes[3]
470 && XML_DECLARATION_PREFIX[4] == bytes[4]
471 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
472 final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
473 if (index + 1 < bytes.length && bytes[index + 1] == '>') {
474 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
475 int start = declaration.indexOf("encoding");
476 if (start != -1) {
477 start += 8;
478 char delimiter;
479 outer:
480 while (true) {
481 switch (declaration.charAt(start)) {
482 case '"':
483 case '\'':
484 delimiter = declaration.charAt(start);
485 start = start + 1;
486 break outer;
487
488 default:
489 start++;
490 }
491 }
492 final int end = declaration.indexOf(delimiter, start);
493 encoding = toCharset(declaration.substring(start, end));
494 }
495 }
496 }
497 if (encoding != null && LOG.isDebugEnabled()) {
498 LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
499 }
500 return encoding;
501 }
502
503
504
505
506
507
508
509
510
511 public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
512 final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
513 if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
514 return null;
515 }
516 for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
517 if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
518 return null;
519 }
520 }
521
522 Charset encoding = null;
523 final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
524 if (index + 1 < bytes.length && bytes[index + 1] == ';') {
525 encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
526
527 if (encoding == UTF_16BE || encoding == UTF_16LE) {
528 encoding = UTF_8;
529 }
530 }
531 return encoding;
532 }
533
534
535
536
537
538
539
540 public static Charset toCharset(final String charsetName) {
541 final String nameFromLabel = translateEncodingLabel(charsetName);
542 if (nameFromLabel == null) {
543 return null;
544 }
545 try {
546 return Charset.forName(nameFromLabel);
547 }
548 catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
549 return null;
550 }
551 }
552
553
554
555
556
557
558
559
560
561
562
563 static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
564 if (i + sought.length > bytes.length) {
565 return false;
566 }
567 for (int x = 0; x < sought.length; x++) {
568 final byte[] possibilities = sought[x];
569 boolean match = false;
570 for (final byte possibility : possibilities) {
571 if (bytes[i + x] == possibility) {
572 match = true;
573 break;
574 }
575 }
576 if (!match) {
577 return false;
578 }
579 }
580 return true;
581 }
582
583
584
585
586
587
588
589
590
591
592 static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
593 int i = startFrom;
594 for ( ; i < bytes.length; i++) {
595 if (ArrayUtils.contains(targets, bytes[i])) {
596 break;
597 }
598 }
599 if (i == bytes.length) {
600 i = -1;
601 }
602 return i;
603 }
604
605
606
607
608
609
610
611
612
613
614 static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
615 for (int i = startIndex; i < array.length; i++) {
616 boolean found = true;
617 if (i + subarray.length > array.length) {
618 break;
619 }
620 for (int j = 0; j < subarray.length; j++) {
621 final byte a = array[i + j];
622 final byte b = subarray[j];
623 if (a != b) {
624 found = false;
625 break;
626 }
627 }
628 if (found) {
629 return i;
630 }
631 }
632 return -1;
633 }
634
635
636
637
638
639
640
641
642
643
644
645 static byte[] read(final InputStream content, final int size) throws IOException {
646 byte[] bytes = new byte[size];
647
648
649 final int count = IOUtils.read(content, bytes);
650 if (count < size) {
651 final byte[] smaller = new byte[count];
652 System.arraycopy(bytes, 0, smaller, 0, count);
653 bytes = smaller;
654 }
655 return bytes;
656 }
657
658
659
660
661
662
663
664
665
666
667
668
669
670 static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
671 final int prefixLength = prefix.length;
672 final byte[] joined = new byte[prefixLength + size];
673
674
675
676 final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
677 if (count < size) {
678 final byte[] smaller = new byte[prefixLength + count];
679 System.arraycopy(prefix, 0, smaller, 0, prefix.length);
680 System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
681 return smaller;
682 }
683
684 System.arraycopy(prefix, 0, joined, 0, prefix.length);
685 return joined;
686 }
687
688 static class Attribute {
689 private final String name_;
690 private final String value_;
691 private final int updatedIndex_;
692 Attribute(final String name, final String value, final int updatedIndex) {
693 name_ = name;
694 value_ = value;
695 updatedIndex_ = updatedIndex;
696 }
697 String getName() {
698 return name_;
699 }
700 String getValue() {
701 return value_;
702 }
703 int getUpdatedIndex() {
704 return updatedIndex_;
705 }
706 }
707
708
709
710
711
712
713
714 public static String translateEncodingLabel(final String encodingLabel) {
715 if (StringUtils.isEmpty(encodingLabel)) {
716 return null;
717 }
718
719 final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
720 final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
721 if (encLC.equals(enc)) {
722 return encodingLabel;
723 }
724 return enc;
725 }
726 }