1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.util;
16
17 import static java.nio.charset.StandardCharsets.US_ASCII;
18 import static java.nio.charset.StandardCharsets.UTF_16BE;
19 import static java.nio.charset.StandardCharsets.UTF_16LE;
20 import static java.nio.charset.StandardCharsets.UTF_8;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.Charset;
25 import java.nio.charset.IllegalCharsetNameException;
26 import java.nio.charset.UnsupportedCharsetException;
27 import java.util.Arrays;
28 import java.util.List;
29 import java.util.Locale;
30
31 import org.apache.commons.io.ByteOrderMark;
32 import org.apache.commons.io.IOUtils;
33 import org.apache.commons.lang3.ArrayUtils;
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.htmlunit.HttpHeader;
37 import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
38
39
40
41
42
43
44
45
46
47
48
49 public final class EncodingSniffer {
50
51
52 private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);
53
54
55 private static final byte[][] COMMENT_START = {
56 new byte[] {'<'},
57 new byte[] {'!'},
58 new byte[] {'-'},
59 new byte[] {'-'}
60 };
61
62
63 private static final byte[][] META_START = {
64 new byte[] {'<'},
65 new byte[] {'m', 'M'},
66 new byte[] {'e', 'E'},
67 new byte[] {'t', 'T'},
68 new byte[] {'a', 'A'},
69 new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
70 };
71
72
73 private static final byte[][] OTHER_START = {
74 new byte[] {'<'},
75 new byte[] {'!', '/', '?'}
76 };
77
78
79 private static final byte[][] CHARSET_START = {
80 new byte[] {'c', 'C'},
81 new byte[] {'h', 'H'},
82 new byte[] {'a', 'A'},
83 new byte[] {'r', 'R'},
84 new byte[] {'s', 'S'},
85 new byte[] {'e', 'E'},
86 new byte[] {'t', 'T'}
87 };
88
89 private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};
90 private static final byte[] COMMENT_END = {'-', '-', '>'};
91
92 private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);
93
94 private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);
95
96
97
98
99 private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;
100
101
102
103
104
105 private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;
106
107 private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;
108
109
110
111
/**
 * Private constructor; the class cannot be instantiated from outside.
 */
private EncodingSniffer() {
    // intentionally empty
}
115
116
117
118
119
120
121
122
123
124
125 static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
126 for (final NameValuePair pair : headers) {
127 final String name = pair.getName();
128 if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
129 String value = pair.getValue();
130 final int i = value.indexOf(';');
131 if (i != -1) {
132 value = value.substring(0, i);
133 }
134 value = value.trim().toLowerCase(Locale.ROOT);
135 for (final String ending : contentTypeEndings) {
136 if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
137 return true;
138 }
139 }
140 return false;
141 }
142 }
143 return false;
144 }
145
146
147
148
149
150
151
152
153
154 static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
155 if (bytes == null) {
156 return null;
157 }
158
159 Charset encoding = null;
160 if (startsWith(bytes, ByteOrderMark.UTF_8)) {
161 encoding = UTF_8;
162 }
163 else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
164 encoding = UTF_16BE;
165 }
166 else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
167 encoding = UTF_16LE;
168 }
169
170 if (encoding != null && LOG.isDebugEnabled()) {
171 LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
172 }
173 return encoding;
174 }
175
176
177
178
179
180
181
182 private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
183 final byte[] bomBytes = bom.getBytes();
184 final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
185 return Arrays.equals(firstBytes, bomBytes);
186 }
187
188
189
190
191
192
193
194
195
196 public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
197 final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
198 for (int i = 0; i < bytes.length; i++) {
199 if (matches(bytes, i, COMMENT_START)) {
200 i = indexOfSubArray(bytes, COMMENT_END, i);
201 if (i == -1) {
202 break;
203 }
204 i += 2;
205 }
206 else if (matches(bytes, i, META_START)) {
207 i += META_START.length;
208 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
209 i = att.getUpdatedIndex();
210 final String name = att.getName().toLowerCase(Locale.ROOT);
211 final String value = att.getValue().toLowerCase(Locale.ROOT);
212 if ("charset".equals(name) || "content".equals(name)) {
213 Charset charset = null;
214 if ("charset".equals(name)) {
215 charset = toCharset(value);
216
217 if (charset == null && "x-user-defined".equals(value)) {
218 charset = Charset.forName("windows-1252");
219 }
220 }
221 else if ("content".equals(name)) {
222 charset = extractEncodingFromContentType(value);
223
224 if (charset == null && value != null && value.contains("x-user-defined")) {
225 charset = Charset.forName("windows-1252");
226 }
227 if (charset == null) {
228 continue;
229 }
230 }
231 if (UTF_16BE == charset || UTF_16LE == charset) {
232 charset = UTF_8;
233 }
234 if (charset != null) {
235 if (LOG.isDebugEnabled()) {
236 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
237 }
238 return charset;
239 }
240 }
241 }
242 }
243 else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
244 i = skipToAnyOf(bytes, i, WHITESPACE);
245 if (i == -1) {
246 break;
247 }
248 Attribute att = getAttribute(bytes, i);
249 while (att != null) {
250 i = att.getUpdatedIndex();
251 att = getAttribute(bytes, i);
252 }
253 }
254 else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
255 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
256 if (i == -1) {
257 break;
258 }
259 Attribute attribute = getAttribute(bytes, i);
260 while (attribute != null) {
261 i = attribute.getUpdatedIndex();
262 attribute = getAttribute(bytes, i);
263 }
264 }
265 else if (matches(bytes, i, OTHER_START)) {
266 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
267 if (i == -1) {
268 break;
269 }
270 }
271 }
272 return null;
273 }
274
275
276
277
278
279
280
281
282
283
284 static Attribute getAttribute(final byte[] bytes, final int startFrom) {
285 if (startFrom >= bytes.length) {
286 return null;
287 }
288
289 int pos = startFrom;
290 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
291 pos++;
292 if (pos >= bytes.length) {
293 return null;
294 }
295 }
296 if (bytes[pos] == '>') {
297 return null;
298 }
299 final StringBuilder name = new StringBuilder();
300 final StringBuilder value = new StringBuilder();
301 for ( ;; pos++) {
302 if (pos >= bytes.length) {
303 return new Attribute(name.toString(), value.toString(), pos);
304 }
305 if (bytes[pos] == '=' && name.length() != 0) {
306 pos++;
307 break;
308 }
309 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
310 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311 pos++;
312 if (pos >= bytes.length) {
313 return new Attribute(name.toString(), value.toString(), pos);
314 }
315 }
316 if (bytes[pos] != '=') {
317 return new Attribute(name.toString(), value.toString(), pos);
318 }
319 pos++;
320 break;
321 }
322 if (bytes[pos] == '/' || bytes[pos] == '>') {
323 return new Attribute(name.toString(), value.toString(), pos);
324 }
325 name.append((char) bytes[pos]);
326 }
327 if (pos >= bytes.length) {
328 return new Attribute(name.toString(), value.toString(), pos);
329 }
330 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
331 pos++;
332 if (pos >= bytes.length) {
333 return new Attribute(name.toString(), value.toString(), pos);
334 }
335 }
336 if (bytes[pos] == '"' || bytes[pos] == '\'') {
337 final byte b = bytes[pos];
338 for (pos++; pos < bytes.length; pos++) {
339 if (bytes[pos] == b) {
340 pos++;
341 return new Attribute(name.toString(), value.toString(), pos);
342 }
343 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
344 final byte b2 = (byte) (bytes[pos] + 0x20);
345 value.append((char) b2);
346 }
347 else {
348 value.append((char) bytes[pos]);
349 }
350 }
351 return new Attribute(name.toString(), value.toString(), pos);
352 }
353 else if (bytes[pos] == '>') {
354 return new Attribute(name.toString(), value.toString(), pos);
355 }
356 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
357 final byte b = (byte) (bytes[pos] + 0x20);
358 value.append((char) b);
359 pos++;
360 }
361 else {
362 value.append((char) bytes[pos]);
363 pos++;
364 }
365 for ( ; pos < bytes.length; pos++) {
366 if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
367 return new Attribute(name.toString(), value.toString(), pos);
368 }
369 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
370 final byte b = (byte) (bytes[pos] + 0x20);
371 value.append((char) b);
372 }
373 else {
374 value.append((char) bytes[pos]);
375 }
376 }
377 return new Attribute(name.toString(), value.toString(), pos);
378 }
379
380
381
382
383
384
385
386
387
388
389 public static Charset extractEncodingFromContentType(final String s) {
390 if (s == null) {
391 return null;
392 }
393 final byte[] bytes = s.getBytes(US_ASCII);
394 int i;
395 for (i = 0; i < bytes.length; i++) {
396 if (matches(bytes, i, CHARSET_START)) {
397 i += CHARSET_START.length;
398 break;
399 }
400 }
401 if (i == bytes.length) {
402 return null;
403 }
404 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
405 i++;
406 if (i == bytes.length) {
407 return null;
408 }
409 }
410 if (bytes[i] != '=') {
411 return null;
412 }
413 i++;
414 if (i == bytes.length) {
415 return null;
416 }
417 while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
418 i++;
419 if (i == bytes.length) {
420 return null;
421 }
422 }
423 if (bytes[i] == '"') {
424 if (bytes.length <= i + 1) {
425 return null;
426 }
427 final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
428 if (index == -1) {
429 return null;
430 }
431 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
432 return toCharset(charsetName);
433 }
434 if (bytes[i] == '\'') {
435 if (bytes.length <= i + 1) {
436 return null;
437 }
438 final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
439 if (index == -1) {
440 return null;
441 }
442 final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
443 return toCharset(charsetName);
444 }
445 int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
446 if (end == -1) {
447 end = bytes.length;
448 }
449 final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
450 return toCharset(charsetName);
451 }
452
453
454
455
456
457
458
459
460
461 public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
462 final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
463 Charset encoding = null;
464 if (bytes.length > 5
465 && XML_DECLARATION_PREFIX[0] == bytes[0]
466 && XML_DECLARATION_PREFIX[1] == bytes[1]
467 && XML_DECLARATION_PREFIX[2] == bytes[2]
468 && XML_DECLARATION_PREFIX[3] == bytes[3]
469 && XML_DECLARATION_PREFIX[4] == bytes[4]
470 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
471 final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
472 if (index + 1 < bytes.length && bytes[index + 1] == '>') {
473 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
474 int start = declaration.indexOf("encoding");
475 if (start != -1) {
476 start += 8;
477 final char delimiter;
478 outer:
479 while (true) {
480 switch (declaration.charAt(start)) {
481 case '"':
482 case '\'':
483 delimiter = declaration.charAt(start);
484 start = start + 1;
485 break outer;
486
487 default:
488 start++;
489 }
490 }
491 final int end = declaration.indexOf(delimiter, start);
492 encoding = toCharset(declaration.substring(start, end));
493 }
494 }
495 }
496 if (encoding != null && LOG.isDebugEnabled()) {
497 LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
498 }
499 return encoding;
500 }
501
502
503
504
505
506
507
508
509
510 public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
511 final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
512 if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
513 return null;
514 }
515 for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
516 if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
517 return null;
518 }
519 }
520
521 Charset encoding = null;
522 final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
523 if (index + 1 < bytes.length && bytes[index + 1] == ';') {
524 encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
525
526 if (encoding == UTF_16BE || encoding == UTF_16LE) {
527 encoding = UTF_8;
528 }
529 }
530 return encoding;
531 }
532
533
534
535
536
537
538
539 public static Charset toCharset(final String charsetName) {
540 final String nameFromLabel = translateEncodingLabel(charsetName);
541 if (nameFromLabel == null) {
542 return null;
543 }
544 try {
545 return Charset.forName(nameFromLabel);
546 }
547 catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
548 return null;
549 }
550 }
551
552
553
554
555
556
557
558
559
560
561
562 static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
563 if (i + sought.length > bytes.length) {
564 return false;
565 }
566 for (int x = 0; x < sought.length; x++) {
567 final byte[] possibilities = sought[x];
568 boolean match = false;
569 for (final byte possibility : possibilities) {
570 if (bytes[i + x] == possibility) {
571 match = true;
572 break;
573 }
574 }
575 if (!match) {
576 return false;
577 }
578 }
579 return true;
580 }
581
582
583
584
585
586
587
588
589
590
591 static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
592 int i = startFrom;
593 for ( ; i < bytes.length; i++) {
594 if (ArrayUtils.contains(targets, bytes[i])) {
595 break;
596 }
597 }
598 if (i == bytes.length) {
599 i = -1;
600 }
601 return i;
602 }
603
604
605
606
607
608
609
610
611
612
613 static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
614 for (int i = startIndex; i < array.length; i++) {
615 boolean found = true;
616 if (i + subarray.length > array.length) {
617 break;
618 }
619 for (int j = 0; j < subarray.length; j++) {
620 final byte a = array[i + j];
621 final byte b = subarray[j];
622 if (a != b) {
623 found = false;
624 break;
625 }
626 }
627 if (found) {
628 return i;
629 }
630 }
631 return -1;
632 }
633
634
635
636
637
638
639
640
641
642
643
644 static byte[] read(final InputStream content, final int size) throws IOException {
645 byte[] bytes = new byte[size];
646
647
648 final int count = IOUtils.read(content, bytes);
649 if (count < size) {
650 final byte[] smaller = new byte[count];
651 System.arraycopy(bytes, 0, smaller, 0, count);
652 bytes = smaller;
653 }
654 return bytes;
655 }
656
657
658
659
660
661
662
663
664
665
666
667
668
669 static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
670 final int prefixLength = prefix.length;
671 final byte[] joined = new byte[prefixLength + size];
672
673
674
675 final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
676 if (count < size) {
677 final byte[] smaller = new byte[prefixLength + count];
678 System.arraycopy(prefix, 0, smaller, 0, prefix.length);
679 System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
680 return smaller;
681 }
682
683 System.arraycopy(prefix, 0, joined, 0, prefix.length);
684 return joined;
685 }
686
687 static class Attribute {
688 private final String name_;
689 private final String value_;
690 private final int updatedIndex_;
691 Attribute(final String name, final String value, final int updatedIndex) {
692 name_ = name;
693 value_ = value;
694 updatedIndex_ = updatedIndex;
695 }
696 String getName() {
697 return name_;
698 }
699 String getValue() {
700 return value_;
701 }
702 int getUpdatedIndex() {
703 return updatedIndex_;
704 }
705 }
706
707
708
709
710
711
712
713 public static String translateEncodingLabel(final String encodingLabel) {
714 if (StringUtils.isEmptyOrNull(encodingLabel)) {
715 return null;
716 }
717
718 final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
719 final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
720 if (encLC.equals(enc)) {
721 return encodingLabel;
722 }
723 return enc;
724 }
725 }