View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static java.nio.charset.StandardCharsets.UTF_16BE;
19  import static java.nio.charset.StandardCharsets.UTF_16LE;
20  import static java.nio.charset.StandardCharsets.UTF_8;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.nio.charset.Charset;
25  import java.nio.charset.IllegalCharsetNameException;
26  import java.nio.charset.UnsupportedCharsetException;
27  import java.util.Arrays;
28  import java.util.List;
29  import java.util.Locale;
30  
31  import org.apache.commons.io.ByteOrderMark;
32  import org.apache.commons.io.IOUtils;
33  import org.apache.commons.lang3.ArrayUtils;
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.htmlunit.HttpHeader;
37  import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
38  
39  /**
40   * Sniffs encoding settings from HTML, XML or other content. The HTML encoding sniffing algorithm is based on the
41   * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
42   * encoding sniffing algorithm</a>.
43   *
44   * @author Daniel Gredler
45   * @author Ahmed Ashour
46   * @author Ronald Brill
47   * @author Lai Quang Duong
48   */
49  public final class EncodingSniffer {
50  
51      /** Logging support. */
52      private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);
53  
54      /** Sequence(s) of bytes indicating the beginning of a comment. */
55      private static final byte[][] COMMENT_START = {
56          new byte[] {'<'},
57          new byte[] {'!'},
58          new byte[] {'-'},
59          new byte[] {'-'}
60      };
61  
62      /** Sequence(s) of bytes indicating the beginning of a <code>meta</code> HTML tag. */
63      private static final byte[][] META_START = {
64          new byte[] {'<'},
65          new byte[] {'m', 'M'},
66          new byte[] {'e', 'E'},
67          new byte[] {'t', 'T'},
68          new byte[] {'a', 'A'},
69          new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
70      };
71  
72      /** Sequence(s) of bytes indicating the beginning of miscellaneous HTML content. */
73      private static final byte[][] OTHER_START = {
74          new byte[] {'<'},
75          new byte[] {'!', '/', '?'}
76      };
77  
78      /** Sequence(s) of bytes indicating the beginning of a charset specification. */
79      private static final byte[][] CHARSET_START = {
80          new byte[] {'c', 'C'},
81          new byte[] {'h', 'H'},
82          new byte[] {'a', 'A'},
83          new byte[] {'r', 'R'},
84          new byte[] {'s', 'S'},
85          new byte[] {'e', 'E'},
86          new byte[] {'t', 'T'}
87      };
88  
89      private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};
90      private static final byte[] COMMENT_END = {'-', '-', '>'};
91  
92      private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);
93  
94      private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);
95  
96      /**
97       * The number of HTML bytes to sniff for encoding info embedded in <code>meta</code> tags;
98       */
99      private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;
100 
101     /**
102      * The number of XML bytes to sniff for encoding info embedded in the XML declaration;
103      * relatively small because it's always at the very beginning of the file.
104      */
105     private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;
106 
107     private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;
108 
109     /**
110      * Disallow instantiation of this class.
111      */
112     private EncodingSniffer() {
113         // Empty.
114     }
115 
116     /**
117      * Returns {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
118      * ends with one of the specified strings.
119      *
120      * @param headers the HTTP response headers
121      * @param contentTypeEndings the content type endings to search for
122      * @return {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
123      *         ends with one of the specified strings
124      */
125     static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
126         for (final NameValuePair pair : headers) {
127             final String name = pair.getName();
128             if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
129                 String value = pair.getValue();
130                 final int i = value.indexOf(';');
131                 if (i != -1) {
132                     value = value.substring(0, i);
133                 }
134                 value = value.trim().toLowerCase(Locale.ROOT);
135                 for (final String ending : contentTypeEndings) {
136                     if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
137                         return true;
138                     }
139                 }
140                 return false;
141             }
142         }
143         return false;
144     }
145 
146     /**
147      * Attempts to sniff an encoding from a <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
148      * in the specified byte array.
149      *
150      * @param bytes the bytes to check for a Byte Order Mark
151      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
152      *         could not be determined
153      */
154     static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
155         if (bytes == null) {
156             return null;
157         }
158 
159         Charset encoding = null;
160         if (startsWith(bytes, ByteOrderMark.UTF_8)) {
161             encoding = UTF_8;
162         }
163         else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
164             encoding = UTF_16BE;
165         }
166         else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
167             encoding = UTF_16LE;
168         }
169 
170         if (encoding != null && LOG.isDebugEnabled()) {
171             LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
172         }
173         return encoding;
174     }
175 
176     /**
177      * Returns whether the specified byte array starts with the given {@link ByteOrderMark}, or not.
178      * @param bytes the byte array to check
179      * @param bom the {@link ByteOrderMark}
180      * @return whether the specified byte array starts with the given {@link ByteOrderMark}, or not
181      */
182     private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
183         final byte[] bomBytes = bom.getBytes();
184         final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
185         return Arrays.equals(firstBytes, bomBytes);
186     }
187 
188     /**
189      * Attempts to sniff an encoding from an HTML <code>meta</code> tag in the specified byte array.
190      *
191      * @param is the content stream to check for an HTML <code>meta</code> tag
192      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
193      *         could not be determined
194      * @throws IOException if an IO error occurs
195      */
196     public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
197         final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
198         for (int i = 0; i < bytes.length; i++) {
199             if (matches(bytes, i, COMMENT_START)) {
200                 i = indexOfSubArray(bytes, COMMENT_END, i);
201                 if (i == -1) {
202                     break;
203                 }
204                 i += 2;
205             }
206             else if (matches(bytes, i, META_START)) {
207                 i += META_START.length;
208                 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
209                     i = att.getUpdatedIndex();
210                     final String name = att.getName().toLowerCase(Locale.ROOT);
211                     final String value = att.getValue().toLowerCase(Locale.ROOT);
212                     if ("charset".equals(name) || "content".equals(name)) {
213                         Charset charset = null;
214                         if ("charset".equals(name)) {
215                             charset = toCharset(value);
216                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
217                             if (charset == null && "x-user-defined".equals(value)) {
218                                 charset = Charset.forName("windows-1252");
219                             }
220                         }
221                         else if ("content".equals(name)) {
222                             charset = extractEncodingFromContentType(value);
223                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
224                             if (charset == null && value != null && value.contains("x-user-defined")) {
225                                 charset = Charset.forName("windows-1252");
226                             }
227                             if (charset == null) {
228                                 continue;
229                             }
230                         }
231                         if (UTF_16BE == charset || UTF_16LE == charset) {
232                             charset = UTF_8;
233                         }
234                         if (charset != null) {
235                             if (LOG.isDebugEnabled()) {
236                                 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
237                             }
238                             return charset;
239                         }
240                     }
241                 }
242             }
243             else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
244                 i = skipToAnyOf(bytes, i, WHITESPACE);
245                 if (i == -1) {
246                     break;
247                 }
248                 Attribute att = getAttribute(bytes, i);
249                 while (att != null) {
250                     i = att.getUpdatedIndex();
251                     att = getAttribute(bytes, i);
252                 }
253             }
254             else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
255                 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
256                 if (i == -1) {
257                     break;
258                 }
259                 Attribute attribute = getAttribute(bytes, i);
260                 while (attribute != null) {
261                     i = attribute.getUpdatedIndex();
262                     attribute = getAttribute(bytes, i);
263                 }
264             }
265             else if (matches(bytes, i, OTHER_START)) {
266                 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
267                 if (i == -1) {
268                     break;
269                 }
270             }
271         }
272         return null;
273     }
274 
275     /**
276      * Extracts an attribute from the specified byte array, starting at the specified index, using the
277      * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#concept-get-attributes-when-sniffing">HTML5
278      * attribute algorithm</a>.
279      *
280      * @param bytes the byte array to extract an attribute from
281      * @param startFrom the index to start searching from
282      * @return the next attribute in the specified byte array, or {@code null} if one is not available
283      */
284     static Attribute getAttribute(final byte[] bytes, final int startFrom) {
285         if (startFrom >= bytes.length) {
286             return null;
287         }
288 
289         int pos = startFrom;
290         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
291             pos++;
292             if (pos >= bytes.length) {
293                 return null;
294             }
295         }
296         if (bytes[pos] == '>') {
297             return null;
298         }
299         final StringBuilder name = new StringBuilder();
300         final StringBuilder value = new StringBuilder();
301         for ( ;; pos++) {
302             if (pos >= bytes.length) {
303                 return new Attribute(name.toString(), value.toString(), pos);
304             }
305             if (bytes[pos] == '=' && name.length() != 0) {
306                 pos++;
307                 break;
308             }
309             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
310                 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311                     pos++;
312                     if (pos >= bytes.length) {
313                         return new Attribute(name.toString(), value.toString(), pos);
314                     }
315                 }
316                 if (bytes[pos] != '=') {
317                     return new Attribute(name.toString(), value.toString(), pos);
318                 }
319                 pos++;
320                 break;
321             }
322             if (bytes[pos] == '/' || bytes[pos] == '>') {
323                 return new Attribute(name.toString(), value.toString(), pos);
324             }
325             name.append((char) bytes[pos]);
326         }
327         if (pos >= bytes.length) {
328             return new Attribute(name.toString(), value.toString(), pos);
329         }
330         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
331             pos++;
332             if (pos >= bytes.length) {
333                 return new Attribute(name.toString(), value.toString(), pos);
334             }
335         }
336         if (bytes[pos] == '"' || bytes[pos] == '\'') {
337             final byte b = bytes[pos];
338             for (pos++; pos < bytes.length; pos++) {
339                 if (bytes[pos] == b) {
340                     pos++;
341                     return new Attribute(name.toString(), value.toString(), pos);
342                 }
343                 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
344                     final byte b2 = (byte) (bytes[pos] + 0x20);
345                     value.append((char) b2);
346                 }
347                 else {
348                     value.append((char) bytes[pos]);
349                 }
350             }
351             return new Attribute(name.toString(), value.toString(), pos);
352         }
353         else if (bytes[pos] == '>') {
354             return new Attribute(name.toString(), value.toString(), pos);
355         }
356         else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
357             final byte b = (byte) (bytes[pos] + 0x20);
358             value.append((char) b);
359             pos++;
360         }
361         else {
362             value.append((char) bytes[pos]);
363             pos++;
364         }
365         for ( ; pos < bytes.length; pos++) {
366             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
367                 return new Attribute(name.toString(), value.toString(), pos);
368             }
369             else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
370                 final byte b = (byte) (bytes[pos] + 0x20);
371                 value.append((char) b);
372             }
373             else {
374                 value.append((char) bytes[pos]);
375             }
376         }
377         return new Attribute(name.toString(), value.toString(), pos);
378     }
379 
380     /**
381      * Extracts an encoding from the specified <code>Content-Type</code> value using
382      * <a href="http://ietfreport.isoc.org/idref/draft-abarth-mime-sniff/">the IETF algorithm</a>; if
383      * no encoding is found, this method returns {@code null}.
384      *
385      * @param s the <code>Content-Type</code> value to search for an encoding
386      * @return the encoding found in the specified <code>Content-Type</code> value, or {@code null} if no
387      *         encoding was found
388      */
389     public static Charset extractEncodingFromContentType(final String s) {
390         if (s == null) {
391             return null;
392         }
393         final byte[] bytes = s.getBytes(US_ASCII);
394         int i;
395         for (i = 0; i < bytes.length; i++) {
396             if (matches(bytes, i, CHARSET_START)) {
397                 i += CHARSET_START.length;
398                 break;
399             }
400         }
401         if (i == bytes.length) {
402             return null;
403         }
404         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
405             i++;
406             if (i == bytes.length) {
407                 return null;
408             }
409         }
410         if (bytes[i] != '=') {
411             return null;
412         }
413         i++;
414         if (i == bytes.length) {
415             return null;
416         }
417         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
418             i++;
419             if (i == bytes.length) {
420                 return null;
421             }
422         }
423         if (bytes[i] == '"') {
424             if (bytes.length <= i + 1) {
425                 return null;
426             }
427             final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
428             if (index == -1) {
429                 return null;
430             }
431             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
432             return toCharset(charsetName);
433         }
434         if (bytes[i] == '\'') {
435             if (bytes.length <= i + 1) {
436                 return null;
437             }
438             final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
439             if (index == -1) {
440                 return null;
441             }
442             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
443             return toCharset(charsetName);
444         }
445         int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
446         if (end == -1) {
447             end = bytes.length;
448         }
449         final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
450         return toCharset(charsetName);
451     }
452 
453     /**
454      * Searches the specified XML content for an XML declaration and returns the encoding if found,
455      * otherwise returns {@code null}.
456      *
457      * @param is the content stream to check for the charset declaration
458      * @return the encoding of the specified XML content, or {@code null} if it could not be determined
459      * @throws IOException if an IO error occurs
460      */
461     public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
462         final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
463         Charset encoding = null;
464         if (bytes.length > 5
465                 && XML_DECLARATION_PREFIX[0] == bytes[0]
466                 && XML_DECLARATION_PREFIX[1] == bytes[1]
467                 && XML_DECLARATION_PREFIX[2] == bytes[2]
468                 && XML_DECLARATION_PREFIX[3] == bytes[3]
469                 && XML_DECLARATION_PREFIX[4] == bytes[4]
470                 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
471             final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
472             if (index + 1 < bytes.length && bytes[index + 1] == '>') {
473                 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
474                 int start = declaration.indexOf("encoding");
475                 if (start != -1) {
476                     start += 8;
477                     final char delimiter;
478                 outer:
479                     while (true) {
480                         switch (declaration.charAt(start)) {
481                             case '"':
482                             case '\'':
483                                 delimiter = declaration.charAt(start);
484                                 start = start + 1;
485                                 break outer;
486 
487                             default:
488                                 start++;
489                         }
490                     }
491                     final int end = declaration.indexOf(delimiter, start);
492                     encoding = toCharset(declaration.substring(start, end));
493                 }
494             }
495         }
496         if (encoding != null && LOG.isDebugEnabled()) {
497             LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
498         }
499         return encoding;
500     }
501 
502     /**
503      * Parses and returns the charset declaration at the start of a css file if any, otherwise returns {@code null}.
504      * <p>e.g. <pre>@charset "UTF-8"</pre>
505      *
506      * @param is the input stream to parse
507      * @return the charset declaration at the start of a css file if any, otherwise returns {@code null}.
508      * @throws IOException if an IO error occurs
509      */
510     public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
511         final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
512         if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
513             return null;
514         }
515         for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
516             if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
517                 return null;
518             }
519         }
520 
521         Charset encoding = null;
522         final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
523         if (index + 1 < bytes.length && bytes[index + 1] == ';') {
524             encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
525             // https://www.w3.org/TR/css-syntax-3/#input-byte-stream "Why use utf-8 when the declaration says utf-16?"
526             if (encoding == UTF_16BE || encoding == UTF_16LE) {
527                 encoding = UTF_8;
528             }
529         }
530         return encoding;
531     }
532 
533     /**
534      * Returns {@code Charset} if the specified charset name is supported on this platform.
535      *
536      * @param charsetName the charset name to check
537      * @return {@code Charset} if the specified charset name is supported on this platform
538      */
539     public static Charset toCharset(final String charsetName) {
540         final String nameFromLabel = translateEncodingLabel(charsetName);
541         if (nameFromLabel == null) {
542             return null;
543         }
544         try {
545             return Charset.forName(nameFromLabel);
546         }
547         catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
548             return null;
549         }
550     }
551 
552     /**
553      * Returns {@code true} if the byte in the specified byte array at the specified index matches one of the
554      * specified byte array patterns.
555      *
556      * @param bytes the byte array to search in
557      * @param i the index at which to search
558      * @param sought the byte array patterns to search for
559      * @return {@code true} if the byte in the specified byte array at the specified index matches one of the
560      *         specified byte array patterns
561      */
562     static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
563         if (i + sought.length > bytes.length) {
564             return false;
565         }
566         for (int x = 0; x < sought.length; x++) {
567             final byte[] possibilities = sought[x];
568             boolean match = false;
569             for (final byte possibility : possibilities) {
570                 if (bytes[i + x] == possibility) {
571                     match = true;
572                     break;
573                 }
574             }
575             if (!match) {
576                 return false;
577             }
578         }
579         return true;
580     }
581 
582     /**
583      * Skips ahead to the first occurrence of any of the specified targets within the specified array,
584      * starting at the specified index. This method returns <code>-1</code> if none of the targets are found.
585      *
586      * @param bytes the array to search through
587      * @param startFrom the index to start looking at
588      * @param targets the targets to search for
589      * @return the index of the first occurrence of the specified targets within the specified array
590      */
591     static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
592         int i = startFrom;
593         for ( ; i < bytes.length; i++) {
594             if (ArrayUtils.contains(targets, bytes[i])) {
595                 break;
596             }
597         }
598         if (i == bytes.length) {
599             i = -1;
600         }
601         return i;
602     }
603 
604     /**
605      * Finds the first index of the specified sub-array inside the specified array, starting at the
606      * specified index. This method returns <code>-1</code> if the specified sub-array cannot be found.
607      *
608      * @param array the array to traverse for looking for the sub-array
609      * @param subarray the sub-array to find
610      * @param startIndex the start index to traverse forwards from
611      * @return the index of the sub-array within the array
612      */
613     static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
614         for (int i = startIndex; i < array.length; i++) {
615             boolean found = true;
616             if (i + subarray.length > array.length) {
617                 break;
618             }
619             for (int j = 0; j < subarray.length; j++) {
620                 final byte a = array[i + j];
621                 final byte b = subarray[j];
622                 if (a != b) {
623                     found = false;
624                     break;
625                 }
626             }
627             if (found) {
628                 return i;
629             }
630         }
631         return -1;
632     }
633 
634     /**
635      * Attempts to read <code>size</code> bytes from the specified input stream. Note that this method is not guaranteed
636      * to be able to read <code>size</code> bytes; however, the returned byte array will always be the exact length of the
637      * number of bytes read.
638      *
639      * @param content the input stream to read from
640      * @param size the number of bytes to try to read
641      * @return the bytes read from the specified input stream
642      * @throws IOException if an IO error occurs
643      */
644     static byte[] read(final InputStream content, final int size) throws IOException {
645         byte[] bytes = new byte[size];
646         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
647         // this may not always be the case for subclasses of InputStream} - eg. GZIPInputStream
648         final int count = IOUtils.read(content, bytes);
649         if (count < size) {
650             final byte[] smaller = new byte[count];
651             System.arraycopy(bytes, 0, smaller, 0, count);
652             bytes = smaller;
653         }
654         return bytes;
655     }
656 
657     /**
658      * Attempts to read <code>size</code> bytes from the specified input stream and then prepends the specified prefix to
659      * the bytes read, returning the resultant byte array. Note that this method is not guaranteed to be able to read
660      * <code>size</code> bytes; however, the returned byte array will always be the exact length of the number of bytes
661      * read plus the length of the prefix array.
662      *
663      * @param content the input stream to read from
664      * @param size the number of bytes to try to read
665      * @param prefix the byte array to prepend to the bytes read from the specified input stream
666      * @return the bytes read from the specified input stream, prefixed by the specified prefix
667      * @throws IOException if an IO error occurs
668      */
669     static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
670         final int prefixLength = prefix.length;
671         final byte[] joined = new byte[prefixLength + size];
672 
673         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
674         // this may not always be the case for subclasses of InputStream} - eg. GZIPInputStream
675         final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
676         if (count < size) {
677             final byte[] smaller = new byte[prefixLength + count];
678             System.arraycopy(prefix, 0, smaller, 0, prefix.length);
679             System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
680             return smaller;
681         }
682 
683         System.arraycopy(prefix, 0, joined, 0, prefix.length);
684         return joined;
685     }
686 
687     static class Attribute {
688         private final String name_;
689         private final String value_;
690         private final int updatedIndex_;
691         Attribute(final String name, final String value, final int updatedIndex) {
692             name_ = name;
693             value_ = value;
694             updatedIndex_ = updatedIndex;
695         }
696         String getName() {
697             return name_;
698         }
699         String getValue() {
700             return value_;
701         }
702         int getUpdatedIndex() {
703             return updatedIndex_;
704         }
705     }
706 
707     /**
708      * Translates the given encoding label into a normalized form
709      * according to <a href="http://encoding.spec.whatwg.org/#encodings">Reference</a>.
710      * @param encodingLabel the label to translate
711      * @return the normalized encoding name or null if not found
712      */
713     public static String translateEncodingLabel(final String encodingLabel) {
714         if (StringUtils.isEmptyOrNull(encodingLabel)) {
715             return null;
716         }
717 
718         final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
719         final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
720         if (encLC.equals(enc)) {
721             return encodingLabel;
722         }
723         return enc;
724     }
725 }