View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static java.nio.charset.StandardCharsets.UTF_16BE;
19  import static java.nio.charset.StandardCharsets.UTF_16LE;
20  import static java.nio.charset.StandardCharsets.UTF_8;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.nio.charset.Charset;
25  import java.nio.charset.IllegalCharsetNameException;
26  import java.nio.charset.UnsupportedCharsetException;
27  import java.util.Arrays;
28  import java.util.List;
29  import java.util.Locale;
30  
31  import org.apache.commons.io.ByteOrderMark;
32  import org.apache.commons.io.IOUtils;
33  import org.apache.commons.lang3.ArrayUtils;
34  import org.apache.commons.lang3.StringUtils;
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.htmlunit.HttpHeader;
38  import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
39  
40  /**
41   * Sniffs encoding settings from HTML, XML or other content. The HTML encoding sniffing algorithm is based on the
42   * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
43   * encoding sniffing algorithm</a>.
44   *
45   * @author Daniel Gredler
46   * @author Ahmed Ashour
47   * @author Ronald Brill
48   * @author Lai Quang Duong
49   */
50  public final class EncodingSniffer {
51  
/** Logging support. */
private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);

/**
 * Byte pattern for the start of an HTML comment (<code>&lt;!--</code>). Each row lists the byte
 * values accepted at that offset of the pattern.
 */
private static final byte[][] COMMENT_START = {
    {'<'},
    {'!'},
    {'-'},
    {'-'}
};

/**
 * Byte pattern for the start of a <code>meta</code> HTML tag: <code>&lt;meta</code> in any letter
 * case followed by one of tab, LF, FF, CR, space or <code>/</code>.
 */
private static final byte[][] META_START = {
    {'<'},
    {'m', 'M'},
    {'e', 'E'},
    {'t', 'T'},
    {'a', 'A'},
    {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
};

/**
 * Byte pattern for the start of miscellaneous HTML content: <code>&lt;</code> followed by one of
 * <code>!</code>, <code>/</code> or <code>?</code>.
 */
private static final byte[][] OTHER_START = {
    {'<'},
    {'!', '/', '?'}
};

/** Byte pattern for the word <code>charset</code> in any letter case. */
private static final byte[][] CHARSET_START = {
    {'c', 'C'},
    {'h', 'H'},
    {'a', 'A'},
    {'r', 'R'},
    {'s', 'S'},
    {'e', 'E'},
    {'t', 'T'}
};

/** Tab, LF, FF, CR and space plus - despite the name - the tag closing <code>&gt;</code> (0x3E). */
private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};

/** The bytes that close an HTML comment (<code>--&gt;</code>). */
private static final byte[] COMMENT_END = {'-', '-', '>'};

/** The ASCII bytes an XML declaration has to start with. */
private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);

/** The ASCII bytes a CSS charset rule has to start with, including the opening quote. */
private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);

/**
 * The number of HTML bytes to sniff for encoding info embedded in <code>meta</code> tags.
 */
private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;

/**
 * The number of XML bytes to sniff for encoding info embedded in the XML declaration;
 * relatively small because the declaration is always at the very beginning of the file.
 */
private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;

/**
 * The number of CSS bytes to sniff for a leading charset rule.
 */
private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;

/**
 * This class only offers static helpers; instances are not allowed.
 */
private EncodingSniffer() {
    // static utility class
}
116 
117     /**
118      * Returns {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
119      * ends with one of the specified strings.
120      *
121      * @param headers the HTTP response headers
122      * @param contentTypeEndings the content type endings to search for
123      * @return {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
124      *         ends with one of the specified strings
125      */
126     static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
127         for (final NameValuePair pair : headers) {
128             final String name = pair.getName();
129             if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
130                 String value = pair.getValue();
131                 final int i = value.indexOf(';');
132                 if (i != -1) {
133                     value = value.substring(0, i);
134                 }
135                 value = value.trim().toLowerCase(Locale.ROOT);
136                 for (final String ending : contentTypeEndings) {
137                     if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
138                         return true;
139                     }
140                 }
141                 return false;
142             }
143         }
144         return false;
145     }
146 
147     /**
148      * Attempts to sniff an encoding from a <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
149      * in the specified byte array.
150      *
151      * @param bytes the bytes to check for a Byte Order Mark
152      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
153      *         could not be determined
154      */
155     static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
156         if (bytes == null) {
157             return null;
158         }
159 
160         Charset encoding = null;
161         if (startsWith(bytes, ByteOrderMark.UTF_8)) {
162             encoding = UTF_8;
163         }
164         else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
165             encoding = UTF_16BE;
166         }
167         else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
168             encoding = UTF_16LE;
169         }
170 
171         if (encoding != null && LOG.isDebugEnabled()) {
172             LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
173         }
174         return encoding;
175     }
176 
177     /**
178      * Returns whether the specified byte array starts with the given {@link ByteOrderMark}, or not.
179      * @param bytes the byte array to check
180      * @param bom the {@link ByteOrderMark}
181      * @return whether the specified byte array starts with the given {@link ByteOrderMark}, or not
182      */
183     private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
184         final byte[] bomBytes = bom.getBytes();
185         final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
186         return Arrays.equals(firstBytes, bomBytes);
187     }
188 
189     /**
190      * Attempts to sniff an encoding from an HTML <code>meta</code> tag in the specified byte array.
191      *
192      * @param is the content stream to check for an HTML <code>meta</code> tag
193      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
194      *         could not be determined
195      * @throws IOException if an IO error occurs
196      */
197     public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
198         final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
199         for (int i = 0; i < bytes.length; i++) {
200             if (matches(bytes, i, COMMENT_START)) {
201                 i = indexOfSubArray(bytes, COMMENT_END, i);
202                 if (i == -1) {
203                     break;
204                 }
205                 i += 2;
206             }
207             else if (matches(bytes, i, META_START)) {
208                 i += META_START.length;
209                 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
210                     i = att.getUpdatedIndex();
211                     final String name = att.getName().toLowerCase(Locale.ROOT);
212                     final String value = att.getValue().toLowerCase(Locale.ROOT);
213                     if ("charset".equals(name) || "content".equals(name)) {
214                         Charset charset = null;
215                         if ("charset".equals(name)) {
216                             charset = toCharset(value);
217                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
218                             if (charset == null && "x-user-defined".equals(value)) {
219                                 charset = Charset.forName("windows-1252");
220                             }
221                         }
222                         else if ("content".equals(name)) {
223                             charset = extractEncodingFromContentType(value);
224                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
225                             if (charset == null && value != null && value.contains("x-user-defined")) {
226                                 charset = Charset.forName("windows-1252");
227                             }
228                             if (charset == null) {
229                                 continue;
230                             }
231                         }
232                         if (UTF_16BE == charset || UTF_16LE == charset) {
233                             charset = UTF_8;
234                         }
235                         if (charset != null) {
236                             if (LOG.isDebugEnabled()) {
237                                 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
238                             }
239                             return charset;
240                         }
241                     }
242                 }
243             }
244             else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
245                 i = skipToAnyOf(bytes, i, WHITESPACE);
246                 if (i == -1) {
247                     break;
248                 }
249                 Attribute att = getAttribute(bytes, i);
250                 while (att != null) {
251                     i = att.getUpdatedIndex();
252                     att = getAttribute(bytes, i);
253                 }
254             }
255             else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
256                 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
257                 if (i == -1) {
258                     break;
259                 }
260                 Attribute attribute = getAttribute(bytes, i);
261                 while (attribute != null) {
262                     i = attribute.getUpdatedIndex();
263                     attribute = getAttribute(bytes, i);
264                 }
265             }
266             else if (matches(bytes, i, OTHER_START)) {
267                 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
268                 if (i == -1) {
269                     break;
270                 }
271             }
272         }
273         return null;
274     }
275 
276     /**
277      * Extracts an attribute from the specified byte array, starting at the specified index, using the
278      * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#concept-get-attributes-when-sniffing">HTML5
279      * attribute algorithm</a>.
280      *
281      * @param bytes the byte array to extract an attribute from
282      * @param startFrom the index to start searching from
283      * @return the next attribute in the specified byte array, or {@code null} if one is not available
284      */
285     static Attribute getAttribute(final byte[] bytes, final int startFrom) {
286         if (startFrom >= bytes.length) {
287             return null;
288         }
289 
290         int pos = startFrom;
291         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
292             pos++;
293             if (pos >= bytes.length) {
294                 return null;
295             }
296         }
297         if (bytes[pos] == '>') {
298             return null;
299         }
300         final StringBuilder name = new StringBuilder();
301         final StringBuilder value = new StringBuilder();
302         for ( ;; pos++) {
303             if (pos >= bytes.length) {
304                 return new Attribute(name.toString(), value.toString(), pos);
305             }
306             if (bytes[pos] == '=' && name.length() != 0) {
307                 pos++;
308                 break;
309             }
310             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311                 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
312                     pos++;
313                     if (pos >= bytes.length) {
314                         return new Attribute(name.toString(), value.toString(), pos);
315                     }
316                 }
317                 if (bytes[pos] != '=') {
318                     return new Attribute(name.toString(), value.toString(), pos);
319                 }
320                 pos++;
321                 break;
322             }
323             if (bytes[pos] == '/' || bytes[pos] == '>') {
324                 return new Attribute(name.toString(), value.toString(), pos);
325             }
326             name.append((char) bytes[pos]);
327         }
328         if (pos >= bytes.length) {
329             return new Attribute(name.toString(), value.toString(), pos);
330         }
331         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
332             pos++;
333             if (pos >= bytes.length) {
334                 return new Attribute(name.toString(), value.toString(), pos);
335             }
336         }
337         if (bytes[pos] == '"' || bytes[pos] == '\'') {
338             final byte b = bytes[pos];
339             for (pos++; pos < bytes.length; pos++) {
340                 if (bytes[pos] == b) {
341                     pos++;
342                     return new Attribute(name.toString(), value.toString(), pos);
343                 }
344                 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
345                     final byte b2 = (byte) (bytes[pos] + 0x20);
346                     value.append((char) b2);
347                 }
348                 else {
349                     value.append((char) bytes[pos]);
350                 }
351             }
352             return new Attribute(name.toString(), value.toString(), pos);
353         }
354         else if (bytes[pos] == '>') {
355             return new Attribute(name.toString(), value.toString(), pos);
356         }
357         else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
358             final byte b = (byte) (bytes[pos] + 0x20);
359             value.append((char) b);
360             pos++;
361         }
362         else {
363             value.append((char) bytes[pos]);
364             pos++;
365         }
366         for ( ; pos < bytes.length; pos++) {
367             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
368                 return new Attribute(name.toString(), value.toString(), pos);
369             }
370             else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
371                 final byte b = (byte) (bytes[pos] + 0x20);
372                 value.append((char) b);
373             }
374             else {
375                 value.append((char) bytes[pos]);
376             }
377         }
378         return new Attribute(name.toString(), value.toString(), pos);
379     }
380 
381     /**
382      * Extracts an encoding from the specified <code>Content-Type</code> value using
383      * <a href="http://ietfreport.isoc.org/idref/draft-abarth-mime-sniff/">the IETF algorithm</a>; if
384      * no encoding is found, this method returns {@code null}.
385      *
386      * @param s the <code>Content-Type</code> value to search for an encoding
387      * @return the encoding found in the specified <code>Content-Type</code> value, or {@code null} if no
388      *         encoding was found
389      */
390     public static Charset extractEncodingFromContentType(final String s) {
391         if (s == null) {
392             return null;
393         }
394         final byte[] bytes = s.getBytes(US_ASCII);
395         int i;
396         for (i = 0; i < bytes.length; i++) {
397             if (matches(bytes, i, CHARSET_START)) {
398                 i += CHARSET_START.length;
399                 break;
400             }
401         }
402         if (i == bytes.length) {
403             return null;
404         }
405         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
406             i++;
407             if (i == bytes.length) {
408                 return null;
409             }
410         }
411         if (bytes[i] != '=') {
412             return null;
413         }
414         i++;
415         if (i == bytes.length) {
416             return null;
417         }
418         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
419             i++;
420             if (i == bytes.length) {
421                 return null;
422             }
423         }
424         if (bytes[i] == '"') {
425             if (bytes.length <= i + 1) {
426                 return null;
427             }
428             final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
429             if (index == -1) {
430                 return null;
431             }
432             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
433             return toCharset(charsetName);
434         }
435         if (bytes[i] == '\'') {
436             if (bytes.length <= i + 1) {
437                 return null;
438             }
439             final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
440             if (index == -1) {
441                 return null;
442             }
443             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
444             return toCharset(charsetName);
445         }
446         int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
447         if (end == -1) {
448             end = bytes.length;
449         }
450         final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
451         return toCharset(charsetName);
452     }
453 
454     /**
455      * Searches the specified XML content for an XML declaration and returns the encoding if found,
456      * otherwise returns {@code null}.
457      *
458      * @param is the content stream to check for the charset declaration
459      * @return the encoding of the specified XML content, or {@code null} if it could not be determined
460      * @throws IOException if an IO error occurs
461      */
462     public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
463         final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
464         Charset encoding = null;
465         if (bytes.length > 5
466                 && XML_DECLARATION_PREFIX[0] == bytes[0]
467                 && XML_DECLARATION_PREFIX[1] == bytes[1]
468                 && XML_DECLARATION_PREFIX[2] == bytes[2]
469                 && XML_DECLARATION_PREFIX[3] == bytes[3]
470                 && XML_DECLARATION_PREFIX[4] == bytes[4]
471                 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
472             final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
473             if (index + 1 < bytes.length && bytes[index + 1] == '>') {
474                 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
475                 int start = declaration.indexOf("encoding");
476                 if (start != -1) {
477                     start += 8;
478                     char delimiter;
479                 outer:
480                     while (true) {
481                         switch (declaration.charAt(start)) {
482                             case '"':
483                             case '\'':
484                                 delimiter = declaration.charAt(start);
485                                 start = start + 1;
486                                 break outer;
487 
488                             default:
489                                 start++;
490                         }
491                     }
492                     final int end = declaration.indexOf(delimiter, start);
493                     encoding = toCharset(declaration.substring(start, end));
494                 }
495             }
496         }
497         if (encoding != null && LOG.isDebugEnabled()) {
498             LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
499         }
500         return encoding;
501     }
502 
503     /**
504      * Parses and returns the charset declaration at the start of a css file if any, otherwise returns {@code null}.
505      * <p>e.g. <pre>@charset "UTF-8"</pre>
506      *
507      * @param is the input stream to parse
508      * @return the charset declaration at the start of a css file if any, otherwise returns {@code null}.
509      * @throws IOException if an IO error occurs
510      */
511     public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
512         final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
513         if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
514             return null;
515         }
516         for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
517             if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
518                 return null;
519             }
520         }
521 
522         Charset encoding = null;
523         final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
524         if (index + 1 < bytes.length && bytes[index + 1] == ';') {
525             encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
526             // https://www.w3.org/TR/css-syntax-3/#input-byte-stream "Why use utf-8 when the declaration says utf-16?"
527             if (encoding == UTF_16BE || encoding == UTF_16LE) {
528                 encoding = UTF_8;
529             }
530         }
531         return encoding;
532     }
533 
534     /**
535      * Returns {@code Charset} if the specified charset name is supported on this platform.
536      *
537      * @param charsetName the charset name to check
538      * @return {@code Charset} if the specified charset name is supported on this platform
539      */
540     public static Charset toCharset(final String charsetName) {
541         final String nameFromLabel = translateEncodingLabel(charsetName);
542         if (nameFromLabel == null) {
543             return null;
544         }
545         try {
546             return Charset.forName(nameFromLabel);
547         }
548         catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
549             return null;
550         }
551     }
552 
553     /**
554      * Returns {@code true} if the byte in the specified byte array at the specified index matches one of the
555      * specified byte array patterns.
556      *
557      * @param bytes the byte array to search in
558      * @param i the index at which to search
559      * @param sought the byte array patterns to search for
560      * @return {@code true} if the byte in the specified byte array at the specified index matches one of the
561      *         specified byte array patterns
562      */
563     static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
564         if (i + sought.length > bytes.length) {
565             return false;
566         }
567         for (int x = 0; x < sought.length; x++) {
568             final byte[] possibilities = sought[x];
569             boolean match = false;
570             for (final byte possibility : possibilities) {
571                 if (bytes[i + x] == possibility) {
572                     match = true;
573                     break;
574                 }
575             }
576             if (!match) {
577                 return false;
578             }
579         }
580         return true;
581     }
582 
583     /**
584      * Skips ahead to the first occurrence of any of the specified targets within the specified array,
585      * starting at the specified index. This method returns <code>-1</code> if none of the targets are found.
586      *
587      * @param bytes the array to search through
588      * @param startFrom the index to start looking at
589      * @param targets the targets to search for
590      * @return the index of the first occurrence of the specified targets within the specified array
591      */
592     static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
593         int i = startFrom;
594         for ( ; i < bytes.length; i++) {
595             if (ArrayUtils.contains(targets, bytes[i])) {
596                 break;
597             }
598         }
599         if (i == bytes.length) {
600             i = -1;
601         }
602         return i;
603     }
604 
605     /**
606      * Finds the first index of the specified sub-array inside the specified array, starting at the
607      * specified index. This method returns <code>-1</code> if the specified sub-array cannot be found.
608      *
609      * @param array the array to traverse for looking for the sub-array
610      * @param subarray the sub-array to find
611      * @param startIndex the start index to traverse forwards from
612      * @return the index of the sub-array within the array
613      */
614     static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
615         for (int i = startIndex; i < array.length; i++) {
616             boolean found = true;
617             if (i + subarray.length > array.length) {
618                 break;
619             }
620             for (int j = 0; j < subarray.length; j++) {
621                 final byte a = array[i + j];
622                 final byte b = subarray[j];
623                 if (a != b) {
624                     found = false;
625                     break;
626                 }
627             }
628             if (found) {
629                 return i;
630             }
631         }
632         return -1;
633     }
634 
635     /**
636      * Attempts to read <code>size</code> bytes from the specified input stream. Note that this method is not guaranteed
637      * to be able to read <code>size</code> bytes; however, the returned byte array will always be the exact length of the
638      * number of bytes read.
639      *
640      * @param content the input stream to read from
641      * @param size the number of bytes to try to read
642      * @return the bytes read from the specified input stream
643      * @throws IOException if an IO error occurs
644      */
645     static byte[] read(final InputStream content, final int size) throws IOException {
646         byte[] bytes = new byte[size];
647         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
648         // this may not always be the case for subclasses of InputStream} - eg. GZIPInputStream
649         final int count = IOUtils.read(content, bytes);
650         if (count < size) {
651             final byte[] smaller = new byte[count];
652             System.arraycopy(bytes, 0, smaller, 0, count);
653             bytes = smaller;
654         }
655         return bytes;
656     }
657 
658     /**
659      * Attempts to read <code>size</code> bytes from the specified input stream and then prepends the specified prefix to
660      * the bytes read, returning the resultant byte array. Note that this method is not guaranteed to be able to read
661      * <code>size</code> bytes; however, the returned byte array will always be the exact length of the number of bytes
662      * read plus the length of the prefix array.
663      *
664      * @param content the input stream to read from
665      * @param size the number of bytes to try to read
666      * @param prefix the byte array to prepend to the bytes read from the specified input stream
667      * @return the bytes read from the specified input stream, prefixed by the specified prefix
668      * @throws IOException if an IO error occurs
669      */
670     static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
671         final int prefixLength = prefix.length;
672         final byte[] joined = new byte[prefixLength + size];
673 
674         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
675         // this may not always be the case for subclasses of InputStream} - eg. GZIPInputStream
676         final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
677         if (count < size) {
678             final byte[] smaller = new byte[prefixLength + count];
679             System.arraycopy(prefix, 0, smaller, 0, prefix.length);
680             System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
681             return smaller;
682         }
683 
684         System.arraycopy(prefix, 0, joined, 0, prefix.length);
685         return joined;
686     }
687 
688     static class Attribute {
689         private final String name_;
690         private final String value_;
691         private final int updatedIndex_;
692         Attribute(final String name, final String value, final int updatedIndex) {
693             name_ = name;
694             value_ = value;
695             updatedIndex_ = updatedIndex;
696         }
697         String getName() {
698             return name_;
699         }
700         String getValue() {
701             return value_;
702         }
703         int getUpdatedIndex() {
704             return updatedIndex_;
705         }
706     }
707 
708     /**
709      * Translates the given encoding label into a normalized form
710      * according to <a href="http://encoding.spec.whatwg.org/#encodings">Reference</a>.
711      * @param encodingLabel the label to translate
712      * @return the normalized encoding name or null if not found
713      */
714     public static String translateEncodingLabel(final String encodingLabel) {
715         if (StringUtils.isEmpty(encodingLabel)) {
716             return null;
717         }
718 
719         final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
720         final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
721         if (encLC.equals(enc)) {
722             return encodingLabel;
723         }
724         return enc;
725     }
726 }