View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static java.nio.charset.StandardCharsets.UTF_16BE;
19  import static java.nio.charset.StandardCharsets.UTF_16LE;
20  import static java.nio.charset.StandardCharsets.UTF_8;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.nio.charset.Charset;
25  import java.nio.charset.IllegalCharsetNameException;
26  import java.nio.charset.UnsupportedCharsetException;
27  import java.util.Arrays;
28  import java.util.List;
29  import java.util.Locale;
30  
31  import org.apache.commons.io.ByteOrderMark;
32  import org.apache.commons.io.IOUtils;
33  import org.apache.commons.lang3.ArrayUtils;
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.htmlunit.HttpHeader;
37  import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
38  
39  /**
40   * Sniffs encoding settings from HTML, XML or other content. The HTML encoding sniffing algorithm is based on the
41   * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
42   * encoding sniffing algorithm</a>.
43   *
44   * @author Daniel Gredler
45   * @author Ahmed Ashour
46   * @author Ronald Brill
47   * @author Lai Quang Duong
48   */
49  public final class EncodingSniffer {
50  
51      /** Logging support. */
52      private static final Log LOG = LogFactory.getLog(EncodingSniffer.class);
53  
54      /** Sequence(s) of bytes indicating the beginning of a comment. */
55      private static final byte[][] COMMENT_START = {
56          new byte[] {'<'},
57          new byte[] {'!'},
58          new byte[] {'-'},
59          new byte[] {'-'}
60      };
61  
62      /** Sequence(s) of bytes indicating the beginning of a <code>meta</code> HTML tag. */
63      private static final byte[][] META_START = {
64          new byte[] {'<'},
65          new byte[] {'m', 'M'},
66          new byte[] {'e', 'E'},
67          new byte[] {'t', 'T'},
68          new byte[] {'a', 'A'},
69          new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F}
70      };
71  
72      /** Sequence(s) of bytes indicating the beginning of miscellaneous HTML content. */
73      private static final byte[][] OTHER_START = {
74          new byte[] {'<'},
75          new byte[] {'!', '/', '?'}
76      };
77  
78      /** Sequence(s) of bytes indicating the beginning of a charset specification. */
79      private static final byte[][] CHARSET_START = {
80          new byte[] {'c', 'C'},
81          new byte[] {'h', 'H'},
82          new byte[] {'a', 'A'},
83          new byte[] {'r', 'R'},
84          new byte[] {'s', 'S'},
85          new byte[] {'e', 'E'},
86          new byte[] {'t', 'T'}
87      };
88  
89      private static final byte[] WHITESPACE = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E};
90      private static final byte[] COMMENT_END = {'-', '-', '>'};
91  
92      private static final byte[] XML_DECLARATION_PREFIX = "<?xml ".getBytes(US_ASCII);
93  
94      private static final byte[] CSS_CHARSET_DECLARATION_PREFIX = "@charset \"".getBytes(US_ASCII);
95  
96      /**
97       * The number of HTML bytes to sniff for encoding info embedded in <code>meta</code> tags;
98       */
99      private static final int SIZE_OF_HTML_CONTENT_SNIFFED = 1024;
100 
101     /**
102      * The number of XML bytes to sniff for encoding info embedded in the XML declaration;
103      * relatively small because it's always at the very beginning of the file.
104      */
105     private static final int SIZE_OF_XML_CONTENT_SNIFFED = 512;
106 
107     private static final int SIZE_OF_CSS_CONTENT_SNIFFED = 1024;
108 
109     /**
110      * Disallow instantiation of this class.
111      */
112     private EncodingSniffer() {
113         // Empty.
114     }
115 
116     /**
117      * Returns {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
118      * ends with one of the specified strings.
119      *
120      * @param headers the HTTP response headers
121      * @param contentTypeEndings the content type endings to search for
122      * @return {@code true} if the specified HTTP response headers contain a <code>Content-Type</code> that
123      *         ends with one of the specified strings
124      */
125     static boolean contentTypeEndsWith(final List<NameValuePair> headers, final String... contentTypeEndings) {
126         for (final NameValuePair pair : headers) {
127             final String name = pair.getName();
128             if (HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(name)) {
129                 String value = pair.getValue();
130                 final int i = value.indexOf(';');
131                 if (i != -1) {
132                     value = value.substring(0, i);
133                 }
134                 value = value.trim().toLowerCase(Locale.ROOT);
135                 for (final String ending : contentTypeEndings) {
136                     if (value.endsWith(ending.toLowerCase(Locale.ROOT))) {
137                         return true;
138                     }
139                 }
140                 return false;
141             }
142         }
143         return false;
144     }
145 
146     /**
147      * Attempts to sniff an encoding from a <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
148      * in the specified byte array.
149      *
150      * @param bytes the bytes to check for a Byte Order Mark
151      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
152      *         could not be determined
153      */
154     static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
155         if (bytes == null) {
156             return null;
157         }
158 
159         Charset encoding = null;
160         if (startsWith(bytes, ByteOrderMark.UTF_8)) {
161             encoding = UTF_8;
162         }
163         else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
164             encoding = UTF_16BE;
165         }
166         else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
167             encoding = UTF_16LE;
168         }
169 
170         if (encoding != null && LOG.isDebugEnabled()) {
171             LOG.debug("Encoding found in Unicode Byte Order Mark: '" + encoding + "'.");
172         }
173         return encoding;
174     }
175 
176     /**
177      * Returns whether the specified byte array starts with the given {@link ByteOrderMark}, or not.
178      * @param bytes the byte array to check
179      * @param bom the {@link ByteOrderMark}
180      * @return whether the specified byte array starts with the given {@link ByteOrderMark}, or not
181      */
182     private static boolean startsWith(final byte[] bytes, final ByteOrderMark bom) {
183         final byte[] bomBytes = bom.getBytes();
184         final byte[] firstBytes = Arrays.copyOfRange(bytes, 0, Math.min(bytes.length, bomBytes.length));
185         return Arrays.equals(firstBytes, bomBytes);
186     }
187 
188     /**
189      * Attempts to sniff an encoding from an HTML <code>meta</code> tag in the specified byte array.
190      *
191      * @param is the content stream to check for an HTML <code>meta</code> tag
192      * @return the encoding sniffed from the specified bytes, or {@code null} if the encoding
193      *         could not be determined
194      * @throws IOException if an IO error occurs
195      */
196     public static Charset sniffEncodingFromMetaTag(final InputStream is) throws IOException {
197         final byte[] bytes = read(is, SIZE_OF_HTML_CONTENT_SNIFFED);
198         for (int i = 0; i < bytes.length; i++) {
199             if (matches(bytes, i, COMMENT_START)) {
200                 i = indexOfSubArray(bytes, COMMENT_END, i);
201                 if (i == -1) {
202                     break;
203                 }
204                 i += 2;
205             }
206             else if (matches(bytes, i, META_START)) {
207                 i += META_START.length;
208                 for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
209                     i = att.getUpdatedIndex();
210                     final String name = att.getName().toLowerCase(Locale.ROOT);
211                     final String value = att.getValue().toLowerCase(Locale.ROOT);
212                     if ("charset".equals(name) || "content".equals(name)) {
213                         Charset charset = null;
214                         if ("charset".equals(name)) {
215                             charset = toCharset(value);
216                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
217                             if (charset == null && "x-user-defined".equals(value)) {
218                                 charset = Charset.forName("windows-1252");
219                             }
220                         }
221                         else if ("content".equals(name)) {
222                             charset = extractEncodingFromContentType(value);
223                             // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
224                             if (charset == null && value != null && value.contains("x-user-defined")) {
225                                 charset = Charset.forName("windows-1252");
226                             }
227                             if (charset == null) {
228                                 continue;
229                             }
230                         }
231                         if (UTF_16BE == charset || UTF_16LE == charset) {
232                             charset = UTF_8;
233                         }
234                         if (charset != null) {
235                             if (LOG.isDebugEnabled()) {
236                                 LOG.debug("Encoding found in meta tag: '" + charset + "'.");
237                             }
238                             return charset;
239                         }
240                     }
241                 }
242             }
243             else if (i + 1 < bytes.length && bytes[i] == '<' && Character.isLetter(bytes[i + 1])) {
244                 i = skipToAnyOf(bytes, i, WHITESPACE);
245                 if (i == -1) {
246                     break;
247                 }
248                 Attribute att = getAttribute(bytes, i);
249                 while (att != null) {
250                     i = att.getUpdatedIndex();
251                     att = getAttribute(bytes, i);
252                 }
253             }
254             else if (i + 2 < bytes.length && bytes[i] == '<' && bytes[i + 1] == '/' && Character.isLetter(bytes[i + 2])) {
255                 i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
256                 if (i == -1) {
257                     break;
258                 }
259                 Attribute attribute = getAttribute(bytes, i);
260                 while (attribute != null) {
261                     i = attribute.getUpdatedIndex();
262                     attribute = getAttribute(bytes, i);
263                 }
264             }
265             else if (matches(bytes, i, OTHER_START)) {
266                 i = skipToAnyOf(bytes, i, new byte[] {0x3E});
267                 if (i == -1) {
268                     break;
269                 }
270             }
271         }
272         return null;
273     }
274 
275     /**
276      * Extracts an attribute from the specified byte array, starting at the specified index, using the
277      * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#concept-get-attributes-when-sniffing">HTML5
278      * attribute algorithm</a>.
279      *
280      * @param bytes the byte array to extract an attribute from
281      * @param startFrom the index to start searching from
282      * @return the next attribute in the specified byte array, or {@code null} if one is not available
283      */
284     static Attribute getAttribute(final byte[] bytes, final int startFrom) {
285         if (startFrom >= bytes.length) {
286             return null;
287         }
288 
289         int pos = startFrom;
290         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x2F) {
291             pos++;
292             if (pos >= bytes.length) {
293                 return null;
294             }
295         }
296         if (bytes[pos] == '>') {
297             return null;
298         }
299         final StringBuilder name = new StringBuilder();
300         final StringBuilder value = new StringBuilder();
301         for ( ;; pos++) {
302             if (pos >= bytes.length) {
303                 return new Attribute(name.toString(), value.toString(), pos);
304             }
305             if (bytes[pos] == '=' && name.length() != 0) {
306                 pos++;
307                 break;
308             }
309             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
310                 while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
311                     pos++;
312                     if (pos >= bytes.length) {
313                         return new Attribute(name.toString(), value.toString(), pos);
314                     }
315                 }
316                 if (bytes[pos] != '=') {
317                     return new Attribute(name.toString(), value.toString(), pos);
318                 }
319                 pos++;
320                 break;
321             }
322             if (bytes[pos] == '/' || bytes[pos] == '>') {
323                 return new Attribute(name.toString(), value.toString(), pos);
324             }
325             name.append((char) bytes[pos]);
326         }
327         if (pos >= bytes.length) {
328             return new Attribute(name.toString(), value.toString(), pos);
329         }
330         while (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20) {
331             pos++;
332             if (pos >= bytes.length) {
333                 return new Attribute(name.toString(), value.toString(), pos);
334             }
335         }
336         if (bytes[pos] == '"' || bytes[pos] == '\'') {
337             final byte b = bytes[pos];
338             for (pos++; pos < bytes.length; pos++) {
339                 if (bytes[pos] == b) {
340                     pos++;
341                     return new Attribute(name.toString(), value.toString(), pos);
342                 }
343                 else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
344                     final byte b2 = (byte) (bytes[pos] + 0x20);
345                     value.append((char) b2);
346                 }
347                 else {
348                     value.append((char) bytes[pos]);
349                 }
350             }
351             return new Attribute(name.toString(), value.toString(), pos);
352         }
353         else if (bytes[pos] == '>') {
354             return new Attribute(name.toString(), value.toString(), pos);
355         }
356         else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
357             final byte b = (byte) (bytes[pos] + 0x20);
358             value.append((char) b);
359             pos++;
360         }
361         else {
362             value.append((char) bytes[pos]);
363             pos++;
364         }
365         for ( ; pos < bytes.length; pos++) {
366             if (bytes[pos] == 0x09 || bytes[pos] == 0x0A || bytes[pos] == 0x0C || bytes[pos] == 0x0D || bytes[pos] == 0x20 || bytes[pos] == 0x3E) {
367                 return new Attribute(name.toString(), value.toString(), pos);
368             }
369             else if (bytes[pos] >= 'A' && bytes[pos] <= 'Z') {
370                 final byte b = (byte) (bytes[pos] + 0x20);
371                 value.append((char) b);
372             }
373             else {
374                 value.append((char) bytes[pos]);
375             }
376         }
377         return new Attribute(name.toString(), value.toString(), pos);
378     }
379 
380     /**
381      * Extracts an encoding from the specified <code>Content-Type</code> value using
382      * <a href="http://ietfreport.isoc.org/idref/draft-abarth-mime-sniff/">the IETF algorithm</a>; if
383      * no encoding is found, this method returns {@code null}.
384      *
385      * @param s the <code>Content-Type</code> value to search for an encoding
386      * @return the encoding found in the specified <code>Content-Type</code> value, or {@code null} if no
387      *         encoding was found
388      */
389     public static Charset extractEncodingFromContentType(final String s) {
390         if (s == null) {
391             return null;
392         }
393         final byte[] bytes = s.getBytes(US_ASCII);
394         int i;
395         for (i = 0; i < bytes.length; i++) {
396             if (matches(bytes, i, CHARSET_START)) {
397                 i += CHARSET_START.length;
398                 break;
399             }
400         }
401         if (i == bytes.length) {
402             return null;
403         }
404         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20) {
405             i++;
406             if (i == bytes.length) {
407                 return null;
408             }
409         }
410         if (bytes[i] != '=') {
411             return null;
412         }
413         do {
414             i++;
415             if (i == bytes.length) {
416                 return null;
417             }
418         }
419         while (bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0C || bytes[i] == 0x0D || bytes[i] == 0x20);
420 
421         if (bytes[i] == '"') {
422             if (bytes.length <= i + 1) {
423                 return null;
424             }
425             final int index = ArrayUtils.indexOf(bytes, (byte) '"', i + 1);
426             if (index == -1) {
427                 return null;
428             }
429             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
430             return toCharset(charsetName);
431         }
432         if (bytes[i] == '\'') {
433             if (bytes.length <= i + 1) {
434                 return null;
435             }
436             final int index = ArrayUtils.indexOf(bytes, (byte) '\'', i + 1);
437             if (index == -1) {
438                 return null;
439             }
440             final String charsetName = new String(ArrayUtils.subarray(bytes, i + 1, index), US_ASCII);
441             return toCharset(charsetName);
442         }
443         int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
444         if (end == -1) {
445             end = bytes.length;
446         }
447         final String charsetName = new String(ArrayUtils.subarray(bytes, i, end), US_ASCII);
448         return toCharset(charsetName);
449     }
450 
451     /**
452      * Searches the specified XML content for an XML declaration and returns the encoding if found,
453      * otherwise returns {@code null}.
454      *
455      * @param is the content stream to check for the charset declaration
456      * @return the encoding of the specified XML content, or {@code null} if it could not be determined
457      * @throws IOException if an IO error occurs
458      */
459     public static Charset sniffEncodingFromXmlDeclaration(final InputStream is) throws IOException {
460         final byte[] bytes = read(is, SIZE_OF_XML_CONTENT_SNIFFED);
461         Charset encoding = null;
462         if (bytes.length > 5
463                 && XML_DECLARATION_PREFIX[0] == bytes[0]
464                 && XML_DECLARATION_PREFIX[1] == bytes[1]
465                 && XML_DECLARATION_PREFIX[2] == bytes[2]
466                 && XML_DECLARATION_PREFIX[3] == bytes[3]
467                 && XML_DECLARATION_PREFIX[4] == bytes[4]
468                 && XML_DECLARATION_PREFIX[5] == bytes[5]) {
469             final int index = ArrayUtils.indexOf(bytes, (byte) '?', 2);
470             if (index + 1 < bytes.length && bytes[index + 1] == '>') {
471                 final String declaration = new String(bytes, 0, index + 2, US_ASCII);
472                 int start = declaration.indexOf("encoding");
473                 if (start != -1) {
474                     start += 8;
475                     final char delimiter;
476                 outer:
477                     while (true) {
478                         switch (declaration.charAt(start)) {
479                             case '"':
480                             case '\'':
481                                 delimiter = declaration.charAt(start);
482                                 start = start + 1;
483                                 break outer;
484 
485                             default:
486                                 start++;
487                         }
488                     }
489                     final int end = declaration.indexOf(delimiter, start);
490                     encoding = toCharset(declaration.substring(start, end));
491                 }
492             }
493         }
494         if (encoding != null && LOG.isDebugEnabled()) {
495             LOG.debug("Encoding found in XML declaration: '" + encoding + "'.");
496         }
497         return encoding;
498     }
499 
500     /**
501      * Parses and returns the charset declaration at the start of a css file if any, otherwise returns {@code null}.
502      * <p>e.g. <pre>@charset "UTF-8"</pre>
503      *
504      * @param is the input stream to parse
505      * @return the charset declaration at the start of a css file if any, otherwise returns {@code null}.
506      * @throws IOException if an IO error occurs
507      */
508     public static Charset sniffEncodingFromCssDeclaration(final InputStream is) throws IOException {
509         final byte[] bytes = read(is, SIZE_OF_CSS_CONTENT_SNIFFED);
510         if (bytes.length < CSS_CHARSET_DECLARATION_PREFIX.length) {
511             return null;
512         }
513         for (int i = 0; i < CSS_CHARSET_DECLARATION_PREFIX.length; i++) {
514             if (bytes[i] != CSS_CHARSET_DECLARATION_PREFIX[i]) {
515                 return null;
516             }
517         }
518 
519         Charset encoding = null;
520         final int index = ArrayUtils.indexOf(bytes, (byte) '"', CSS_CHARSET_DECLARATION_PREFIX.length);
521         if (index + 1 < bytes.length && bytes[index + 1] == ';') {
522             encoding = toCharset(new String(bytes, CSS_CHARSET_DECLARATION_PREFIX.length, index - CSS_CHARSET_DECLARATION_PREFIX.length, US_ASCII));
523             // https://www.w3.org/TR/css-syntax-3/#input-byte-stream "Why use utf-8 when the declaration says utf-16?"
524             if (encoding == UTF_16BE || encoding == UTF_16LE) {
525                 encoding = UTF_8;
526             }
527         }
528         return encoding;
529     }
530 
531     /**
532      * Returns {@code Charset} if the specified charset name is supported on this platform.
533      *
534      * @param charsetName the charset name to check
535      * @return {@code Charset} if the specified charset name is supported on this platform
536      */
537     public static Charset toCharset(final String charsetName) {
538         final String nameFromLabel = translateEncodingLabel(charsetName);
539         if (nameFromLabel == null) {
540             return null;
541         }
542         try {
543             return Charset.forName(nameFromLabel);
544         }
545         catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
546             return null;
547         }
548     }
549 
550     /**
551      * Returns {@code true} if the byte in the specified byte array at the specified index matches one of the
552      * specified byte array patterns.
553      *
554      * @param bytes the byte array to search in
555      * @param i the index at which to search
556      * @param sought the byte array patterns to search for
557      * @return {@code true} if the byte in the specified byte array at the specified index matches one of the
558      *         specified byte array patterns
559      */
560     static boolean matches(final byte[] bytes, final int i, final byte[][] sought) {
561         if (i + sought.length > bytes.length) {
562             return false;
563         }
564         for (int x = 0; x < sought.length; x++) {
565             final byte[] possibilities = sought[x];
566             boolean match = false;
567             for (final byte possibility : possibilities) {
568                 if (bytes[i + x] == possibility) {
569                     match = true;
570                     break;
571                 }
572             }
573             if (!match) {
574                 return false;
575             }
576         }
577         return true;
578     }
579 
580     /**
581      * Skips ahead to the first occurrence of the specified targets within the specified array,
582      * starting at the specified index. This method returns <code>-1</code> if none of the targets are found.
583      *
584      * @param bytes the array to search through
585      * @param startFrom the index to start looking at
586      * @param targets the targets to search for
587      * @return the index of the first occurrence of the specified targets within the specified array
588      */
589     static int skipToAnyOf(final byte[] bytes, final int startFrom, final byte[] targets) {
590         int i = startFrom;
591         for ( ; i < bytes.length; i++) {
592             if (ArrayUtils.contains(targets, bytes[i])) {
593                 break;
594             }
595         }
596         if (i == bytes.length) {
597             i = -1;
598         }
599         return i;
600     }
601 
602     /**
603      * Finds the first index of the specified sub-array inside the specified array, starting at the
604      * specified index. This method returns <code>-1</code> if the specified sub-array cannot be found.
605      *
606      * @param array the array to traverse for looking for the sub-array
607      * @param subarray the sub-array to find
608      * @param startIndex the start index to traverse forwards from
609      * @return the index of the sub-array within the array
610      */
611     static int indexOfSubArray(final byte[] array, final byte[] subarray, final int startIndex) {
612         for (int i = startIndex; i < array.length; i++) {
613             boolean found = true;
614             if (i + subarray.length > array.length) {
615                 break;
616             }
617             for (int j = 0; j < subarray.length; j++) {
618                 final byte a = array[i + j];
619                 final byte b = subarray[j];
620                 if (a != b) {
621                     found = false;
622                     break;
623                 }
624             }
625             if (found) {
626                 return i;
627             }
628         }
629         return -1;
630     }
631 
632     /**
633      * Attempts to read <code>size</code> bytes from the specified input stream. Note that this method is not guaranteed
634      * to be able to read <code>size</code> bytes; however, the returned byte array will always be the exact length of the
635      * number of bytes read.
636      *
637      * @param content the input stream to read from
638      * @param size the number of bytes to try to read
639      * @return the bytes read from the specified input stream
640      * @throws IOException if an IO error occurs
641      */
642     static byte[] read(final InputStream content, final int size) throws IOException {
643         byte[] bytes = new byte[size];
644         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
645         // this may not always be the case for subclasses of InputStream - e.g. GZIPInputStream
646         final int count = IOUtils.read(content, bytes);
647         if (count < size) {
648             final byte[] smaller = new byte[count];
649             System.arraycopy(bytes, 0, smaller, 0, count);
650             bytes = smaller;
651         }
652         return bytes;
653     }
654 
655     /**
656      * Attempts to read <code>size</code> bytes from the specified input stream and then prepends the specified prefix to
657      * the bytes read, returning the resultant byte array. Note that this method is not guaranteed to be able to read
658      * <code>size</code> bytes; however, the returned byte array will always be the exact length of the number of bytes
659      * read plus the length of the prefix array.
660      *
661      * @param content the input stream to read from
662      * @param size the number of bytes to try to read
663      * @param prefix the byte array to prepend to the bytes read from the specified input stream
664      * @return the bytes read from the specified input stream, prefixed by the specified prefix
665      * @throws IOException if an IO error occurs
666      */
667     static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
668         final int prefixLength = prefix.length;
669         final byte[] joined = new byte[prefixLength + size];
670 
671         // using IOUtils guarantees that it will read as many bytes as possible before giving up;
672         // this may not always be the case for subclasses of InputStream - e.g. GZIPInputStream
673         final int count = IOUtils.read(content, joined, prefixLength, joined.length - prefixLength);
674         if (count < size) {
675             final byte[] smaller = new byte[prefixLength + count];
676             System.arraycopy(prefix, 0, smaller, 0, prefix.length);
677             System.arraycopy(joined, prefixLength, smaller, prefixLength, count);
678             return smaller;
679         }
680 
681         System.arraycopy(prefix, 0, joined, 0, prefix.length);
682         return joined;
683     }
684 
685     static class Attribute {
686         private final String name_;
687         private final String value_;
688         private final int updatedIndex_;
689         Attribute(final String name, final String value, final int updatedIndex) {
690             name_ = name;
691             value_ = value;
692             updatedIndex_ = updatedIndex;
693         }
694         String getName() {
695             return name_;
696         }
697         String getValue() {
698             return value_;
699         }
700         int getUpdatedIndex() {
701             return updatedIndex_;
702         }
703     }
704 
705     /**
706      * Translates the given encoding label into a normalized form
707      * according to <a href="http://encoding.spec.whatwg.org/#encodings">Reference</a>.
708      * @param encodingLabel the label to translate
709      * @return the normalized encoding name or null if not found
710      */
711     public static String translateEncodingLabel(final String encodingLabel) {
712         if (StringUtils.isEmptyOrNull(encodingLabel)) {
713             return null;
714         }
715 
716         final String encLC = encodingLabel.toLowerCase(Locale.ROOT);
717         final String enc = StandardEncodingTranslator.INSTANCE.encodingNameFromLabel(encodingLabel);
718         if (encLC.equals(enc)) {
719             return encodingLabel;
720         }
721         return enc;
722     }
723 }