View Javadoc
1   /* Copyright 2015 Google Inc. All Rights Reserved.
2   
3      Distributed under MIT license.
4      See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5   */
6   
7   package org.htmlunit.util.brotli;
8   
9   import java.nio.ByteBuffer;
10  
11  /**
12   * Transformations on dictionary words.
13   *
14   * Transform descriptor is a triplet: {prefix, operator, suffix}.
15   * "prefix" and "suffix" are short strings inserted before and after transformed dictionary word.
16   * "operator" is applied to dictionary word itself.
17   *
18   * Some operators has "built-in" parameters, i.e. parameter is defined by operator ordinal. Other
19   * operators have "external" parameters, supplied via additional table encoded in shared dictionary.
20   *
21   * Operators:
22   *  - IDENTITY (0): dictionary word is inserted "as is"
23   *  - OMIT_LAST_N (1 - 9): last N octets of dictionary word are not inserted; N == ordinal
24   *  - OMIT_FIRST_M (12-20): first M octets of dictionary word are not inserted; M == ordinal - 11
25   *  - UPPERCASE_FIRST (10): first "scalar" is XOR'ed with number 32
26   *  - UPPERCASE_ALL (11): all "scalars" are XOR'ed with number 32
27   *  - SHIFT_FIRST (21): first "scalar" is shifted by number form parameter table
28   *  - SHIFT_ALL (22): all "scalar" is shifted by number form parameter table
29   *
30   * Here "scalar" is a variable length character coding similar to UTF-8 encoding.
31   * UPPERCASE_XXX / SHIFT_XXX operators were designed to change the case of UTF-8 encoded characters.
32   * While UPPERCASE_XXX works well only on ASCII charset, SHIFT is much more generic and could be
33   * used for most (all?) alphabets.
34   */
35  @SuppressWarnings("all")
36  final class Transform {
37  
38    static final class Transforms {
39      final int numTransforms;
40      final int[] triplets;
41      final byte[] prefixSuffixStorage;
42      final int[] prefixSuffixHeads;
43      final short[] params;
44  
45      Transforms(int numTransforms, int prefixSuffixLen, int prefixSuffixCount) {
46        this.numTransforms = numTransforms;
47        this.triplets = new int[numTransforms * 3];
48        this.params = new short[numTransforms];
49        this.prefixSuffixStorage = new byte[prefixSuffixLen];
50        this.prefixSuffixHeads = new int[prefixSuffixCount + 1];
51      }
52    }
53  
54    static final int NUM_RFC_TRANSFORMS = 121;
55    static final Transforms RFC_TRANSFORMS = new Transforms(NUM_RFC_TRANSFORMS, 167, 50);
56  
57    private static final int OMIT_FIRST_LAST_LIMIT = 9;
58  
59    private static final int IDENTITY = 0;
60    private static final int OMIT_LAST_BASE = IDENTITY + 1 - 1;  // there is no OMIT_LAST_0.
61    private static final int UPPERCASE_FIRST = OMIT_LAST_BASE + OMIT_FIRST_LAST_LIMIT + 1;
62    private static final int UPPERCASE_ALL = UPPERCASE_FIRST + 1;
63    private static final int OMIT_FIRST_BASE = UPPERCASE_ALL + 1 - 1;  // there is no OMIT_FIRST_0.
64    private static final int SHIFT_FIRST = OMIT_FIRST_BASE + OMIT_FIRST_LAST_LIMIT + 1;
65    private static final int SHIFT_ALL = SHIFT_FIRST + 1;
66  
67    // Bundle of 0-terminated strings.
68    private static final String PREFIX_SUFFIX_SRC = "# #s #, #e #.# the #.com/#\u00C2\u00A0# of # and"
69        + " # in # to #\"#\">#\n#]# for # a # that #. # with #'# from # by #. The # on # as # is #ing"
70        + " #\n\t#:#ed #(# at #ly #=\"# of the #. This #,# not #er #al #='#ful #ive #less #est #ize #"
71        + "ous #";
72    private static final String TRANSFORMS_SRC = "     !! ! ,  *!  &!  \" !  ) *   * -  ! # !  #!*!  "
73        + "+  ,$ !  -  %  .  / #   0  1 .  \"   2  3!*   4%  ! # /   5  6  7  8 0  1 &   $   9 +   : "
74        + " ;  < '  !=  >  ?! 4  @ 4  2  &   A *# (   B  C& ) %  ) !*# *-% A +! *.  D! %'  & E *6  F "
75        + " G% ! *A *%  H! D  I!+!  J!+   K +- *4! A  L!*4  M  N +6  O!*% +.! K *G  P +%(  ! G *D +D "
76        + " Q +# *K!*G!+D!+# +G +A +4!+% +K!+4!*D!+K!*K";
77  
78    private static void unpackTransforms(byte[] prefixSuffix,
79        int[] prefixSuffixHeads, int[] transforms, String prefixSuffixSrc, String transformsSrc) {
80      final int[] prefixSuffixBytes = Utils.toUtf8Runes(prefixSuffixSrc);
81      final int n = prefixSuffixBytes.length;
82      int index = 1;
83      int j = 0;
84      for (int i = 0; i < n; ++i) {
85        final int c = prefixSuffixBytes[i];
86        if (c == 35) { // == #
87          prefixSuffixHeads[index++] = j;
88        } else {
89          prefixSuffix[j++] = (byte) c;
90        }
91      }
92  
93      for (int i = 0; i < NUM_RFC_TRANSFORMS * 3; ++i) {
94        transforms[i] = transformsSrc.charAt(i) - 32;
95      }
96    }
97  
98    static {
99      unpackTransforms(RFC_TRANSFORMS.prefixSuffixStorage, RFC_TRANSFORMS.prefixSuffixHeads,
100         RFC_TRANSFORMS.triplets, PREFIX_SUFFIX_SRC, TRANSFORMS_SRC);
101   }
102 
103   static int transformDictionaryWord(byte[] dst, int dstOffset, ByteBuffer src, int srcOffset,
104       int wordLen, Transforms transforms, int transformIndex) {
105     int offset = dstOffset;
106     final int[] triplets = transforms.triplets;
107     final byte[] prefixSuffixStorage = transforms.prefixSuffixStorage;
108     final int[] prefixSuffixHeads = transforms.prefixSuffixHeads;
109     final int transformOffset = 3 * transformIndex;
110     final int prefixIdx = triplets[transformOffset];
111     final int transformType = triplets[transformOffset + 1];
112     final int suffixIdx = triplets[transformOffset + 2];
113     int prefix = prefixSuffixHeads[prefixIdx];
114     final int prefixEnd = prefixSuffixHeads[prefixIdx + 1];
115     int suffix = prefixSuffixHeads[suffixIdx];
116     final int suffixEnd = prefixSuffixHeads[suffixIdx + 1];
117 
118     int omitFirst = transformType - OMIT_FIRST_BASE;
119     int omitLast = transformType - OMIT_LAST_BASE;
120     if (omitFirst < 1 || omitFirst > OMIT_FIRST_LAST_LIMIT) {
121       omitFirst = 0;
122     }
123     if (omitLast < 1 || omitLast > OMIT_FIRST_LAST_LIMIT) {
124       omitLast = 0;
125     }
126 
127     // Copy prefix.
128     while (prefix != prefixEnd) {
129       dst[offset++] = prefixSuffixStorage[prefix++];
130     }
131 
132     int len = wordLen;
133     // Copy trimmed word.
134     if (omitFirst > len) {
135       omitFirst = len;
136     }
137     int dictOffset = srcOffset + omitFirst;
138     len -= omitFirst;
139     len -= omitLast;
140     int i = len;
141     while (i > 0) {
142       dst[offset++] = src.get(dictOffset++);
143       i--;
144     }
145 
146     // Ferment.
147     if (transformType == UPPERCASE_FIRST || transformType == UPPERCASE_ALL) {
148       int uppercaseOffset = offset - len;
149       if (transformType == UPPERCASE_FIRST) {
150         len = 1;
151       }
152       while (len > 0) {
153         final int c0 = dst[uppercaseOffset] & 0xFF;
154         if (c0 < 0xC0) {
155           if (c0 >= 97 && c0 <= 122) { // in [a..z] range
156             dst[uppercaseOffset] = (byte) (dst[uppercaseOffset] ^ 32);
157           }
158           uppercaseOffset += 1;
159           len -= 1;
160         } else if (c0 < 0xE0) {
161           dst[uppercaseOffset + 1] = (byte) (dst[uppercaseOffset + 1] ^ 32);
162           uppercaseOffset += 2;
163           len -= 2;
164         } else {
165           dst[uppercaseOffset + 2] = (byte) (dst[uppercaseOffset + 2] ^ 5);
166           uppercaseOffset += 3;
167           len -= 3;
168         }
169       }
170     } else if (transformType == SHIFT_FIRST || transformType == SHIFT_ALL) {
171       int shiftOffset = offset - len;
172       final int param = transforms.params[transformIndex];
173       /* Limited sign extension: scalar < (1 << 24). */
174       int scalar = (param & 0x7FFF) + (0x1000000 - (param & 0x8000));
175       while (len > 0) {
176         int step = 1;
177         final int c0 = dst[shiftOffset] & 0xFF;
178         if (c0 < 0x80) {
179           /* 1-byte rune / 0sssssss / 7 bit scalar (ASCII). */
180           scalar += c0;
181           dst[shiftOffset] = (byte) (scalar & 0x7F);
182         } else if (c0 < 0xC0) {
183           /* Continuation / 10AAAAAA. */
184         } else if (c0 < 0xE0) {
185           /* 2-byte rune / 110sssss AAssssss / 11 bit scalar. */
186           if (len >= 2) {
187             final int c1 = dst[shiftOffset + 1];
188             scalar += (c1 & 0x3F) | ((c0 & 0x1F) << 6);
189             dst[shiftOffset] = (byte) (0xC0 | ((scalar >> 6) & 0x1F));
190             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | (scalar & 0x3F));
191             step = 2;
192           } else {
193             step = len;
194           }
195         } else if (c0 < 0xF0) {
196           /* 3-byte rune / 1110ssss AAssssss BBssssss / 16 bit scalar. */
197           if (len >= 3) {
198             final int c1 = dst[shiftOffset + 1];
199             final int c2 = dst[shiftOffset + 2];
200             scalar += (c2 & 0x3F) | ((c1 & 0x3F) << 6) | ((c0 & 0x0F) << 12);
201             dst[shiftOffset] = (byte) (0xE0 | ((scalar >> 12) & 0x0F));
202             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 6) & 0x3F));
203             dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | (scalar & 0x3F));
204             step = 3;
205           } else {
206             step = len;
207           }
208         } else if (c0 < 0xF8) {
209           /* 4-byte rune / 11110sss AAssssss BBssssss CCssssss / 21 bit scalar. */
210           if (len >= 4) {
211             final int c1 = dst[shiftOffset + 1];
212             final int c2 = dst[shiftOffset + 2];
213             final int c3 = dst[shiftOffset + 3];
214             scalar += (c3 & 0x3F) | ((c2 & 0x3F) << 6) | ((c1 & 0x3F) << 12) | ((c0 & 0x07) << 18);
215             dst[shiftOffset] = (byte) (0xF0 | ((scalar >> 18) & 0x07));
216             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 12) & 0x3F));
217             dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | ((scalar >> 6) & 0x3F));
218             dst[shiftOffset + 3] = (byte) ((c3 & 0xC0) | (scalar & 0x3F));
219             step = 4;
220           } else {
221             step = len;
222           }
223         }
224         shiftOffset += step;
225         len -= step;
226         if (transformType == SHIFT_FIRST) {
227           len = 0;
228         }
229       }
230     }
231 
232     // Copy suffix.
233     while (suffix != suffixEnd) {
234       dst[offset++] = prefixSuffixStorage[suffix++];
235     }
236 
237     return offset - dstOffset;
238   }
239 }