View Javadoc
1   /* Copyright 2015 Google Inc. All Rights Reserved.
2   
3      Distributed under MIT license.
4      See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5   */
6   
7   package org.htmlunit.util.brotli;
8   
9   import java.nio.ByteBuffer;
10  
11  /**
12   * Transformations on dictionary words.
13   *
14   * Transform descriptor is a triplet: {prefix, operator, suffix}.
15   * "prefix" and "suffix" are short strings inserted before and after transformed dictionary word.
16   * "operator" is applied to dictionary word itself.
17   *
18   * Some operators has "built-in" parameters, i.e. parameter is defined by operator ordinal. Other
19   * operators have "external" parameters, supplied via additional table encoded in shared dictionary.
20   *
21   * Operators:
22   *  - IDENTITY (0): dictionary word is inserted "as is"
23   *  - OMIT_LAST_N (1 - 9): last N octets of dictionary word are not inserted; N == ordinal
24   *  - OMIT_FIRST_M (12-20): first M octets of dictionary word are not inserted; M == ordinal - 11
25   *  - UPPERCASE_FIRST (10): first "scalar" is XOR'ed with number 32
26   *  - UPPERCASE_ALL (11): all "scalars" are XOR'ed with number 32
27   *  - SHIFT_FIRST (21): first "scalar" is shifted by number form parameter table
28   *  - SHIFT_ALL (22): all "scalar" is shifted by number form parameter table
29   *
30   * Here "scalar" is a variable length character coding similar to UTF-8 encoding.
31   * UPPERCASE_XXX / SHIFT_XXX operators were designed to change the case of UTF-8 encoded characters.
32   * While UPPERCASE_XXX works well only on ASCII charset, SHIFT is much more generic and could be
33   * used for most (all?) alphabets.
34   */
35  @SuppressWarnings("all")
36  final class Transform {
37  
38    static final class Transforms {
39      final int numTransforms;
40      final int[] triplets;
41      final byte[] prefixSuffixStorage;
42      final int[] prefixSuffixHeads;
43      final short[] params;
44  
45      Transforms(int numTransforms, int prefixSuffixLen, int prefixSuffixCount) {
46        this.numTransforms = numTransforms;
47        this.triplets = new int[numTransforms * 3];
48        this.params = new short[numTransforms];
49        this.prefixSuffixStorage = new byte[prefixSuffixLen];
50        this.prefixSuffixHeads = new int[prefixSuffixCount + 1];
51      }
52    }
53  
54    static final int NUM_RFC_TRANSFORMS = 121;
55    static final Transforms RFC_TRANSFORMS = new Transforms(NUM_RFC_TRANSFORMS, 167, 50);
56  
57    private static final int OMIT_FIRST_LAST_LIMIT = 9;
58  
59    private static final int IDENTITY = 0;
60    private static final int OMIT_LAST_BASE = IDENTITY + 1 - 1;  // there is no OMIT_LAST_0.
61    private static final int UPPERCASE_FIRST = OMIT_LAST_BASE + OMIT_FIRST_LAST_LIMIT + 1;
62    private static final int UPPERCASE_ALL = UPPERCASE_FIRST + 1;
63    private static final int OMIT_FIRST_BASE = UPPERCASE_ALL + 1 - 1;  // there is no OMIT_FIRST_0.
64    private static final int SHIFT_FIRST = OMIT_FIRST_BASE + OMIT_FIRST_LAST_LIMIT + 1;
65    private static final int SHIFT_ALL = SHIFT_FIRST + 1;
66  
67    // Bundle of 0-terminated strings.
68    private static final String PREFIX_SUFFIX_SRC = """
69        # #s #, #e #.# the #.com/# # of # and\
70         # in # to #"#">#
71        #]# for # a # that #. # with #'# from # by #. The # on # as # is #ing\
72         #
73        	#:#ed #(# at #ly #="# of the #. This #,# not #er #al #='#ful #ive #less #est #ize #\
74        ous #""";
75    private static final String TRANSFORMS_SRC = "     !! ! ,  *!  &!  \" !  ) *   * -  ! # !  #!*!  "
76        + "+  ,$ !  -  %  .  / #   0  1 .  \"   2  3!*   4%  ! # /   5  6  7  8 0  1 &   $   9 +   : "
77        + " ;  < '  !=  >  ?! 4  @ 4  2  &   A *# (   B  C& ) %  ) !*# *-% A +! *.  D! %'  & E *6  F "
78        + " G% ! *A *%  H! D  I!+!  J!+   K +- *4! A  L!*4  M  N +6  O!*% +.! K *G  P +%(  ! G *D +D "
79        + " Q +# *K!*G!+D!+# +G +A +4!+% +K!+4!*D!+K!*K";
80  
81    private static void unpackTransforms(byte[] prefixSuffix,
82        int[] prefixSuffixHeads, int[] transforms, String prefixSuffixSrc, String transformsSrc) {
83      final int[] prefixSuffixBytes = Utils.toUtf8Runes(prefixSuffixSrc);
84      final int n = prefixSuffixBytes.length;
85      int index = 1;
86      int j = 0;
87      for (int i = 0; i < n; ++i) {
88        final int c = prefixSuffixBytes[i];
89        if (c == 35) { // == #
90          prefixSuffixHeads[index++] = j;
91        } else {
92          prefixSuffix[j++] = (byte) c;
93        }
94      }
95  
96      for (int i = 0; i < NUM_RFC_TRANSFORMS * 3; ++i) {
97        transforms[i] = transformsSrc.charAt(i) - 32;
98      }
99    }
100 
101   static {
102     unpackTransforms(RFC_TRANSFORMS.prefixSuffixStorage, RFC_TRANSFORMS.prefixSuffixHeads,
103         RFC_TRANSFORMS.triplets, PREFIX_SUFFIX_SRC, TRANSFORMS_SRC);
104   }
105 
106   static int transformDictionaryWord(byte[] dst, int dstOffset, ByteBuffer src, int srcOffset,
107       int wordLen, Transforms transforms, int transformIndex) {
108     int offset = dstOffset;
109     final int[] triplets = transforms.triplets;
110     final byte[] prefixSuffixStorage = transforms.prefixSuffixStorage;
111     final int[] prefixSuffixHeads = transforms.prefixSuffixHeads;
112     final int transformOffset = 3 * transformIndex;
113     final int prefixIdx = triplets[transformOffset];
114     final int transformType = triplets[transformOffset + 1];
115     final int suffixIdx = triplets[transformOffset + 2];
116     int prefix = prefixSuffixHeads[prefixIdx];
117     final int prefixEnd = prefixSuffixHeads[prefixIdx + 1];
118     int suffix = prefixSuffixHeads[suffixIdx];
119     final int suffixEnd = prefixSuffixHeads[suffixIdx + 1];
120 
121     int omitFirst = transformType - OMIT_FIRST_BASE;
122     int omitLast = transformType - OMIT_LAST_BASE;
123     if (omitFirst < 1 || omitFirst > OMIT_FIRST_LAST_LIMIT) {
124       omitFirst = 0;
125     }
126     if (omitLast < 1 || omitLast > OMIT_FIRST_LAST_LIMIT) {
127       omitLast = 0;
128     }
129 
130     // Copy prefix.
131     while (prefix != prefixEnd) {
132       dst[offset++] = prefixSuffixStorage[prefix++];
133     }
134 
135     int len = wordLen;
136     // Copy trimmed word.
137     if (omitFirst > len) {
138       omitFirst = len;
139     }
140     int dictOffset = srcOffset + omitFirst;
141     len -= omitFirst;
142     len -= omitLast;
143     int i = len;
144     while (i > 0) {
145       dst[offset++] = src.get(dictOffset++);
146       i--;
147     }
148 
149     // Ferment.
150     if (transformType == UPPERCASE_FIRST || transformType == UPPERCASE_ALL) {
151       int uppercaseOffset = offset - len;
152       if (transformType == UPPERCASE_FIRST) {
153         len = 1;
154       }
155       while (len > 0) {
156         final int c0 = dst[uppercaseOffset] & 0xFF;
157         if (c0 < 0xC0) {
158           if (c0 >= 97 && c0 <= 122) { // in [a..z] range
159             dst[uppercaseOffset] = (byte) (dst[uppercaseOffset] ^ 32);
160           }
161           uppercaseOffset += 1;
162           len -= 1;
163         } else if (c0 < 0xE0) {
164           dst[uppercaseOffset + 1] = (byte) (dst[uppercaseOffset + 1] ^ 32);
165           uppercaseOffset += 2;
166           len -= 2;
167         } else {
168           dst[uppercaseOffset + 2] = (byte) (dst[uppercaseOffset + 2] ^ 5);
169           uppercaseOffset += 3;
170           len -= 3;
171         }
172       }
173     } else if (transformType == SHIFT_FIRST || transformType == SHIFT_ALL) {
174       int shiftOffset = offset - len;
175       final int param = transforms.params[transformIndex];
176       /* Limited sign extension: scalar < (1 << 24). */
177       int scalar = (param & 0x7FFF) + (0x1000000 - (param & 0x8000));
178       while (len > 0) {
179         int step = 1;
180         final int c0 = dst[shiftOffset] & 0xFF;
181         if (c0 < 0x80) {
182           /* 1-byte rune / 0sssssss / 7 bit scalar (ASCII). */
183           scalar += c0;
184           dst[shiftOffset] = (byte) (scalar & 0x7F);
185         } else if (c0 < 0xC0) {
186           /* Continuation / 10AAAAAA. */
187         } else if (c0 < 0xE0) {
188           /* 2-byte rune / 110sssss AAssssss / 11 bit scalar. */
189           if (len >= 2) {
190             final int c1 = dst[shiftOffset + 1];
191             scalar += (c1 & 0x3F) | ((c0 & 0x1F) << 6);
192             dst[shiftOffset] = (byte) (0xC0 | ((scalar >> 6) & 0x1F));
193             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | (scalar & 0x3F));
194             step = 2;
195           } else {
196             step = len;
197           }
198         } else if (c0 < 0xF0) {
199           /* 3-byte rune / 1110ssss AAssssss BBssssss / 16 bit scalar. */
200           if (len >= 3) {
201             final int c1 = dst[shiftOffset + 1];
202             final int c2 = dst[shiftOffset + 2];
203             scalar += (c2 & 0x3F) | ((c1 & 0x3F) << 6) | ((c0 & 0x0F) << 12);
204             dst[shiftOffset] = (byte) (0xE0 | ((scalar >> 12) & 0x0F));
205             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 6) & 0x3F));
206             dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | (scalar & 0x3F));
207             step = 3;
208           } else {
209             step = len;
210           }
211         } else if (c0 < 0xF8) {
212           /* 4-byte rune / 11110sss AAssssss BBssssss CCssssss / 21 bit scalar. */
213           if (len >= 4) {
214             final int c1 = dst[shiftOffset + 1];
215             final int c2 = dst[shiftOffset + 2];
216             final int c3 = dst[shiftOffset + 3];
217             scalar += (c3 & 0x3F) | ((c2 & 0x3F) << 6) | ((c1 & 0x3F) << 12) | ((c0 & 0x07) << 18);
218             dst[shiftOffset] = (byte) (0xF0 | ((scalar >> 18) & 0x07));
219             dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 12) & 0x3F));
220             dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | ((scalar >> 6) & 0x3F));
221             dst[shiftOffset + 3] = (byte) ((c3 & 0xC0) | (scalar & 0x3F));
222             step = 4;
223           } else {
224             step = len;
225           }
226         }
227         shiftOffset += step;
228         len -= step;
229         if (transformType == SHIFT_FIRST) {
230           len = 0;
231         }
232       }
233     }
234 
235     // Copy suffix.
236     while (suffix != suffixEnd) {
237       dst[offset++] = prefixSuffixStorage[suffix++];
238     }
239 
240     return offset - dstOffset;
241   }
242 }