View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.javascript.regexp;
16  
17  import java.util.ArrayDeque;
18  import java.util.ArrayList;
19  import java.util.Deque;
20  import java.util.HashMap;
21  import java.util.List;
22  
23  /**
24   * Translates JavaScript RegExp to Java RegExp.<br>
25   * // [...\b...] to [...\cH...]
26   * // [...[...] to [...\[...]
27   * // [^\\1] to .
28   * // back reference in character classes are simply ignored by browsers [...ab\5cd...] to [...abcd...]
29   * // characters escaped without need should be "un-escaped"
30   * Escape curly braces that are not used in an expression
31   * like "{n}", "{n,}" or "{n,m}" (where n and m are positive integers).
32   *
33   * @author Ronald Brill
34   * @author Leszek Hoppe
35   * @author Atsushi Nakagawa
36   * @author Lai Quang Duong
37   */
38  public class RegExpJsToJavaConverter {
39  
40      private static final String DIGITS = "0123456789";
41      private static final HashMap<String, String> UNICODE_ESCAPES;
42  
43      private Tape tape_;
44      private boolean insideCharClass_;
45      private boolean insideRepetition_;
46      private Deque<Subexpresion> parsingSubexpressions_;
47      private List<Subexpresion> subexpressions_;
48  
49      static {
50          UNICODE_ESCAPES = new HashMap<>();
51          UNICODE_ESCAPES.put("L", "L");
52          UNICODE_ESCAPES.put("LETTER", "L");
53          UNICODE_ESCAPES.put("LU", "Lu");
54          UNICODE_ESCAPES.put("UPPERCASELETTER", "Lu");
55          UNICODE_ESCAPES.put("LL", "Ll");
56          UNICODE_ESCAPES.put("LOWERCASELETTER", "Ll");
57      }
58  
59      /**
60       * Helper to encapsulate the transformations.
61       */
62      private static class Tape {
63          private final StringBuilder tape_;
64          private int currentPos_;
65  
66          /**
67           * Wraps a JavaScript RegExp to access it char by char like a tape.
68           *
69           * @param input the Javascript RegExp
70           */
71          Tape(final String input) {
72              currentPos_ = 0;
73              tape_ = new StringBuilder(input);
74          }
75  
76          /**
77           * Moves the current read position by offset.
78           * @param offset the move position offset
79           */
80          public void move(final int offset) {
81              currentPos_ += offset;
82          }
83  
84          /**
85           * Reads the character at the current position and
86           * moves the read position by 1.
87           *
88           * @return the character at current position
89           */
90          public int read() {
91              if (currentPos_ < 0) {
92                  return -1;
93              }
94              if (currentPos_ >= tape_.length()) {
95                  return -1;
96              }
97  
98              return tape_.charAt(currentPos_++);
99          }
100 
101         /**
102          * Inserts a string at the current position + offset.
103          * @param token the string to insert
104          * @param offset the move position offset
105          */
106         public void insert(final String token, final int offset) {
107             tape_.insert(currentPos_ + offset, token);
108             currentPos_ += token.length();
109         }
110 
111         /**
112          * Inserts a string at the given pos.
113          * @param token the string to insert
114          * @param pos the move position offset
115          */
116         public void insertAt(final String token, final int pos) {
117             tape_.insert(pos, token);
118             currentPos_ += token.length();
119         }
120 
121         /**
122          * Replaces the current char with the given string.
123          * @param token the string to insert
124          */
125         public void replace(final int count, final String token) {
126             tape_.replace(currentPos_, currentPos_ + count, token);
127             currentPos_ += token.length();
128         }
129 
130         /**
131          * Removes number of chars from the given string.
132          * @param count the number of chars to remove
133          */
134         public void remove(final int count) {
135             tape_.delete(currentPos_, currentPos_ + count);
136         }
137 
138         /**
139          * Read the whole tape content.
140          *
141          * @return tape content
142          */
143         @Override
144         public String toString() {
145             return tape_.toString();
146         }
147     }
148 
149     private static final class Subexpresion {
150         private boolean closed_;
151         private boolean optional_;
152         private final boolean enhanced_;
153         private int start_;
154         private int end_;
155 
156         Subexpresion() {
157             closed_ = false;
158             optional_ = false;
159             enhanced_ = false;
160             start_ = -1;
161             end_ = -1;
162         }
163     }
164 
165     /**
166      * Run the state machine on a given input string.
167      *
168      * @param input
169      *            the js regexp to process
170      * @return a valid java regex pattern
171      */
172     public String convert(final String input) {
173         tape_ = new Tape(input);
174         insideCharClass_ = false;
175         insideRepetition_ = false;
176 
177         parsingSubexpressions_ = new ArrayDeque<>();
178         subexpressions_ = new ArrayList<>();
179 
180         int current = tape_.read();
181         while (current > -1) {
182             if ('\\' == current) {
183                 processEscapeSequence();
184             }
185             else if ('[' == current) {
186                 processCharClassStart();
187             }
188             else if (']' == current) {
189                 processCharClassEnd();
190             }
191             else if ('{' == current) {
192                 processRepetitionStart();
193             }
194             else if ('}' == current) {
195                 processRepetitionEnd();
196             }
197             else if ('(' == current) {
198                 processSubExpressionStart();
199             }
200             else if (')' == current) {
201                 processSubExpressionEnd();
202             }
203 
204             // process next
205             current = tape_.read();
206         }
207 
208         return tape_.toString();
209     }
210 
211     private void processCharClassStart() {
212         if (insideCharClass_) {
213             tape_.insert("\\", -1);
214         }
215         else {
216             insideCharClass_ = true;
217 
218             int next = tape_.read();
219             if (next < 0) {
220                 tape_.insert("\\", -1);
221                 return;
222             }
223             if ('^' == next) {
224                 // [^\2]
225                 next = tape_.read();
226                 if (next < 0) {
227                     tape_.insert("\\", -2);
228                     return;
229                 }
230                 if ('\\' == next) {
231                     next = tape_.read();
232                     if (DIGITS.indexOf(next) < 0) {
233                         tape_.move(-2);
234                         return;
235                     }
236                     // if it was a back reference then we have to check more
237                     if (handleBackReferenceOrOctal(next)) {
238                         next = tape_.read();
239                         if (']' == next) {
240                             tape_.move(-3);
241                             tape_.replace(3, ".");
242                             insideCharClass_ = false;
243                         }
244                     }
245                 }
246                 else if (']' == next) {
247                     // [^]
248                     tape_.move(-3);
249                     tape_.replace(3, "(?s:.)");
250                     insideCharClass_ = false;
251                 }
252                 else {
253                     tape_.move(-1);
254                 }
255             }
256             else if (']' == next) {
257                 // []
258                 tape_.move(-2);
259                 tape_.replace(2, "(?!)");
260                 insideCharClass_ = false;
261             }
262             else {
263                 tape_.move(-1);
264             }
265         }
266     }
267 
268     private void processCharClassEnd() {
269         insideCharClass_ = false;
270     }
271 
272     private void processRepetitionStart() {
273         final int next = tape_.read();
274         if (next < 0) {
275             tape_.insert("\\", -1);
276             return;
277         }
278 
279         if (DIGITS.indexOf(next) > -1) {
280             insideRepetition_ = true;
281         }
282         else {
283             tape_.insert("\\", -2);
284             tape_.move(-1);
285         }
286     }
287 
288     private void processRepetitionEnd() {
289         if (insideRepetition_) {
290             insideRepetition_ = false;
291             return;
292         }
293 
294         tape_.insert("\\", -1);
295     }
296 
297     private void processSubExpressionStart() {
298         int next = tape_.read();
299         if (next < 0) {
300             return;
301         }
302 
303         if ('?' != next) {
304             final Subexpresion sub = new Subexpresion();
305             sub.start_ = tape_.currentPos_;
306             parsingSubexpressions_.push(sub);
307             subexpressions_.add(sub);
308 
309             tape_.move(-1);
310             return;
311         }
312 
313         next = tape_.read();
314         if (next < 0) {
315             return;
316         }
317         if (':' != next) {
318             final Subexpresion sub = new Subexpresion();
319             sub.start_ = tape_.currentPos_;
320             parsingSubexpressions_.push(sub);
321             subexpressions_.add(sub);
322 
323             tape_.move(-1);
324             return;
325         }
326 
327         final Subexpresion sub = new Subexpresion();
328         sub.start_ = tape_.currentPos_;
329         parsingSubexpressions_.push(sub);
330     }
331 
332     private void processSubExpressionEnd() {
333         if (parsingSubexpressions_.isEmpty()) {
334             return;
335         }
336         final Subexpresion sub = parsingSubexpressions_.pop();
337         sub.closed_ = true;
338         sub.end_ = tape_.currentPos_;
339 
340         final int next = tape_.read();
341         sub.optional_ = '?' == next;
342         tape_.move(-1);
343     }
344 
345     private void processEscapeSequence() {
346         final int escapeSequence = tape_.read();
347         if (escapeSequence < 0) {
348             return;
349         }
350 
351         if ('x' == escapeSequence) {
352             // Hex code (e.g. \x00)
353             // Read the two char hex code
354             tape_.move(2);
355             return;
356         }
357 
358         if ('u' == escapeSequence) {
359             int next = tape_.read();
360             if (next > -1) {
361                 if (next == '{') {
362                     final int uPos = tape_.currentPos_ - 2;
363                     // Unicode point escapes
364                     do {
365                         next = tape_.read();
366                     }
367                     while (next > -1 && next != '}');
368                     if (next == '}') {
369                         tape_.tape_.replace(uPos, uPos + 1, "x");
370                     }
371                     return;
372                 }
373 
374                 // Unicode (e.g. \u0009)
375                 // we have nothing to convert here
376             }
377 
378             return;
379         }
380 
381         // Unicode property escape
382         if ('p' == escapeSequence) {
383             int next = tape_.read();
384             if (next > -1) {
385                 if (next == '{') {
386                     final int uPos = tape_.currentPos_;
387                     do {
388                         next = tape_.read();
389                     }
390                     while (next > -1 && next != '}');
391                     if (next == '}') {
392                         return;
393                     }
394 
395                     // back to the old behavior
396                     tape_.move(uPos - tape_.currentPos_ - 3);
397                     tape_.remove(1);
398                 }
399             }
400 
401             return;
402         }
403 
404         if ("ACEFGHIJKLMNOPQRTUVXYZaeghijklmpqyz".indexOf(escapeSequence) > -1) {
405             // no need to escape this chars
406             tape_.move(-2);
407             tape_.remove(1);
408             tape_.move(1);
409             return;
410         }
411 
412         if (insideCharClass_ && 'b' == escapeSequence) {
413             // [...\b...] -> [...\cH...]
414             tape_.move(-1);
415             tape_.replace(1, "cH");
416             return;
417         }
418 
419         if (DIGITS.indexOf(escapeSequence) > -1) {
420             handleBackReferenceOrOctal(escapeSequence);
421         }
422     }
423 
424     private boolean handleBackReferenceOrOctal(final int aFirstChar) {
425         // first try to figure out what the number is
426         final StringBuilder tmpNo = new StringBuilder(Character.toString((char) aFirstChar));
427         int tmpInsertPos = -1;
428         int next = tape_.read();
429         if (next > -1) {
430             if (DIGITS.indexOf(next) > -1) {
431                 tmpNo.append(next);
432                 tmpInsertPos--;
433                 next = tape_.read();
434                 if (next > -1) {
435                     if (DIGITS.indexOf(next) > -1) {
436                         tmpNo.append(next);
437                         tmpInsertPos--;
438                     }
439                     else {
440                         tape_.move(-1);
441                     }
442                 }
443             }
444             else {
445                 tape_.move(-1);
446                 if ('0' == aFirstChar) {
447                     // \0 has to be replaced by \x00
448                     tape_.insert("x0", -1);
449                     return false;
450                 }
451             }
452         }
453         else {
454             // if \0 is last character of pattern
455             if ('0' == aFirstChar) {
456                 // \0 has to be replaced by \x00
457                 tape_.insert("x0", -1);
458                 return false;
459             }
460         }
461 
462         if (tmpNo.charAt(0) == '0') {
463             // we have a octal here
464             return false;
465         }
466 
467         final int value = Integer.parseInt(tmpNo.toString());
468         if (value > subexpressions_.size()) {
469             // we have a octal here
470             tape_.insert("0", tmpInsertPos);
471             return false;
472         }
473 
474         // ignore invalid back references (inside char classes
475         // of if the referenced group is not (yet) available
476         if (insideCharClass_
477                 || (0 < value && !subexpressions_.get(value - 1).closed_)) {
478             // drop back reference
479             for (int i = tmpInsertPos; i <= 0; i++) {
480                 tape_.move(-1);
481                 tape_.remove(1);
482             }
483         }
484 
485         // ok it is a backreference
486         final Subexpresion back = subexpressions_.get(value - 1);
487         if (back.optional_ &&  !back.enhanced_) {
488             // change subexpression to make it java compatible
489             int insertPos = back.start_ - 1;
490             tape_.insertAt("(?:", insertPos);
491             for (final Subexpresion subexp : subexpressions_) {
492                 if (subexp.start_ > insertPos) {
493                     subexp.start_ = subexp.start_ + 3;
494                 }
495                 if (subexp.end_ > insertPos) {
496                     subexp.end_ = subexp.end_ + 3;
497                 }
498             }
499 
500             insertPos = back.end_ + 1;
501             tape_.insertAt(")", insertPos);
502             for (final Subexpresion subexp : subexpressions_) {
503                 if (subexp.start_ > insertPos) {
504                     subexp.start_ = subexp.start_ + 1;
505                 }
506                 if (subexp.end_ > insertPos) {
507                     subexp.end_ = subexp.end_ + 1;
508                 }
509             }
510         }
511         return true;
512     }
513 }