View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.javascript.regexp;
16  
17  import java.util.ArrayList;
18  import java.util.HashMap;
19  import java.util.List;
20  import java.util.Map;
21  import java.util.regex.MatchResult;
22  import java.util.regex.Matcher;
23  import java.util.regex.Pattern;
24  import java.util.regex.PatternSyntaxException;
25  
26  import org.apache.commons.lang3.StringUtils;
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.htmlunit.NotYetImplementedException;
30  import org.htmlunit.corejs.javascript.Context;
31  import org.htmlunit.corejs.javascript.RegExpProxy;
32  import org.htmlunit.corejs.javascript.ScriptRuntime;
33  import org.htmlunit.corejs.javascript.Scriptable;
34  import org.htmlunit.corejs.javascript.regexp.NativeRegExp;
35  import org.htmlunit.corejs.javascript.regexp.RegExpImpl;
36  import org.htmlunit.corejs.javascript.regexp.SubString;
37  import org.htmlunit.javascript.JavaScriptEngine;
38  
39  /**
40   * Begins customization of JavaScript RegExp base on JDK regular expression support.
41   *
42   * @author Marc Guillemot
43   * @author Ahmed Ashour
44   * @author Ronald Brill
45   * @author Carsten Steul
46   */
47  public class HtmlUnitRegExpProxy extends RegExpImpl {
48  
49      private static final Log LOG = LogFactory.getLog(HtmlUnitRegExpProxy.class);
50      /** Pattern cache */
51      private static final Map<String, Pattern> PATTENS = new HashMap<>();
52  
53      private final RegExpProxy wrapped_;
54  
55      /**
56       * Wraps a proxy to enhance it.
57       * @param wrapped the original proxy
58       */
59      public HtmlUnitRegExpProxy(final RegExpProxy wrapped) {
60          super();
61          wrapped_ = wrapped;
62      }
63  
64      /**
65       * Use the wrapped proxy except for replacement with string arg where it uses Java regular expression.
66       * {@inheritDoc}
67       */
68      @Override
69      public Object action(final Context cx, final Scriptable scope, final Scriptable thisObj,
70          final Object[] args, final int actionType) {
71          try {
72              return doAction(cx, scope, thisObj, args, actionType);
73          }
74          catch (final RegExStickyNotSupportedException e) {
75              if (LOG.isWarnEnabled()) {
76                  LOG.warn(e.getMessage(), e);
77              }
78              return wrapped_.action(cx, scope, thisObj, args, actionType);
79          }
80          catch (final StackOverflowError e) {
81              // TODO: We shouldn't have to catch this exception and fall back to Rhino's regex support!
82              // See HtmlUnitRegExpProxyTest.stackOverflow()
83              if (LOG.isWarnEnabled()) {
84                  LOG.warn(e.getMessage(), e);
85              }
86              return wrapped_.action(cx, scope, thisObj, args, actionType);
87          }
88      }
89  
90      private Object doAction(final Context cx, final Scriptable scope, final Scriptable thisObj,
91          final Object[] args, final int actionType) {
92          // in a first time just improve replacement with a String (not a function)
93          if ((RA_REPLACE == actionType || RA_REPLACE_ALL == actionType)
94                  && args.length == 2 && args[1] instanceof String) {
95              final String thisString = JavaScriptEngine.toString(thisObj);
96              final String replacement = (String) args[1];
97              final Object arg0 = args[0];
98              if (arg0 instanceof String) {
99                  // arg0 should *not* be interpreted as a RegExp
100                 return doStringReplacement(thisString, (String) arg0, replacement, RA_REPLACE_ALL == actionType);
101             }
102 
103             if (arg0 instanceof NativeRegExp) {
104                 try {
105                     final NativeRegExp regexp = (NativeRegExp) arg0;
106 
107                     if (RA_REPLACE_ALL == actionType
108                             && (regexp.getFlags() & NativeRegExp.JSREG_GLOB) == 0) {
109                         throw ScriptRuntime.typeError(
110                                 "replaceAll must be called with a global RegExp");
111                     }
112 
113                     final RegExpData reData = new RegExpData(regexp);
114                     final Matcher matcher = reData.getPattern().matcher(thisString);
115                     return doReplacement(thisString, replacement, matcher,
116                                             reData.isGlobal() || RA_REPLACE_ALL == actionType);
117                 }
118                 catch (final PatternSyntaxException e) {
119                     if (LOG.isWarnEnabled()) {
120                         LOG.warn(e.getMessage(), e);
121                     }
122                 }
123             }
124         }
125         else if (RA_MATCH == actionType || RA_SEARCH == actionType) {
126             if (args.length == 0) {
127                 return null;
128             }
129             final Object arg0 = args[0];
130             final String thisString = JavaScriptEngine.toString(thisObj);
131             final RegExpData reData;
132             if (arg0 instanceof NativeRegExp) {
133                 reData = new RegExpData((NativeRegExp) arg0);
134             }
135             else {
136                 reData = new RegExpData(JavaScriptEngine.toString(arg0));
137             }
138 
139             final Matcher matcher = reData.getPattern().matcher(thisString);
140 
141             final boolean found = matcher.find();
142             if (RA_SEARCH == actionType) {
143                 if (found) {
144                     setProperties(matcher, thisString, matcher.start(), matcher.end());
145                     return matcher.start();
146                 }
147                 return -1;
148             }
149 
150             if (!found) {
151                 return null;
152             }
153             final int index = matcher.start(0);
154             final List<Object> groups = new ArrayList<>();
155             if (reData.isGlobal()) { // has flag g
156                 groups.add(matcher.group(0));
157                 setProperties(matcher, thisString, matcher.start(0), matcher.end(0));
158 
159                 while (matcher.find()) {
160                     groups.add(matcher.group(0));
161                     setProperties(matcher, thisString, matcher.start(0), matcher.end(0));
162                 }
163             }
164             else {
165                 for (int i = 0; i <= matcher.groupCount(); i++) {
166                     Object group = matcher.group(i);
167                     if (group == null) {
168                         group = JavaScriptEngine.UNDEFINED;
169                     }
170                     groups.add(group);
171                 }
172 
173                 setProperties(matcher, thisString, matcher.start(), matcher.end());
174             }
175             final Scriptable response = cx.newArray(scope, groups.toArray());
176             // the additional properties (cf ECMA script reference 15.10.6.2 13)
177             response.put("index", response, Integer.valueOf(index));
178             response.put("input", response, thisString);
179             return response;
180         }
181 
182         return wrappedAction(cx, scope, thisObj, args, actionType);
183     }
184 
185     private String doStringReplacement(final String originalString,
186                         final String searchString, final String replacement,
187                         final boolean replaceAll) {
188         if (originalString == null) {
189             return "";
190         }
191 
192         final StaticStringMatcher matcher = new StaticStringMatcher(originalString, searchString);
193 
194         final StringBuilder sb = new StringBuilder();
195         int previousIndex = 0;
196 
197         while (matcher.find()) {
198             sb.append(originalString, previousIndex, matcher.start());
199 
200             String localReplacement = replacement;
201             if (replacement.contains("$")) {
202                 localReplacement = computeReplacementValue(localReplacement, originalString, matcher, false);
203             }
204             sb.append(localReplacement);
205             previousIndex = matcher.end();
206 
207             if (!replaceAll) {
208                 break;
209             }
210         }
211         sb.append(originalString, previousIndex, originalString.length());
212         return sb.toString();
213     }
214 
215     private String doReplacement(final String originalString, final String replacement, final Matcher matcher,
216         final boolean replaceAll) {
217 
218         final StringBuilder sb = new StringBuilder();
219         int previousIndex = 0;
220         while (matcher.find()) {
221             sb.append(originalString, previousIndex, matcher.start());
222             String localReplacement = replacement;
223             if (replacement.contains("$")) {
224                 localReplacement = computeReplacementValue(replacement, originalString, matcher, false);
225             }
226             sb.append(localReplacement);
227             previousIndex = matcher.end();
228 
229             setProperties(matcher, originalString, matcher.start(), previousIndex);
230             if (!replaceAll) {
231                 break;
232             }
233         }
234         sb.append(originalString, previousIndex, originalString.length());
235         return sb.toString();
236     }
237 
238     String computeReplacementValue(final String replacement, final String originalString,
239             final MatchResult matcher, final boolean group0ReturnsWholeMatch) {
240 
241         int lastIndex = 0;
242         final StringBuilder result = new StringBuilder();
243         int i;
244         while ((i = replacement.indexOf('$', lastIndex)) > -1) {
245             if (i > 0) {
246                 result.append(replacement, lastIndex, i);
247             }
248             String ss = null;
249             if (i < replacement.length() - 1 && (i == lastIndex || replacement.charAt(i - 1) != '$')) {
250                 final char next = replacement.charAt(i + 1);
251                 // only valid back reference are "evaluated"
252                 if (next >= '1' && next <= '9') {
253                     final int num1digit = next - '0';
254                     final char next2 = i + 2 < replacement.length() ? replacement.charAt(i + 2) : 'x';
255                     final int num2digits;
256                     // if there are 2 digits, the second one is considered as part of the group number
257                     // only if there is such a group
258                     if (next2 >= '1' && next2 <= '9') {
259                         num2digits = num1digit * 10 + (next2 - '0');
260                     }
261                     else {
262                         num2digits = Integer.MAX_VALUE;
263                     }
264                     if (num2digits <= matcher.groupCount()) {
265                         ss = matcher.group(num2digits);
266                         i++;
267                     }
268                     else if (num1digit <= matcher.groupCount()) {
269                         ss = StringUtils.defaultString(matcher.group(num1digit));
270                     }
271                 }
272                 else {
273                     switch (next) {
274                         case '&':
275                             ss = matcher.group();
276                             break;
277                         case '0':
278                             if (group0ReturnsWholeMatch) {
279                                 ss = matcher.group();
280                             }
281                             break;
282                         case '`':
283                             ss = originalString.substring(0, matcher.start());
284                             break;
285                         case '\'':
286                             ss = originalString.substring(matcher.end());
287                             break;
288                         case '$':
289                             ss = "$";
290                             break;
291                         default:
292                     }
293                 }
294             }
295             if (ss == null) {
296                 result.append('$');
297                 lastIndex = i + 1;
298             }
299             else {
300                 result.append(ss);
301                 lastIndex = i + 2;
302             }
303         }
304 
305         result.append(replacement, lastIndex, replacement.length());
306 
307         return result.toString();
308     }
309 
310     /**
311      * Calls action on the wrapped RegExp proxy.
312      */
313     private Object wrappedAction(final Context cx, final Scriptable scope, final Scriptable thisObj,
314             final Object[] args, final int actionType) {
315 
316         // take care to set the context's RegExp proxy to the original one as this is checked
317         // (cf org.htmlunit.corejs.javascript.regexp.RegExpImp:334)
318         try {
319             ScriptRuntime.setRegExpProxy(cx, wrapped_);
320             return wrapped_.action(cx, scope, thisObj, args, actionType);
321         }
322         finally {
323             ScriptRuntime.setRegExpProxy(cx, this);
324         }
325     }
326 
327     private void setProperties(final Matcher matcher, final String thisString, final int startPos, final int endPos) {
328         // lastMatch
329         final String match = matcher.group();
330         if (match == null) {
331             lastMatch = new SubString();
332         }
333         else {
334             lastMatch = new SubString(match, 0, match.length());
335         }
336 
337         // parens
338         final int groupCount = matcher.groupCount();
339         if (groupCount == 0) {
340             parens = null;
341         }
342         else {
343             final int count = Math.min(9, groupCount);
344             parens = new SubString[count];
345             for (int i = 0; i < count; i++) {
346                 final String group = matcher.group(i + 1);
347                 if (group == null) {
348                     parens[i] = new SubString();
349                 }
350                 else {
351                     parens[i] = new SubString(group, 0, group.length());
352                 }
353             }
354         }
355 
356         // lastParen
357         if (groupCount > 0) {
358             final String last = matcher.group(groupCount);
359             if (last == null) {
360                 lastParen = new SubString();
361             }
362             else {
363                 lastParen = new SubString(last, 0, last.length());
364             }
365         }
366 
367         // leftContext
368         if (startPos > 0) {
369             leftContext = new SubString(thisString, 0, startPos);
370         }
371         else {
372             leftContext = new SubString();
373         }
374 
375         // rightContext
376         final int length = thisString.length();
377         if (endPos < length) {
378             rightContext = new SubString(thisString, endPos, length - endPos);
379         }
380         else {
381             rightContext = new SubString();
382         }
383     }
384 
385     /**
386      * {@inheritDoc}
387      */
388     @Override
389     public Object compileRegExp(final Context cx, final String source, final String flags) {
390         try {
391             return wrapped_.compileRegExp(cx, source, flags);
392         }
393         catch (final Exception e) {
394             if (LOG.isWarnEnabled()) {
395                 LOG.warn("compileRegExp() threw for >" + source + "<, flags: >" + flags + "<. "
396                     + "Replacing with a '####shouldNotFindAnything###'");
397             }
398             return wrapped_.compileRegExp(cx, "####shouldNotFindAnything###", "");
399         }
400     }
401 
402     /**
403      * {@inheritDoc}
404      */
405     @Override
406     public int find_split(final Context cx, final Scriptable scope, final String target,
407             final String separator, final Scriptable re, final int[] ip, final int[] matchlen,
408             final boolean[] matched, final String[][] parensp) {
409         return wrapped_.find_split(cx, scope, target, separator, re, ip, matchlen, matched, parensp);
410     }
411 
412     /**
413      * {@inheritDoc}
414      */
415     @Override
416     public boolean isRegExp(final Scriptable obj) {
417         return wrapped_.isRegExp(obj);
418     }
419 
420     /**
421      * {@inheritDoc}
422      */
423     @Override
424     public Scriptable wrapRegExp(final Context cx, final Scriptable scope, final Object compiled) {
425         return wrapped_.wrapRegExp(cx, scope, compiled);
426     }
427 
428     private static class RegExpData {
429         private final boolean global_;
430         private Pattern pattern_;
431 
432         RegExpData(final NativeRegExp re) {
433             final String str = re.toString(); // the form is /regex/flags
434             final String jsFlags = StringUtils.substringAfterLast(str, "/");
435 
436             if (jsFlags.indexOf('y') != -1) {
437                 throw new RegExStickyNotSupportedException(str);
438             }
439 
440             global_ = jsFlags.indexOf('g') != -1;
441 
442             pattern_ = PATTENS.get(str);
443             if (pattern_ == null) {
444                 final String jsSource = StringUtils.substringBeforeLast(str.substring(1), "/");
445                 pattern_ = Pattern.compile(jsRegExpToJavaRegExp(jsSource), getJavaFlags(jsFlags));
446                 PATTENS.put(str, pattern_);
447             }
448         }
449 
450         RegExpData(final String string) {
451             global_ = false;
452 
453             pattern_ = PATTENS.get(string);
454             if (pattern_ == null) {
455                 pattern_ = Pattern.compile(jsRegExpToJavaRegExp(string), 0);
456                 PATTENS.put(string, pattern_);
457             }
458         }
459 
460         /**
461          * Converts the current JavaScript RegExp flags to Java Pattern flags.
462          * @return the Java Pattern flags
463          */
464         private static int getJavaFlags(final String jsFlags) {
465             int flags = 0;
466             if (jsFlags.contains("i")) {
467                 flags |= Pattern.CASE_INSENSITIVE;
468             }
469             if (jsFlags.contains("m")) {
470                 flags |= Pattern.MULTILINE;
471             }
472             if (jsFlags.contains("s")) {
473                 flags |= Pattern.DOTALL;
474             }
475             return flags;
476         }
477 
478         boolean isGlobal() {
479             return global_;
480         }
481 
482         Pattern getPattern() {
483             return pattern_;
484         }
485     }
486 
487     /**
488      * Transform a JavaScript regular expression to a Java regular expression
489      * @param re the JavaScript regular expression to transform
490      * @return the transformed expression
491      */
492     static String jsRegExpToJavaRegExp(final String re) {
493         final RegExpJsToJavaConverter regExpJsToJavaFSM = new RegExpJsToJavaConverter();
494         return regExpJsToJavaFSM.convert(re);
495     }
496 
497     /**
498      * Simple helper.
499      */
500     private static final class StaticStringMatcher implements MatchResult {
501         private final String original_;
502         private final String search_;
503 
504         private int start_;
505         private int end_;
506 
507         StaticStringMatcher(final String originalString, final String searchString) {
508             original_ = originalString;
509             search_ = searchString;
510 
511             start_ = -1;
512             end_ = 0;
513         }
514 
515         public boolean find() {
516             if (start_ == end_) {
517                 end_++;
518             }
519             if (end_ > original_.length()) {
520                 return false;
521             }
522 
523             final int pos = original_.indexOf(search_, end_);
524             if (pos != -1) {
525                 start_ = pos;
526                 end_ = pos + search_.length();
527                 return true;
528             }
529             return false;
530         }
531 
532         @Override
533         public String group() {
534             return search_;
535         }
536 
537         @Override
538         public int start() {
539             return start_;
540         }
541 
542         @Override
543         public int end() {
544             return end_;
545         }
546 
547         @Override
548         public int start(final int group) {
549             throw new NotYetImplementedException("StaticStringMatcher.start(int)");
550         }
551 
552         @Override
553         public int end(final int group) {
554             throw new NotYetImplementedException("StaticStringMatcher.end(int)");
555         }
556 
557         @Override
558         public String group(final int group) {
559             throw new NotYetImplementedException("StaticStringMatcher.group(int)");
560         }
561 
562         @Override
563         public int groupCount() {
564             return 0;
565         }
566     }
567 
568     // a bit of a hack but sufficent for the moment
569     private static class RegExStickyNotSupportedException extends IllegalArgumentException {
570         RegExStickyNotSupportedException(final String regex) {
571             super("RegEx sticky flag is not supported (" + regex + ") by HtmlUnitRegExProxy");
572         }
573     }
574 
575 }