1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.javascript.regexp;
16
17 import java.util.ArrayDeque;
18 import java.util.ArrayList;
19 import java.util.Deque;
20 import java.util.HashMap;
21 import java.util.List;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 public class RegExpJsToJavaConverter {
39
40 private static final String DIGITS = "0123456789";
41 private static final HashMap<String, String> UNICODE_ESCAPES;
42
43 private Tape tape_;
44 private boolean insideCharClass_;
45 private boolean insideRepetition_;
46 private Deque<Subexpresion> parsingSubexpressions_;
47 private List<Subexpresion> subexpressions_;
48
49 static {
50 UNICODE_ESCAPES = new HashMap<>();
51 UNICODE_ESCAPES.put("L", "L");
52 UNICODE_ESCAPES.put("LETTER", "L");
53 UNICODE_ESCAPES.put("LU", "Lu");
54 UNICODE_ESCAPES.put("UPPERCASELETTER", "Lu");
55 UNICODE_ESCAPES.put("LL", "Ll");
56 UNICODE_ESCAPES.put("LOWERCASELETTER", "Ll");
57 }
58
59
60
61
62 private static class Tape {
63 private final StringBuilder tape_;
64 private int currentPos_;
65
66
67
68
69
70
71 Tape(final String input) {
72 currentPos_ = 0;
73 tape_ = new StringBuilder(input);
74 }
75
76
77
78
79
80 public void move(final int offset) {
81 currentPos_ += offset;
82 }
83
84
85
86
87
88
89
90 public int read() {
91 if (currentPos_ < 0) {
92 return -1;
93 }
94 if (currentPos_ >= tape_.length()) {
95 return -1;
96 }
97
98 return tape_.charAt(currentPos_++);
99 }
100
101
102
103
104
105
106 public void insert(final String token, final int offset) {
107 tape_.insert(currentPos_ + offset, token);
108 currentPos_ += token.length();
109 }
110
111
112
113
114
115
116 public void insertAt(final String token, final int pos) {
117 tape_.insert(pos, token);
118 currentPos_ += token.length();
119 }
120
121
122
123
124
125 public void replace(final int count, final String token) {
126 tape_.replace(currentPos_, currentPos_ + count, token);
127 currentPos_ += token.length();
128 }
129
130
131
132
133
134 public void remove(final int count) {
135 tape_.delete(currentPos_, currentPos_ + count);
136 }
137
138
139
140
141
142
143 @Override
144 public String toString() {
145 return tape_.toString();
146 }
147 }
148
149 private static final class Subexpresion {
150 private boolean closed_;
151 private boolean optional_;
152 private final boolean enhanced_;
153 private int start_;
154 private int end_;
155
156 Subexpresion() {
157 closed_ = false;
158 optional_ = false;
159 enhanced_ = false;
160 start_ = -1;
161 end_ = -1;
162 }
163 }
164
165
166
167
168
169
170
171
172 public String convert(final String input) {
173 tape_ = new Tape(input);
174 insideCharClass_ = false;
175 insideRepetition_ = false;
176
177 parsingSubexpressions_ = new ArrayDeque<>();
178 subexpressions_ = new ArrayList<>();
179
180 int current = tape_.read();
181 while (current > -1) {
182 if ('\\' == current) {
183 processEscapeSequence();
184 }
185 else if ('[' == current) {
186 processCharClassStart();
187 }
188 else if (']' == current) {
189 processCharClassEnd();
190 }
191 else if ('{' == current) {
192 processRepetitionStart();
193 }
194 else if ('}' == current) {
195 processRepetitionEnd();
196 }
197 else if ('(' == current) {
198 processSubExpressionStart();
199 }
200 else if (')' == current) {
201 processSubExpressionEnd();
202 }
203
204
205 current = tape_.read();
206 }
207
208 return tape_.toString();
209 }
210
211 private void processCharClassStart() {
212 if (insideCharClass_) {
213 tape_.insert("\\", -1);
214 }
215 else {
216 insideCharClass_ = true;
217
218 int next = tape_.read();
219 if (next < 0) {
220 tape_.insert("\\", -1);
221 return;
222 }
223 if ('^' == next) {
224
225 next = tape_.read();
226 if (next < 0) {
227 tape_.insert("\\", -2);
228 return;
229 }
230 if ('\\' == next) {
231 next = tape_.read();
232 if (DIGITS.indexOf(next) < 0) {
233 tape_.move(-2);
234 return;
235 }
236
237 if (handleBackReferenceOrOctal(next)) {
238 next = tape_.read();
239 if (']' == next) {
240 tape_.move(-3);
241 tape_.replace(3, ".");
242 insideCharClass_ = false;
243 }
244 }
245 }
246 else if (']' == next) {
247
248 tape_.move(-3);
249 tape_.replace(3, "(?s:.)");
250 insideCharClass_ = false;
251 }
252 else {
253 tape_.move(-1);
254 }
255 }
256 else if (']' == next) {
257
258 tape_.move(-2);
259 tape_.replace(2, "(?!)");
260 insideCharClass_ = false;
261 }
262 else {
263 tape_.move(-1);
264 }
265 }
266 }
267
268 private void processCharClassEnd() {
269 insideCharClass_ = false;
270 }
271
272 private void processRepetitionStart() {
273 final int next = tape_.read();
274 if (next < 0) {
275 tape_.insert("\\", -1);
276 return;
277 }
278
279 if (DIGITS.indexOf(next) > -1) {
280 insideRepetition_ = true;
281 }
282 else {
283 tape_.insert("\\", -2);
284 tape_.move(-1);
285 }
286 }
287
288 private void processRepetitionEnd() {
289 if (insideRepetition_) {
290 insideRepetition_ = false;
291 return;
292 }
293
294 tape_.insert("\\", -1);
295 }
296
297 private void processSubExpressionStart() {
298 int next = tape_.read();
299 if (next < 0) {
300 return;
301 }
302
303 if ('?' != next) {
304 final Subexpresion sub = new Subexpresion();
305 sub.start_ = tape_.currentPos_;
306 parsingSubexpressions_.push(sub);
307 subexpressions_.add(sub);
308
309 tape_.move(-1);
310 return;
311 }
312
313 next = tape_.read();
314 if (next < 0) {
315 return;
316 }
317 if (':' != next) {
318 final Subexpresion sub = new Subexpresion();
319 sub.start_ = tape_.currentPos_;
320 parsingSubexpressions_.push(sub);
321 subexpressions_.add(sub);
322
323 tape_.move(-1);
324 return;
325 }
326
327 final Subexpresion sub = new Subexpresion();
328 sub.start_ = tape_.currentPos_;
329 parsingSubexpressions_.push(sub);
330 }
331
332 private void processSubExpressionEnd() {
333 if (parsingSubexpressions_.isEmpty()) {
334 return;
335 }
336 final Subexpresion sub = parsingSubexpressions_.pop();
337 sub.closed_ = true;
338 sub.end_ = tape_.currentPos_;
339
340 final int next = tape_.read();
341 sub.optional_ = '?' == next;
342 tape_.move(-1);
343 }
344
345 private void processEscapeSequence() {
346 final int escapeSequence = tape_.read();
347 if (escapeSequence < 0) {
348 return;
349 }
350
351 if ('x' == escapeSequence) {
352
353
354 tape_.move(2);
355 return;
356 }
357
358 if ('u' == escapeSequence) {
359 int next = tape_.read();
360 if (next > -1) {
361 if (next == '{') {
362 final int uPos = tape_.currentPos_ - 2;
363
364 do {
365 next = tape_.read();
366 }
367 while (next > -1 && next != '}');
368 if (next == '}') {
369 tape_.tape_.replace(uPos, uPos + 1, "x");
370 }
371 return;
372 }
373
374
375
376 }
377
378 return;
379 }
380
381
382 if ('p' == escapeSequence) {
383 int next = tape_.read();
384 if (next > -1) {
385 if (next == '{') {
386 final int uPos = tape_.currentPos_;
387 do {
388 next = tape_.read();
389 }
390 while (next > -1 && next != '}');
391 if (next == '}') {
392 return;
393 }
394
395
396 tape_.move(uPos - tape_.currentPos_ - 3);
397 tape_.remove(1);
398 }
399 }
400
401 return;
402 }
403
404 if ("ACEFGHIJKLMNOPQRTUVXYZaeghijklmpqyz".indexOf(escapeSequence) > -1) {
405
406 tape_.move(-2);
407 tape_.remove(1);
408 tape_.move(1);
409 return;
410 }
411
412 if (insideCharClass_ && 'b' == escapeSequence) {
413
414 tape_.move(-1);
415 tape_.replace(1, "cH");
416 return;
417 }
418
419 if (DIGITS.indexOf(escapeSequence) > -1) {
420 handleBackReferenceOrOctal(escapeSequence);
421 }
422 }
423
424 private boolean handleBackReferenceOrOctal(final int aFirstChar) {
425
426 final StringBuilder tmpNo = new StringBuilder(Character.toString((char) aFirstChar));
427 int tmpInsertPos = -1;
428 int next = tape_.read();
429 if (next > -1) {
430 if (DIGITS.indexOf(next) > -1) {
431 tmpNo.append(next);
432 tmpInsertPos--;
433 next = tape_.read();
434 if (next > -1) {
435 if (DIGITS.indexOf(next) > -1) {
436 tmpNo.append(next);
437 tmpInsertPos--;
438 }
439 else {
440 tape_.move(-1);
441 }
442 }
443 }
444 else {
445 tape_.move(-1);
446 if ('0' == aFirstChar) {
447
448 tape_.insert("x0", -1);
449 return false;
450 }
451 }
452 }
453 else {
454
455 if ('0' == aFirstChar) {
456
457 tape_.insert("x0", -1);
458 return false;
459 }
460 }
461
462 if (tmpNo.charAt(0) == '0') {
463
464 return false;
465 }
466
467 final int value = Integer.parseInt(tmpNo.toString());
468 if (value > subexpressions_.size()) {
469
470 tape_.insert("0", tmpInsertPos);
471 return false;
472 }
473
474
475
476 if (insideCharClass_
477 || (0 < value && !subexpressions_.get(value - 1).closed_)) {
478
479 for (int i = tmpInsertPos; i <= 0; i++) {
480 tape_.move(-1);
481 tape_.remove(1);
482 }
483 }
484
485
486 final Subexpresion back = subexpressions_.get(value - 1);
487 if (back.optional_ && !back.enhanced_) {
488
489 int insertPos = back.start_ - 1;
490 tape_.insertAt("(?:", insertPos);
491 for (final Subexpresion subexp : subexpressions_) {
492 if (subexp.start_ > insertPos) {
493 subexp.start_ = subexp.start_ + 3;
494 }
495 if (subexp.end_ > insertPos) {
496 subexp.end_ = subexp.end_ + 3;
497 }
498 }
499
500 insertPos = back.end_ + 1;
501 tape_.insertAt(")", insertPos);
502 for (final Subexpresion subexp : subexpressions_) {
503 if (subexp.start_ > insertPos) {
504 subexp.start_ = subexp.start_ + 1;
505 }
506 if (subexp.end_ > insertPos) {
507 subexp.end_ = subexp.end_ + 1;
508 }
509 }
510 }
511 return true;
512 }
513 }