RegExpScanner.java revision 1731:53537d04b6f4
1/*
2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package jdk.nashorn.internal.runtime.regexp;
27
28import java.util.HashMap;
29import java.util.Iterator;
30import java.util.LinkedList;
31import java.util.List;
32import java.util.Map;
33import java.util.regex.PatternSyntaxException;
34import jdk.nashorn.internal.parser.Lexer;
35import jdk.nashorn.internal.parser.Scanner;
36import jdk.nashorn.internal.runtime.BitVector;
37
38/**
39 * Scan a JavaScript regexp, converting to Java regex if necessary.
40 *
41 */
42final class RegExpScanner extends Scanner {
43
44    /**
45     * String builder used to rewrite the pattern for the currently used regexp factory.
46     */
47    private final StringBuilder sb;
48
49    /** Expected token table */
50    private final Map<Character, Integer> expected = new HashMap<>();
51
52    /** Capturing parenthesis that have been found so far. */
53    private final List<Capture> caps = new LinkedList<>();
54
55    /** Forward references to capturing parenthesis to be resolved later.*/
56    private final LinkedList<Integer> forwardReferences = new LinkedList<>();
57
58    /** Current level of zero-width negative lookahead assertions. */
59    private int negLookaheadLevel;
60
61    /** Sequential id of current top-level zero-width negative lookahead assertion. */
62    private int negLookaheadGroup;
63
64    /** Are we currently inside a character class? */
65    private boolean inCharClass = false;
66
67    /** Are we currently inside a negated character class? */
68    private boolean inNegativeClass = false;
69
70    private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
71
72    private static class Capture {
73        /** Zero-width negative lookaheads enclosing the capture. */
74        private final int negLookaheadLevel;
75        /** Sequential id of top-level negative lookaheads containing the capture. */
76        private  final int negLookaheadGroup;
77
78        Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
79            this.negLookaheadGroup = negLookaheadGroup;
80            this.negLookaheadLevel = negLookaheadLevel;
81        }
82
83        /**
84         * Returns true if this Capture can be referenced from the position specified by the
85         * group and level parameters. This is the case if either the group is not within
86         * a negative lookahead, or the position of the referrer is in the same negative lookahead.
87         *
88         * @param group current negative lookahead group
89         * @param level current negative lokahead level
90         * @return true if this capture group can be referenced from the given position
91         */
92        boolean canBeReferencedFrom(final int group, final int level) {
93            return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel);
94        }
95
96    }
97
98    /**
99     * Constructor
100     * @param string the JavaScript regexp to parse
101     */
102    private RegExpScanner(final String string) {
103        super(string);
104        sb = new StringBuilder(limit);
105        reset(0);
106        expected.put(']', 0);
107        expected.put('}', 0);
108    }
109
110    private void processForwardReferences() {
111
112        final Iterator<Integer> iterator = forwardReferences.descendingIterator();
113        while (iterator.hasNext()) {
114            final int pos = iterator.next();
115            final int num = iterator.next();
116            if (num > caps.size()) {
117                // Non-existing backreference. If the number begins with a valid octal convert it to
118                // Unicode escape and append the rest to a literal character sequence.
119                final StringBuilder buffer = new StringBuilder();
120                octalOrLiteral(Integer.toString(num), buffer);
121                sb.insert(pos, buffer);
122            }
123        }
124
125        forwardReferences.clear();
126    }
127
128    /**
129     * Scan a JavaScript regexp string returning a Java safe regex string.
130     *
131     * @param string
132     *            JavaScript regexp string.
133     * @return Java safe regex string.
134     */
135    public static RegExpScanner scan(final String string) {
136        final RegExpScanner scanner = new RegExpScanner(string);
137
138        try {
139            scanner.disjunction();
140        } catch (final Exception e) {
141            throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
142        }
143
144        scanner.processForwardReferences();
145
146        // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
147        if (scanner.position != string.length()) {
148            final String p = scanner.getStringBuilder().toString();
149            throw new PatternSyntaxException(string, p, p.length() + 1);
150        }
151
152        return scanner;
153    }
154
155    final StringBuilder getStringBuilder() {
156        return sb;
157    }
158
159    String getJavaPattern() {
160        return sb.toString();
161    }
162
163    BitVector getGroupsInNegativeLookahead() {
164        BitVector vec = null;
165        for (int i = 0; i < caps.size(); i++) {
166            final Capture cap = caps.get(i);
167            if (cap.negLookaheadLevel > 0) {
168                if (vec == null) {
169                    vec = new BitVector(caps.size() + 1);
170                }
171                vec.set(i + 1);
172            }
173        }
174        return vec;
175    }
176
177    /**
178     * Commit n characters to the builder and to a given token
179     * @param n     Number of characters.
180     * @return Committed token
181     */
182    private boolean commit(final int n) {
183        switch (n) {
184        case 1:
185            sb.append(ch0);
186            skip(1);
187            break;
188        case 2:
189            sb.append(ch0);
190            sb.append(ch1);
191            skip(2);
192            break;
193        case 3:
194            sb.append(ch0);
195            sb.append(ch1);
196            sb.append(ch2);
197            skip(3);
198            break;
199        default:
200            assert false : "Should not reach here";
201        }
202
203        return true;
204    }
205
206    /**
207     * Restart the buffers back at an earlier position.
208     *
209     * @param startIn
210     *            Position in the input stream.
211     * @param startOut
212     *            Position in the output stream.
213     */
214    private void restart(final int startIn, final int startOut) {
215        reset(startIn);
216        sb.setLength(startOut);
217    }
218
219    private void push(final char ch) {
220        expected.put(ch, expected.get(ch) + 1);
221    }
222
223    private void pop(final char ch) {
224        expected.put(ch, Math.min(0, expected.get(ch) - 1));
225    }
226
227    /*
228     * Recursive descent tokenizer starts below.
229     */
230
231    /*
232     * Disjunction ::
233     *      Alternative
234     *      Alternative | Disjunction
235     */
236    private void disjunction() {
237        while (true) {
238            alternative();
239
240            if (ch0 == '|') {
241                commit(1);
242            } else {
243                break;
244            }
245        }
246    }
247
248    /*
249     * Alternative ::
250     *      [empty]
251     *      Alternative Term
252     */
253    private void alternative() {
254        while (term()) {
255            // do nothing
256        }
257    }
258
259    /*
260     * Term ::
261     *      Assertion
262     *      Atom
263     *      Atom Quantifier
264     */
265    private boolean term() {
266        final int startIn  = position;
267        final int startOut = sb.length();
268
269        if (assertion()) {
270            return true;
271        }
272
273        if (atom()) {
274            quantifier();
275            return true;
276        }
277
278        restart(startIn, startOut);
279        return false;
280    }
281
282    /*
283     * Assertion ::
284     *      ^
285     *      $
286     *      \b
287     *      \B
288     *      ( ? = Disjunction )
289     *      ( ? ! Disjunction )
290     */
291    private boolean assertion() {
292        final int startIn  = position;
293        final int startOut = sb.length();
294
295        switch (ch0) {
296        case '^':
297        case '$':
298            return commit(1);
299
300        case '\\':
301            if (ch1 == 'b' || ch1 == 'B') {
302                return commit(2);
303            }
304            break;
305
306        case '(':
307            if (ch1 != '?') {
308                break;
309            }
310            if (ch2 != '=' && ch2 != '!') {
311                break;
312            }
313            final boolean isNegativeLookahead = (ch2 == '!');
314            commit(3);
315
316            if (isNegativeLookahead) {
317                if (negLookaheadLevel == 0) {
318                    negLookaheadGroup++;
319                }
320                negLookaheadLevel++;
321            }
322            disjunction();
323            if (isNegativeLookahead) {
324                negLookaheadLevel--;
325            }
326
327            if (ch0 == ')') {
328                return commit(1);
329            }
330            break;
331
332        default:
333            break;
334        }
335
336        restart(startIn, startOut);
337        return false;
338    }
339
340    /*
341     * Quantifier ::
342     *      QuantifierPrefix
343     *      QuantifierPrefix ?
344     */
345    private boolean quantifier() {
346        if (quantifierPrefix()) {
347            if (ch0 == '?') {
348                commit(1);
349            }
350            return true;
351        }
352        return false;
353    }
354
355    /*
356     * QuantifierPrefix ::
357     *      *
358     *      +
359     *      ?
360     *      { DecimalDigits }
361     *      { DecimalDigits , }
362     *      { DecimalDigits , DecimalDigits }
363     */
364    private boolean quantifierPrefix() {
365        final int startIn  = position;
366        final int startOut = sb.length();
367
368        switch (ch0) {
369        case '*':
370        case '+':
371        case '?':
372            return commit(1);
373
374        case '{':
375            commit(1);
376
377            if (!decimalDigits()) {
378                break; // not a quantifier - back out
379            }
380            push('}');
381
382            if (ch0 == ',') {
383                commit(1);
384                decimalDigits();
385            }
386
387            if (ch0 == '}') {
388                pop('}');
389                commit(1);
390            } else {
391                // Bad quantifier should be rejected but is accepted by all major engines
392                restart(startIn, startOut);
393                return false;
394            }
395
396            return true;
397
398        default:
399            break;
400        }
401
402        restart(startIn, startOut);
403        return false;
404    }
405
406    /*
407     * Atom ::
408     *      PatternCharacter
409     *      .
410     *      \ AtomEscape
411     *      CharacterClass
412     *      ( Disjunction )
413     *      ( ? : Disjunction )
414     *
415     */
416    private boolean atom() {
417        final int startIn  = position;
418        final int startOut = sb.length();
419
420        if (patternCharacter()) {
421            return true;
422        }
423
424        if (ch0 == '.') {
425            return commit(1);
426        }
427
428        if (ch0 == '\\') {
429            commit(1);
430
431            if (atomEscape()) {
432                return true;
433            }
434        }
435
436        if (characterClass()) {
437            return true;
438        }
439
440        if (ch0 == '(') {
441            commit(1);
442            if (ch0 == '?' && ch1 == ':') {
443                commit(2);
444            } else {
445                caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
446            }
447
448            disjunction();
449
450            if (ch0 == ')') {
451                commit(1);
452                return true;
453            }
454        }
455
456        restart(startIn, startOut);
457        return false;
458    }
459
460    /*
461     * PatternCharacter ::
462     *      SourceCharacter but not any of: ^$\.*+?()[]{}|
463     */
464    @SuppressWarnings("fallthrough")
465    private boolean patternCharacter() {
466        if (atEOF()) {
467            return false;
468        }
469
470        switch (ch0) {
471        case '^':
472        case '$':
473        case '\\':
474        case '.':
475        case '*':
476        case '+':
477        case '?':
478        case '(':
479        case ')':
480        case '[':
481        case '|':
482            return false;
483
484        case '}':
485        case ']':
486            final int n = expected.get(ch0);
487            if (n != 0) {
488                return false;
489            }
490
491       case '{':
492           // if not a valid quantifier escape curly brace to match itself
493           // this ensures compatibility with other JS implementations
494           if (!quantifierPrefix()) {
495               sb.append('\\');
496               return commit(1);
497           }
498           return false;
499
500        default:
501            return commit(1); // SOURCECHARACTER
502        }
503    }
504
505    /*
506     * AtomEscape ::
507     *      DecimalEscape
508     *      CharacterEscape
509     *      CharacterClassEscape
510     */
511    private boolean atomEscape() {
512        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
513        return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
514    }
515
516    /*
517     * CharacterEscape ::
518     *      ControlEscape
519     *      c ControlLetter
520     *      HexEscapeSequence
521     *      UnicodeEscapeSequence
522     *      IdentityEscape
523     */
524    private boolean characterEscape() {
525        final int startIn  = position;
526        final int startOut = sb.length();
527
528        if (controlEscape()) {
529            return true;
530        }
531
532        if (ch0 == 'c') {
533            commit(1);
534            if (controlLetter()) {
535                return true;
536            }
537            restart(startIn, startOut);
538        }
539
540        if (hexEscapeSequence() || unicodeEscapeSequence()) {
541            return true;
542        }
543
544        restart(startIn, startOut);
545        return false;
546    }
547
548    private boolean scanEscapeSequence(final char leader, final int length) {
549        final int startIn  = position;
550        final int startOut = sb.length();
551
552        if (ch0 != leader) {
553            return false;
554        }
555
556        commit(1);
557        for (int i = 0; i < length; i++) {
558            final char ch0l = Character.toLowerCase(ch0);
559            if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
560                commit(1);
561            } else {
562                restart(startIn, startOut);
563                return false;
564            }
565        }
566
567        return true;
568    }
569
570    private boolean hexEscapeSequence() {
571        return scanEscapeSequence('x', 2);
572    }
573
574    private boolean unicodeEscapeSequence() {
575        return scanEscapeSequence('u', 4);
576    }
577
578    /*
579     * ControlEscape ::
580     *      one of fnrtv
581     */
582    private boolean controlEscape() {
583        switch (ch0) {
584        case 'f':
585        case 'n':
586        case 'r':
587        case 't':
588        case 'v':
589            return commit(1);
590
591        default:
592            return false;
593        }
594    }
595
596    /*
597     * ControlLetter ::
598     *      one of abcdefghijklmnopqrstuvwxyz
599     *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
600     */
601    private boolean controlLetter() {
602        // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class.
603        if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z')
604                || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) {
605            // for some reason java regexps don't like control characters on the
606            // form "\\ca".match([string with ascii 1 at char0]). Translating
607            // them to unicode does it though.
608            sb.setLength(sb.length() - 1);
609            unicode(ch0 % 32, sb);
610            skip(1);
611            return true;
612        }
613        return false;
614    }
615
616    /*
617     * IdentityEscape ::
618     *      SourceCharacter but not IdentifierPart
619     *      <ZWJ>  (200c)
620     *      <ZWNJ> (200d)
621     */
622    private boolean identityEscape() {
623        if (atEOF()) {
624            throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
625        }
626        // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
627        if (ch0 == 'c') {
628            sb.append('\\'); // Treat invalid \c control sequence as \\c
629        } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
630            sb.setLength(sb.length() - 1);
631        }
632        return commit(1);
633    }
634
635    /*
636     * DecimalEscape ::
637     *      DecimalIntegerLiteral [lookahead DecimalDigit]
638     */
639    private boolean decimalEscape() {
640        final int startIn  = position;
641        final int startOut = sb.length();
642
643        if (ch0 == '0' && !isOctalDigit(ch1)) {
644            skip(1);
645            //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
646            sb.append("\u0000");
647            return true;
648        }
649
650        if (isDecimalDigit(ch0)) {
651
652            if (ch0 == '0') {
653                // We know this is an octal escape.
654                if (inCharClass) {
655                    // Convert octal escape to unicode escape if inside character class.
656                    int octalValue = 0;
657                    while (isOctalDigit(ch0)) {
658                        octalValue = octalValue * 8 + ch0 - '0';
659                        skip(1);
660                    }
661
662                    unicode(octalValue, sb);
663
664                } else {
665                    // Copy decimal escape as-is
666                    decimalDigits();
667                }
668            } else {
669                // This should be a backreference, but could also be an octal escape or even a literal string.
670                int decimalValue = 0;
671                while (isDecimalDigit(ch0)) {
672                    decimalValue = decimalValue * 10 + ch0 - '0';
673                    skip(1);
674                }
675
676                if (inCharClass) {
677                    // No backreferences in character classes. Encode as unicode escape or literal char sequence
678                    sb.setLength(sb.length() - 1);
679                    octalOrLiteral(Integer.toString(decimalValue), sb);
680
681                } else if (decimalValue <= caps.size()) {
682                    //  Captures inside a negative lookahead are undefined when referenced from the outside.
683                    final Capture capture = caps.get(decimalValue - 1);
684                    if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) {
685                        // Outside reference to capture in negative lookahead, omit from output buffer.
686                        sb.setLength(sb.length() - 1);
687                    } else {
688                        // Append backreference to output buffer.
689                        sb.append(decimalValue);
690                    }
691                } else {
692                    // Forward references to a capture group are always undefined so we can omit it from the output buffer.
693                    // However, if the target capture does not exist, we need to rewrite the reference as hex escape
694                    // or literal string, so register the reference for later processing.
695                    sb.setLength(sb.length() - 1);
696                    forwardReferences.add(decimalValue);
697                    forwardReferences.add(sb.length());
698                }
699
700            }
701            return true;
702        }
703
704        restart(startIn, startOut);
705        return false;
706    }
707
708    /*
709     * CharacterClassEscape ::
710     *  one of dDsSwW
711     */
712    private boolean characterClassEscape() {
713        switch (ch0) {
714        // java.util.regex requires translation of \s and \S to explicit character list
715        case 's':
716            if (RegExpFactory.usesJavaUtilRegex()) {
717                sb.setLength(sb.length() - 1);
718                // No nested class required if we already are inside a character class
719                if (inCharClass) {
720                    sb.append(Lexer.getWhitespaceRegExp());
721                } else {
722                    sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
723                }
724                skip(1);
725                return true;
726            }
727            return commit(1);
728        case 'S':
729            if (RegExpFactory.usesJavaUtilRegex()) {
730                sb.setLength(sb.length() - 1);
731                // In negative class we must use intersection to get double negation ("not anything else than space")
732                sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
733                skip(1);
734                return true;
735            }
736            return commit(1);
737        case 'd':
738        case 'D':
739        case 'w':
740        case 'W':
741            return commit(1);
742
743        default:
744            return false;
745        }
746    }
747
748    /*
749     * CharacterClass ::
750     *      [ [lookahead {^}] ClassRanges ]
751     *      [ ^ ClassRanges ]
752     */
753    private boolean characterClass() {
754        final int startIn  = position;
755        final int startOut = sb.length();
756
757        if (ch0 == '[') {
758            try {
759                inCharClass = true;
760                push(']');
761                commit(1);
762
763                if (ch0 == '^') {
764                    inNegativeClass = true;
765                    commit(1);
766                }
767
768                if (classRanges() && ch0 == ']') {
769                    pop(']');
770                    commit(1);
771
772                    // Substitute empty character classes [] and [^] that never or always match
773                    if (position == startIn + 2) {
774                        sb.setLength(sb.length() - 1);
775                        sb.append("^\\s\\S]");
776                    } else if (position == startIn + 3 && inNegativeClass) {
777                        sb.setLength(sb.length() - 2);
778                        sb.append("\\s\\S]");
779                    }
780
781                    return true;
782                }
783            } finally {
784                inCharClass = false;  // no nested character classes in JavaScript
785                inNegativeClass = false;
786            }
787        }
788
789        restart(startIn, startOut);
790        return false;
791    }
792
793    /*
794     * ClassRanges ::
795     *      [empty]
796     *      NonemptyClassRanges
797     */
798    private boolean classRanges() {
799        nonemptyClassRanges();
800        return true;
801    }
802
803    /*
804     * NonemptyClassRanges ::
805     *      ClassAtom
806     *      ClassAtom NonemptyClassRangesNoDash
807     *      ClassAtom - ClassAtom ClassRanges
808     */
809    private boolean nonemptyClassRanges() {
810        final int startIn  = position;
811        final int startOut = sb.length();
812
813        if (classAtom()) {
814
815            if (ch0 == '-') {
816                commit(1);
817
818                if (classAtom() && classRanges()) {
819                    return true;
820                }
821            }
822
823            nonemptyClassRangesNoDash();
824
825            return true;
826        }
827
828        restart(startIn, startOut);
829        return false;
830    }
831
832    /*
833     * NonemptyClassRangesNoDash ::
834     *      ClassAtom
835     *      ClassAtomNoDash NonemptyClassRangesNoDash
836     *      ClassAtomNoDash - ClassAtom ClassRanges
837     */
838    private boolean nonemptyClassRangesNoDash() {
839        final int startIn  = position;
840        final int startOut = sb.length();
841
842        if (classAtomNoDash()) {
843
844            // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
845            if (ch0 == '-') {
846               commit(1);
847
848               if (classAtom() && classRanges()) {
849                   return true;
850               }
851               //fallthru
852           }
853
854            nonemptyClassRangesNoDash();
855            return true; // still a class atom
856        }
857
858        if (classAtom()) {
859            return true;
860        }
861
862        restart(startIn, startOut);
863        return false;
864    }
865
866    /*
867     * ClassAtom : - ClassAtomNoDash
868     */
869    private boolean classAtom() {
870
871        if (ch0 == '-') {
872            return commit(1);
873        }
874
875        return classAtomNoDash();
876    }
877
878    /*
879     * ClassAtomNoDash ::
880     *      SourceCharacter but not one of \ or ] or -
881     *      \ ClassEscape
882     */
883    private boolean classAtomNoDash() {
884        if (atEOF()) {
885            return false;
886        }
887        final int startIn  = position;
888        final int startOut = sb.length();
889
890        switch (ch0) {
891        case ']':
892        case '-':
893            return false;
894
895        case '[':
896            // unescaped left square bracket - add escape
897            sb.append('\\');
898            return commit(1);
899
900        case '\\':
901            commit(1);
902            if (classEscape()) {
903                return true;
904            }
905
906            restart(startIn, startOut);
907            return false;
908
909        default:
910            return commit(1);
911        }
912    }
913
914    /*
915     * ClassEscape ::
916     *      DecimalEscape
917     *      b
918     *      CharacterEscape
919     *      CharacterClassEscape
920     */
921    private boolean classEscape() {
922
923        if (decimalEscape()) {
924            return true;
925        }
926
927        if (ch0 == 'b') {
928            sb.setLength(sb.length() - 1);
929            sb.append('\b');
930            skip(1);
931            return true;
932        }
933
934        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
935        return characterEscape() || characterClassEscape() || identityEscape();
936    }
937
938    /*
939     * DecimalDigits
940     */
941    private boolean decimalDigits() {
942        if (!isDecimalDigit(ch0)) {
943            return false;
944        }
945
946        while (isDecimalDigit(ch0)) {
947            commit(1);
948        }
949
950        return true;
951    }
952
953    private static void unicode(final int value, final StringBuilder buffer) {
954        final String hex = Integer.toHexString(value);
955        buffer.append('u');
956        for (int i = 0; i < 4 - hex.length(); i++) {
957            buffer.append('0');
958        }
959        buffer.append(hex);
960    }
961
962    // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
963    private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
964        final int length = numberLiteral.length();
965        int octalValue = 0;
966        int pos = 0;
967        // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
968        while (pos < length && octalValue < 0x20) {
969            final char ch = numberLiteral.charAt(pos);
970            if (isOctalDigit(ch)) {
971                octalValue = octalValue * 8 + ch - '0';
972            } else {
973                break;
974            }
975            pos++;
976        }
977        if (octalValue > 0) {
978            buffer.append('\\');
979            unicode(octalValue, buffer);
980            buffer.append(numberLiteral.substring(pos));
981        } else {
982            buffer.append(numberLiteral);
983        }
984    }
985
986    private static boolean isOctalDigit(final char ch) {
987        return ch >= '0' && ch <= '7';
988    }
989
990    private static boolean isDecimalDigit(final char ch) {
991        return ch >= '0' && ch <= '9';
992    }
993}
994