EncodingHelper.java revision 1453:a261947d1e0e
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
2045405Smsmithpackage jdk.nashorn.internal.runtime.regexp.joni;
2145405Smsmith
2245405Smsmithimport java.util.Arrays;
2345405Smsmithimport jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
2445405Smsmithimport jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
2545405Smsmith
2648008Sgreen@SuppressWarnings("javadoc")
2745405Smsmithpublic final class EncodingHelper {
2845405Smsmith
2945405Smsmith    final static int NEW_LINE            = 0x000a;
3045405Smsmith    final static int RETURN              = 0x000d;
3145405Smsmith    final static int LINE_SEPARATOR      = 0x2028;
3245405Smsmith    final static int PARAGRAPH_SEPARATOR = 0x2029;
3345405Smsmith
3445405Smsmith    final static char[] EMPTYCHARS = new char[0];
3545405Smsmith    final static int[][] codeRanges = new int[15][];
3645405Smsmith
3745405Smsmith    public static int digitVal(final int code) {
3845405Smsmith        return code - '0';
3945405Smsmith    }
4045405Smsmith
4145405Smsmith    public static int odigitVal(final int code) {
4245405Smsmith        return digitVal(code);
4345405Smsmith    }
4445405Smsmith
4545405Smsmith    public static boolean isXDigit(final int code) {
4645405Smsmith        return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
4745405Smsmith    }
4845405Smsmith
4945405Smsmith    public static int xdigitVal(final int code) {
5045405Smsmith        if (Character.isDigit(code)) {
5145405Smsmith            return code - '0';
5245405Smsmith        } else if (code >= 'a' && code <= 'f') {
5345405Smsmith            return code - 'a' + 10;
5445405Smsmith        } else {
5545405Smsmith            return code - 'A' + 10;
5645405Smsmith        }
5745405Smsmith    }
5845405Smsmith
5945405Smsmith    public static boolean isDigit(final int code) {
6045405Smsmith        return code >= '0' && code <= '9';
6145405Smsmith    }
6245405Smsmith
6345405Smsmith    public static boolean isWord(final int code) {
6445405Smsmith        // letter, digit, or '_'
6546215Smsmith        return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
6646215Smsmith    }
6746215Smsmith
6846215Smsmith    public static boolean isNewLine(final int code) {
6946215Smsmith        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
7045405Smsmith    }
7145405Smsmith
7245405Smsmith    public static boolean isNewLine(final char[] chars, final int p, final int end) {
7346215Smsmith        return p < end && isNewLine(chars[p]);
7446215Smsmith    }
7545405Smsmith
7645405Smsmith    // Encoding.prevCharHead
7746215Smsmith    public static int prevCharHead(final int p, final int s) {
7846215Smsmith        return s <= p ? -1 : s - 1;
7946215Smsmith    }
8045405Smsmith
8145405Smsmith    /* onigenc_get_right_adjust_char_head_with_prev */
8245405Smsmith    public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
8345405Smsmith        if (prev != null) {
8445405Smsmith            prev.value = -1; /* Sorry */
8546215Smsmith        }
8645405Smsmith        return s;
8745405Smsmith    }
8845405Smsmith
8945405Smsmith    // Encoding.stepBack
9045405Smsmith    public static int stepBack(final int p, final int sp, final int np) {
9145405Smsmith        int s = sp, n = np;
9245405Smsmith        while (s != -1 && n-- > 0) {
9345405Smsmith           if (s <= p) {
9445405Smsmith            return -1;
9545405Smsmith        }
9645405Smsmith           s--;
9745405Smsmith       }
9845405Smsmith       return s;
9945405Smsmith    }
10045405Smsmith
10145405Smsmith    public static int mbcodeStartPosition() {
10245405Smsmith        return 0x80;
10345405Smsmith    }
10445405Smsmith
10545405Smsmith    public static char[] caseFoldCodesByString(final int flag, final char c) {
10645405Smsmith        char[] codes = EMPTYCHARS;
10745405Smsmith        final char upper = toUpperCase(c);
10845405Smsmith
10945405Smsmith        if (upper != toLowerCase(upper)) {
11045405Smsmith            int count = 0;
11145405Smsmith            char ch = 0;
11245405Smsmith
11345405Smsmith            do {
11445405Smsmith                final char u = toUpperCase(ch);
11545405Smsmith                if (u == upper && ch != c) {
11645405Smsmith                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
11745405Smsmith                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
11845405Smsmith                    codes[count++] = ch;
11945405Smsmith                }
12045405Smsmith            } while (ch++ < 0xffff);
12145405Smsmith        }
12245405Smsmith        return codes;
12345405Smsmith    }
12445405Smsmith
12545405Smsmith    public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
12645405Smsmith        for (int c = 0; c < 0xffff; c++) {
12745405Smsmith            if (Character.isLowerCase(c)) {
12845405Smsmith                final int upper = toUpperCase(c);
12945405Smsmith
13045405Smsmith                if (upper != c) {
13145405Smsmith                    ApplyCaseFold.apply(c, upper, arg);
13245405Smsmith                }
13345405Smsmith            }
13445405Smsmith        }
13545405Smsmith
13645405Smsmith        // Some characters have multiple lower case variants, hence we need to do a second run
13745405Smsmith        for (int c = 0; c < 0xffff; c++) {
13845405Smsmith            if (Character.isLowerCase(c)) {
13945405Smsmith                final int upper = toUpperCase(c);
14045405Smsmith
14145405Smsmith                if (upper != c) {
14245405Smsmith                    ApplyCaseFold.apply(upper, c, arg);
14345405Smsmith                }
14445405Smsmith            }
14545405Smsmith        }
14645405Smsmith    }
14745405Smsmith
14845405Smsmith    public static char toLowerCase(final char c) {
14945405Smsmith        return (char)toLowerCase((int)c);
15045405Smsmith    }
15145405Smsmith
15245405Smsmith    public static int toLowerCase(final int c) {
15345405Smsmith        if (c < 128) {
15445405Smsmith            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
15545405Smsmith        }
15645405Smsmith        // Do not convert non-ASCII upper case character to ASCII lower case.
15745405Smsmith        final int lower = Character.toLowerCase(c);
15845405Smsmith        return (lower < 128) ? c : lower;
15945405Smsmith
16045405Smsmith    }
16145405Smsmith
16245405Smsmith    public static char toUpperCase(final char c) {
16345405Smsmith        return (char)toUpperCase((int)c);
16445405Smsmith    }
16545405Smsmith
16645405Smsmith    public static int toUpperCase(final int c) {
16745405Smsmith        if (c < 128) {
16845405Smsmith            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
16945405Smsmith        }
17045405Smsmith        // Do not convert non-ASCII lower case character to ASCII upper case.
17145405Smsmith        final int upper = Character.toUpperCase(c);
17245405Smsmith        return (upper < 128) ? c : upper;
17345405Smsmith    }
17445405Smsmith
17545405Smsmith    public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
17645405Smsmith        sbOut.value = 0x100; // use bitset for codes smaller than 256
17745405Smsmith        int[] range = null;
17845405Smsmith
17945405Smsmith        if (ctype < codeRanges.length) {
18045405Smsmith            range = codeRanges[ctype];
18145405Smsmith
18245405Smsmith            if (range == null) {
18345405Smsmith                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
18445405Smsmith                range = new int[16];
18545405Smsmith                int rangeCount = 0;
18645405Smsmith                int lastCode = -2;
18745405Smsmith
18845405Smsmith                for (int code = 0; code <= 0xffff; code++) {
18945405Smsmith                    if (isCodeCType(code, ctype)) {
19045405Smsmith                        if (lastCode < code -1) {
19145405Smsmith                            if (rangeCount * 2 + 2 >= range.length) {
19245405Smsmith                                range = Arrays.copyOf(range, range.length * 2);
19345405Smsmith                            }
19445405Smsmith                            range[rangeCount * 2 + 1] = code;
19545405Smsmith                            rangeCount++;
19645405Smsmith                        }
19745405Smsmith                        range[rangeCount * 2] = lastCode = code;
19845405Smsmith                    }
19945405Smsmith                }
20045405Smsmith
20145405Smsmith                if (rangeCount * 2 + 1 < range.length) {
20245405Smsmith                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
20345405Smsmith                }
20445405Smsmith
20545405Smsmith                range[0] = rangeCount;
20645405Smsmith                codeRanges[ctype] = range;
20745405Smsmith            }
20845405Smsmith        }
20945405Smsmith
21045405Smsmith        return range;
21145405Smsmith    }
21245405Smsmith
21345405Smsmith    // CodeRange.isInCodeRange
21445405Smsmith    public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
21545405Smsmith        int low = 0;
21645405Smsmith        final int n = p[offset];
21745405Smsmith        int high = n ;
21845405Smsmith
21945405Smsmith        while (low < high) {
22045405Smsmith            final int x = (low + high) >> 1;
22145405Smsmith            if (code > p[(x << 1) + 2 + offset]) {
22245405Smsmith                low = x + 1;
22345405Smsmith            } else {
22445405Smsmith                high = x;
22545405Smsmith            }
22645405Smsmith        }
22745405Smsmith        return low < n && code >= p[(low << 1) + 1 + offset];
22846215Smsmith    }
22946215Smsmith
23046215Smsmith    /**
23146215Smsmith     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
23245405Smsmith     *
23345405Smsmith     * @param code code
23445405Smsmith     * @param ctype ctype
23545405Smsmith     *
23645405Smsmith     * @return isCodeCType
23745405Smsmith     */
23845405Smsmith    public static boolean isCodeCType(final int code, final int ctype) {
23945405Smsmith        int type;
24045405Smsmith        switch (ctype) {
24145405Smsmith            case CharacterType.NEWLINE:
24245405Smsmith                return isNewLine(code);
24345405Smsmith            case CharacterType.ALPHA:
24445405Smsmith                return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
24545405Smsmith            case CharacterType.BLANK:
24645405Smsmith                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
24745405Smsmith            case CharacterType.CNTRL:
24846215Smsmith                type = Character.getType(code);
24946215Smsmith                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
25046215Smsmith            case CharacterType.DIGIT:
25146215Smsmith                return EncodingHelper.isDigit(code);
25246215Smsmith            case CharacterType.GRAPH:
25346215Smsmith                switch (code) {
25446215Smsmith                    case 0x09:
25546215Smsmith                    case 0x0a:
25646215Smsmith                    case 0x0b:
25746215Smsmith                    case 0x0c:
25846215Smsmith                    case 0x0d:
25946215Smsmith                        return false;
26046215Smsmith                    default:
26146215Smsmith                        type = Character.getType(code);
26246215Smsmith                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
26346215Smsmith                }
26446215Smsmith            case CharacterType.LOWER:
26545405Smsmith                return Character.isLowerCase(code);
26645405Smsmith            case CharacterType.PRINT:
26745405Smsmith                type = Character.getType(code);
26845405Smsmith                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
26945405Smsmith            case CharacterType.PUNCT:
27045405Smsmith                return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
27145405Smsmith            case CharacterType.SPACE:
27245405Smsmith                // ECMA 7.2 and 7.3
27345405Smsmith                switch (code) {
27445405Smsmith                    case 0x09:
27545405Smsmith                    case 0x0a:
27645405Smsmith                    case 0x0b:
27745405Smsmith                    case 0x0c:
27845405Smsmith                    case 0x0d:
27945405Smsmith                        return true;
28045405Smsmith                    default:
28145405Smsmith                        // true if Unicode separator or BOM or U+180E (see JDK-8138758)
28245405Smsmith                        return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0
28345405Smsmith                                || code == 0xfeff || code == 0x180e;
28445405Smsmith                }
28545405Smsmith            case CharacterType.UPPER:
28645405Smsmith                return Character.isUpperCase(code);
28745405Smsmith            case CharacterType.XDIGIT:
28845405Smsmith                return EncodingHelper.isXDigit(code);
28945405Smsmith            case CharacterType.WORD:
29045405Smsmith                return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
29145405Smsmith            case CharacterType.ALNUM:
29245405Smsmith                return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
29345405Smsmith            case CharacterType.ASCII:
29445405Smsmith                return code < 0x80;
29545405Smsmith            default:
29645405Smsmith                throw new RuntimeException("illegal character type: " + ctype);
29745405Smsmith        }
29845405Smsmith    }
29945405Smsmith}
30045405Smsmith
30145405Smsmith