// EncodingHelper.java revision 953:221a84ef44c0
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
20package jdk.nashorn.internal.runtime.regexp.joni;
21
22import java.util.Arrays;
23import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
24import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
25
26public final class EncodingHelper {
27
28    final static int NEW_LINE            = 0x000a;
29    final static int RETURN              = 0x000d;
30    final static int LINE_SEPARATOR      = 0x2028;
31    final static int PARAGRAPH_SEPARATOR = 0x2029;
32
33    final static char[] EMPTYCHARS = new char[0];
34    final static int[][] codeRanges = new int[15][];
35
36    public static int digitVal(final int code) {
37        return code - '0';
38    }
39
40    public static int odigitVal(final int code) {
41        return digitVal(code);
42    }
43
44    public static boolean isXDigit(final int code) {
45        return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
46    }
47
48    public static int xdigitVal(final int code) {
49        if (Character.isDigit(code)) {
50            return code - '0';
51        } else if (code >= 'a' && code <= 'f') {
52            return code - 'a' + 10;
53        } else {
54            return code - 'A' + 10;
55        }
56    }
57
58    public static boolean isDigit(final int code) {
59        return code >= '0' && code <= '9';
60    }
61
62    public static boolean isWord(final int code) {
63        // letter, digit, or '_'
64        return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
65    }
66
67    public static boolean isNewLine(final int code) {
68        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
69    }
70
71    public static boolean isNewLine(final char[] chars, final int p, final int end) {
72        return p < end && isNewLine(chars[p]);
73    }
74
75    // Encoding.prevCharHead
76    public static int prevCharHead(final int p, final int s) {
77        return s <= p ? -1 : s - 1;
78    }
79
80    /* onigenc_get_right_adjust_char_head_with_prev */
81    public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
82        if (prev != null) prev.value = -1; /* Sorry */
83        return s;
84    }
85
86    // Encoding.stepBack
87    public static int stepBack(final int p, int s, int n) {
88       while (s != -1 && n-- > 0) {
89           if (s <= p) return -1;
90           s--;
91       }
92       return s;
93    }
94
95    public static int mbcodeStartPosition() {
96        return 0x80;
97    }
98
99    public static char[] caseFoldCodesByString(final int flag, final char c) {
100        char[] codes = EMPTYCHARS;
101        final char upper = toUpperCase(c);
102
103        if (upper != toLowerCase(upper)) {
104            int count = 0;
105            char ch = 0;
106
107            do {
108                final char u = toUpperCase(ch);
109                if (u == upper && ch != c) {
110                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
111                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
112                    codes[count++] = ch;
113                }
114            } while (ch++ < 0xffff);
115        }
116        return codes;
117    }
118
119    public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
120        for (int c = 0; c < 0xffff; c++) {
121            if (Character.isLowerCase(c)) {
122                final int upper = toUpperCase(c);
123
124                if (upper != c) {
125                    fun.apply(c, upper, arg);
126                }
127            }
128        }
129
130        // Some characters have multiple lower case variants, hence we need to do a second run
131        for (int c = 0; c < 0xffff; c++) {
132            if (Character.isLowerCase(c)) {
133                final int upper = toUpperCase(c);
134
135                if (upper != c) {
136                    fun.apply(upper, c, arg);
137                }
138            }
139        }
140    }
141
142    public static char toLowerCase(final char c) {
143        return (char)toLowerCase((int)c);
144    }
145
146    public static int toLowerCase(final int c) {
147        if (c < 128) {
148            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
149        }
150        // Do not convert non-ASCII upper case character to ASCII lower case.
151        final int lower = Character.toLowerCase(c);
152        return (lower < 128) ? c : lower;
153
154    }
155
156    public static char toUpperCase(final char c) {
157        return (char)toUpperCase((int)c);
158    }
159
160    public static int toUpperCase(final int c) {
161        if (c < 128) {
162            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
163        }
164        // Do not convert non-ASCII lower case character to ASCII upper case.
165        final int upper = Character.toUpperCase(c);
166        return (upper < 128) ? c : upper;
167    }
168
169    public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
170        sbOut.value = 0x100; // use bitset for codes smaller than 256
171        int[] range = null;
172
173        if (ctype < codeRanges.length) {
174            range = codeRanges[ctype];
175
176            if (range == null) {
177                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
178                range = new int[16];
179                int rangeCount = 0;
180                int lastCode = -2;
181
182                for (int code = 0; code <= 0xffff; code++) {
183                    if (isCodeCType(code, ctype)) {
184                        if (lastCode < code -1) {
185                            if (rangeCount * 2 + 2 >= range.length) {
186                                range = Arrays.copyOf(range, range.length * 2);
187                            }
188                            range[rangeCount * 2 + 1] = code;
189                            rangeCount++;
190                        }
191                        range[rangeCount * 2] = lastCode = code;
192                    }
193                }
194
195                if (rangeCount * 2 + 1 < range.length) {
196                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
197                }
198
199                range[0] = rangeCount;
200                codeRanges[ctype] = range;
201            }
202        }
203
204        return range;
205    }
206
207    // CodeRange.isInCodeRange
208    public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
209        int low = 0;
210        final int n = p[offset];
211        int high = n ;
212
213        while (low < high) {
214            final int x = (low + high) >> 1;
215            if (code > p[(x << 1) + 2 + offset]) {
216                low = x + 1;
217            } else {
218                high = x;
219            }
220        }
221        return low < n && code >= p[(low << 1) + 1 + offset];
222    }
223
224    /**
225     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
226     */
227    public static boolean isCodeCType(final int code, final int ctype) {
228        int type;
229        switch (ctype) {
230            case CharacterType.NEWLINE:
231                return isNewLine(code);
232            case CharacterType.ALPHA:
233                return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
234            case CharacterType.BLANK:
235                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
236            case CharacterType.CNTRL:
237                type = Character.getType(code);
238                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
239            case CharacterType.DIGIT:
240                return EncodingHelper.isDigit(code);
241            case CharacterType.GRAPH:
242                switch (code) {
243                    case 0x09:
244                    case 0x0a:
245                    case 0x0b:
246                    case 0x0c:
247                    case 0x0d:
248                        return false;
249                    default:
250                        type = Character.getType(code);
251                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
252                }
253            case CharacterType.LOWER:
254                return Character.isLowerCase(code);
255            case CharacterType.PRINT:
256                type = Character.getType(code);
257                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
258            case CharacterType.PUNCT:
259                return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
260            case CharacterType.SPACE:
261                // ECMA 7.2 and 7.3
262                switch (code) {
263                    case 0x09:
264                    case 0x0a:
265                    case 0x0b:
266                    case 0x0c:
267                    case 0x0d:
268                        return true;
269                    default:
270                        // true if Unicode separator or BOM
271                        return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
272                }
273            case CharacterType.UPPER:
274                return Character.isUpperCase(code);
275            case CharacterType.XDIGIT:
276                return EncodingHelper.isXDigit(code);
277            case CharacterType.WORD:
278                return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
279            case CharacterType.ALNUM:
280                return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
281            case CharacterType.ASCII:
282                return code < 0x80;
283            default:
284                throw new RuntimeException("illegal character type: " + ctype);
285        }
286    }
287}
288
289