// EncodingHelper.java revision 1453:a261947d1e0e
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package jdk.nashorn.internal.runtime.regexp.joni;

import java.util.Arrays;
import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;

/**
 * Static helpers for classifying, case-converting and navigating UTF-16 code units
 * (values {@code 0x0000}-{@code 0xffff}) on behalf of the regular expression engine.
 */
@SuppressWarnings("javadoc")
public final class EncodingHelper {

    static final int NEW_LINE            = 0x000a;
    static final int RETURN              = 0x000d;
    static final int LINE_SEPARATOR      = 0x2028;
    static final int PARAGRAPH_SEPARATOR = 0x2029;

    /** Shared empty result for {@link #caseFoldCodesByString(int, char)}. */
    static final char[] EMPTYCHARS = new char[0];

    /** Lazily filled cache of code range tables, indexed by character type; see {@link #ctypeCodeRange(int, IntHolder)}. */
    static final int[][] codeRanges = new int[15][];

    /**
     * Returns the numeric value of a decimal digit character.
     * The argument is not validated; the result is simply {@code code - '0'}.
     *
     * @param code character code
     * @return {@code code - '0'}
     */
    public static int digitVal(final int code) {
        return code - '0';
    }

    /**
     * Returns the numeric value of an octal digit character. Delegates to {@link #digitVal(int)}.
     *
     * @param code character code
     * @return {@code code - '0'}
     */
    public static int odigitVal(final int code) {
        return digitVal(code);
    }

    /**
     * Tests whether the code is an ASCII hexadecimal digit ({@code 0-9}, {@code a-f}, {@code A-F}).
     *
     * <p>Only ASCII digits are accepted. {@link Character#isDigit(int)} must not be used here
     * because it also accepts non-ASCII Unicode decimal digits, for which
     * {@link #xdigitVal(int)} cannot produce a value in the range 0..15.
     *
     * @param code character code
     * @return true if the code is an ASCII hexadecimal digit
     */
    public static boolean isXDigit(final int code) {
        return isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
    }

    /**
     * Returns the numeric value (0..15) of a hexadecimal digit.
     * The result is only meaningful when {@link #isXDigit(int)} is true for {@code code};
     * any code that is neither an ASCII digit nor in {@code a-f} is treated as upper case {@code A-F}.
     *
     * @param code character code
     * @return the value of the hexadecimal digit
     */
    public static int xdigitVal(final int code) {
        if (isDigit(code)) {
            return code - '0';
        } else if (code >= 'a' && code <= 'f') {
            return code - 'a' + 10;
        } else {
            return code - 'A' + 10;
        }
    }

    /**
     * Tests whether the code is an ASCII decimal digit.
     *
     * @param code character code
     * @return true if the code is in {@code '0'..'9'}
     */
    public static boolean isDigit(final int code) {
        return code >= '0' && code <= '9';
    }

    /**
     * Tests whether the Unicode general category of the code is contained in
     * {@code CharacterType.WORD_MASK}.
     *
     * @param code character code
     * @return true if the code is a word character
     */
    public static boolean isWord(final int code) {
        // letter, digit, or '_'
        return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
    }

    /**
     * Tests whether the code is one of the line terminators U+000A, U+000D, U+2028 or U+2029.
     *
     * @param code character code
     * @return true if the code is a line terminator
     */
    public static boolean isNewLine(final int code) {
        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
    }

    /**
     * Tests whether position {@code p} lies before {@code end} and holds a line terminator.
     *
     * @param chars character array
     * @param p     index to test
     * @param end   exclusive upper bound for {@code p}
     * @return true if {@code p < end} and {@code chars[p]} is a line terminator
     */
    public static boolean isNewLine(final char[] chars, final int p, final int end) {
        return p < end && isNewLine(chars[p]);
    }

    /**
     * Returns the index of the character preceding {@code s}, or -1 if {@code s} is at or
     * before the lower bound {@code p}. (Encoding.prevCharHead)
     *
     * @param p lower bound
     * @param s current position
     * @return {@code s - 1}, or -1 if {@code s <= p}
     */
    public static int prevCharHead(final int p, final int s) {
        return s <= p ? -1 : s - 1;
    }

    /**
     * Returns {@code s} unchanged and, if a holder is supplied, sets its value to -1.
     * Counterpart of {@code onigenc_get_right_adjust_char_head_with_prev}.
     *
     * @param s    current position
     * @param prev optional holder for the previous position; may be null
     * @return {@code s}
     */
    public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
        if (prev != null) {
            prev.value = -1; /* Sorry */
        }
        return s;
    }

    /**
     * Steps back up to {@code np} positions from {@code sp} without going below {@code p}.
     * (Encoding.stepBack)
     *
     * @param p  lower bound
     * @param sp start position
     * @param np number of positions to step back
     * @return the resulting position, or -1 if a step would have to be taken from a
     *         position at or before {@code p}
     */
    public static int stepBack(final int p, final int sp, final int np) {
        int s = sp, n = np;
        while (s != -1 && n-- > 0) {
            if (s <= p) {
                return -1;
            }
            s--;
        }
        return s;
    }

    /**
     * @return the constant {@code 0x80}
     */
    public static int mbcodeStartPosition() {
        return 0x80;
    }

    /**
     * Collects every other character in {@code 0x0000..0xffff} whose upper case form
     * (per {@link #toUpperCase(char)}) equals that of {@code c}.
     *
     * <p>Characters whose upper case form maps to itself under {@link #toLowerCase(char)}
     * yield an empty array without scanning.
     *
     * @param flag not used by this method
     * @param c    character to find case variants for
     * @return the case variants of {@code c}, excluding {@code c} itself; possibly empty
     */
    public static char[] caseFoldCodesByString(final int flag, final char c) {
        char[] codes = EMPTYCHARS;
        final char upper = toUpperCase(c);

        if (upper != toLowerCase(upper)) {
            int count = 0;
            char ch = 0;

            // do/while with post-increment so that 0xffff is visited and the char does not wrap into another pass.
            do {
                final char u = toUpperCase(ch);
                if (u == upper && ch != c) {
                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
                    codes[count++] = ch;
                }
            } while (ch++ < 0xffff);
        }
        return codes;
    }

    /**
     * Reports every lower case / upper case pair below {@code 0xffff} to
     * {@code ApplyCaseFold.apply}: first as (lower, upper) for all pairs, then as
     * (upper, lower) for all pairs.
     *
     * @param flag not used by this method
     * @param fun  not used by this method; the static {@code ApplyCaseFold.apply} is called instead
     * @param arg  opaque argument passed through to {@code ApplyCaseFold.apply}
     */
    public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
        for (int c = 0; c < 0xffff; c++) {
            if (Character.isLowerCase(c)) {
                final int upper = toUpperCase(c);

                if (upper != c) {
                    ApplyCaseFold.apply(c, upper, arg);
                }
            }
        }

        // Some characters have multiple lower case variants, hence we need to do a second run
        for (int c = 0; c < 0xffff; c++) {
            if (Character.isLowerCase(c)) {
                final int upper = toUpperCase(c);

                if (upper != c) {
                    ApplyCaseFold.apply(upper, c, arg);
                }
            }
        }
    }

    /**
     * Char overload of {@link #toLowerCase(int)}.
     *
     * @param c character
     * @return lower case form of {@code c}
     */
    public static char toLowerCase(final char c) {
        return (char)toLowerCase((int)c);
    }

    /**
     * Converts a code to lower case. Codes below 128 are converted only if they are in
     * {@code 'A'..'Z'}; other codes use {@link Character#toLowerCase(int)}, except that a
     * result below 128 is rejected and the original code is returned.
     *
     * @param c character code
     * @return lower case form of {@code c}
     */
    public static int toLowerCase(final int c) {
        if (c < 128) {
            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
        }
        // Do not convert non-ASCII upper case character to ASCII lower case.
        final int lower = Character.toLowerCase(c);
        return (lower < 128) ? c : lower;
    }

    /**
     * Char overload of {@link #toUpperCase(int)}.
     *
     * @param c character
     * @return upper case form of {@code c}
     */
    public static char toUpperCase(final char c) {
        return (char)toUpperCase((int)c);
    }

    /**
     * Converts a code to upper case. Codes below 128 are converted only if they are in
     * {@code 'a'..'z'}; other codes use {@link Character#toUpperCase(int)}, except that a
     * result below 128 is rejected and the original code is returned.
     *
     * @param c character code
     * @return upper case form of {@code c}
     */
    public static int toUpperCase(final int c) {
        if (c < 128) {
            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
        }
        // Do not convert non-ASCII lower case character to ASCII upper case.
        final int upper = Character.toUpperCase(c);
        return (upper < 128) ? c : upper;
    }

    /**
     * Returns the table of code ranges in {@code 0x0000..0xffff} that belong to the given
     * character type, computing and caching it in {@link #codeRanges} on first use.
     *
     * <p>The table layout is {@code [numberOfRanges, rangeStart, rangeEnd, ...]} with
     * inclusive bounds, in ascending order.
     *
     * @param ctype character type, as understood by {@link #isCodeCType(int, int)}
     * @param sbOut receives {@code 0x100} (use bitset for codes smaller than 256)
     * @return the range table, or null if {@code ctype} is not smaller than the cache size
     */
    public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
        sbOut.value = 0x100; // use bitset for codes smaller than 256
        int[] range = null;

        if (ctype < codeRanges.length) {
            range = codeRanges[ctype];

            if (range == null) {
                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
                range = new int[16];
                int rangeCount = 0;
                int lastCode = -2;

                for (int code = 0; code <= 0xffff; code++) {
                    if (isCodeCType(code, ctype)) {
                        if (lastCode < code - 1) {
                            // Gap since the previous match: open a new range, growing the table if needed.
                            if (rangeCount * 2 + 2 >= range.length) {
                                range = Arrays.copyOf(range, range.length * 2);
                            }
                            range[rangeCount * 2 + 1] = code;
                            rangeCount++;
                        }
                        // Extend the end of the current range to this code.
                        range[rangeCount * 2] = lastCode = code;
                    }
                }

                // Trim the table to the slots actually used.
                if (rangeCount * 2 + 1 < range.length) {
                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
                }

                range[0] = rangeCount;
                codeRanges[ctype] = range;
            }
        }

        return range;
    }

    /**
     * Binary-searches a range table of the form {@code [n, start0, end0, start1, end1, ...]}
     * located at {@code offset} in {@code p}. (CodeRange.isInCodeRange)
     *
     * @param p      array containing the range table
     * @param offset index of the range count within {@code p}
     * @param code   character code to look up
     * @return true if {@code code} lies within one of the ranges
     */
    public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
        int low = 0;
        final int n = p[offset];
        int high = n;

        // Find the first range whose end is not below code.
        while (low < high) {
            final int x = (low + high) >>> 1;
            if (code > p[(x << 1) + 2 + offset]) {
                low = x + 1;
            } else {
                high = x;
            }
        }
        // code is in that range iff it is not below the range's start.
        return low < n && code >= p[(low << 1) + 1 + offset];
    }

    /**
     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
     *
     * @param code code
     * @param ctype ctype
     *
     * @return isCodeCType
     */
    public static boolean isCodeCType(final int code, final int ctype) {
        int type;
        switch (ctype) {
            case CharacterType.NEWLINE:
                return isNewLine(code);
            case CharacterType.ALPHA:
                return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
            case CharacterType.BLANK:
                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
            case CharacterType.CNTRL:
                type = Character.getType(code);
                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
            case CharacterType.DIGIT:
                return EncodingHelper.isDigit(code);
            case CharacterType.GRAPH:
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return false;
                    default:
                        type = Character.getType(code);
                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
                }
            case CharacterType.LOWER:
                return Character.isLowerCase(code);
            case CharacterType.PRINT:
                type = Character.getType(code);
                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
            case CharacterType.PUNCT:
                return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
            case CharacterType.SPACE:
                // ECMA 7.2 and 7.3
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return true;
                    default:
                        // true if Unicode separator or BOM or U+180E (see JDK-8138758)
                        return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0
                            || code == 0xfeff || code == 0x180e;
                }
            case CharacterType.UPPER:
                return Character.isUpperCase(code);
            case CharacterType.XDIGIT:
                return EncodingHelper.isXDigit(code);
            case CharacterType.WORD:
                return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
            case CharacterType.ALNUM:
                return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
            case CharacterType.ASCII:
                return code < 0x80;
            default:
                throw new RuntimeException("illegal character type: " + ctype);
        }
    }
}