// EncodingHelper.java revision 953:221a84ef44c0
1/* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 
19 */ 20package jdk.nashorn.internal.runtime.regexp.joni; 21 22import java.util.Arrays; 23import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType; 24import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder; 25 26public final class EncodingHelper { 27 28 final static int NEW_LINE = 0x000a; 29 final static int RETURN = 0x000d; 30 final static int LINE_SEPARATOR = 0x2028; 31 final static int PARAGRAPH_SEPARATOR = 0x2029; 32 33 final static char[] EMPTYCHARS = new char[0]; 34 final static int[][] codeRanges = new int[15][]; 35 36 public static int digitVal(final int code) { 37 return code - '0'; 38 } 39 40 public static int odigitVal(final int code) { 41 return digitVal(code); 42 } 43 44 public static boolean isXDigit(final int code) { 45 return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F'); 46 } 47 48 public static int xdigitVal(final int code) { 49 if (Character.isDigit(code)) { 50 return code - '0'; 51 } else if (code >= 'a' && code <= 'f') { 52 return code - 'a' + 10; 53 } else { 54 return code - 'A' + 10; 55 } 56 } 57 58 public static boolean isDigit(final int code) { 59 return code >= '0' && code <= '9'; 60 } 61 62 public static boolean isWord(final int code) { 63 // letter, digit, or '_' 64 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 65 } 66 67 public static boolean isNewLine(final int code) { 68 return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR; 69 } 70 71 public static boolean isNewLine(final char[] chars, final int p, final int end) { 72 return p < end && isNewLine(chars[p]); 73 } 74 75 // Encoding.prevCharHead 76 public static int prevCharHead(final int p, final int s) { 77 return s <= p ? 
-1 : s - 1; 78 } 79 80 /* onigenc_get_right_adjust_char_head_with_prev */ 81 public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) { 82 if (prev != null) prev.value = -1; /* Sorry */ 83 return s; 84 } 85 86 // Encoding.stepBack 87 public static int stepBack(final int p, int s, int n) { 88 while (s != -1 && n-- > 0) { 89 if (s <= p) return -1; 90 s--; 91 } 92 return s; 93 } 94 95 public static int mbcodeStartPosition() { 96 return 0x80; 97 } 98 99 public static char[] caseFoldCodesByString(final int flag, final char c) { 100 char[] codes = EMPTYCHARS; 101 final char upper = toUpperCase(c); 102 103 if (upper != toLowerCase(upper)) { 104 int count = 0; 105 char ch = 0; 106 107 do { 108 final char u = toUpperCase(ch); 109 if (u == upper && ch != c) { 110 // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine. 111 codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1); 112 codes[count++] = ch; 113 } 114 } while (ch++ < 0xffff); 115 } 116 return codes; 117 } 118 119 public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) { 120 for (int c = 0; c < 0xffff; c++) { 121 if (Character.isLowerCase(c)) { 122 final int upper = toUpperCase(c); 123 124 if (upper != c) { 125 fun.apply(c, upper, arg); 126 } 127 } 128 } 129 130 // Some characters have multiple lower case variants, hence we need to do a second run 131 for (int c = 0; c < 0xffff; c++) { 132 if (Character.isLowerCase(c)) { 133 final int upper = toUpperCase(c); 134 135 if (upper != c) { 136 fun.apply(upper, c, arg); 137 } 138 } 139 } 140 } 141 142 public static char toLowerCase(final char c) { 143 return (char)toLowerCase((int)c); 144 } 145 146 public static int toLowerCase(final int c) { 147 if (c < 128) { 148 return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c; 149 } 150 // Do not convert non-ASCII upper case character to ASCII lower case. 
151 final int lower = Character.toLowerCase(c); 152 return (lower < 128) ? c : lower; 153 154 } 155 156 public static char toUpperCase(final char c) { 157 return (char)toUpperCase((int)c); 158 } 159 160 public static int toUpperCase(final int c) { 161 if (c < 128) { 162 return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c; 163 } 164 // Do not convert non-ASCII lower case character to ASCII upper case. 165 final int upper = Character.toUpperCase(c); 166 return (upper < 128) ? c : upper; 167 } 168 169 public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) { 170 sbOut.value = 0x100; // use bitset for codes smaller than 256 171 int[] range = null; 172 173 if (ctype < codeRanges.length) { 174 range = codeRanges[ctype]; 175 176 if (range == null) { 177 // format: [numberOfRanges, rangeStart, rangeEnd, ...] 178 range = new int[16]; 179 int rangeCount = 0; 180 int lastCode = -2; 181 182 for (int code = 0; code <= 0xffff; code++) { 183 if (isCodeCType(code, ctype)) { 184 if (lastCode < code -1) { 185 if (rangeCount * 2 + 2 >= range.length) { 186 range = Arrays.copyOf(range, range.length * 2); 187 } 188 range[rangeCount * 2 + 1] = code; 189 rangeCount++; 190 } 191 range[rangeCount * 2] = lastCode = code; 192 } 193 } 194 195 if (rangeCount * 2 + 1 < range.length) { 196 range = Arrays.copyOf(range, rangeCount * 2 + 1); 197 } 198 199 range[0] = rangeCount; 200 codeRanges[ctype] = range; 201 } 202 } 203 204 return range; 205 } 206 207 // CodeRange.isInCodeRange 208 public static boolean isInCodeRange(final int[] p, final int offset, final int code) { 209 int low = 0; 210 final int n = p[offset]; 211 int high = n ; 212 213 while (low < high) { 214 final int x = (low + high) >> 1; 215 if (code > p[(x << 1) + 2 + offset]) { 216 low = x + 1; 217 } else { 218 high = x; 219 } 220 } 221 return low < n && code >= p[(low << 1) + 1 + offset]; 222 } 223 224 /** 225 * @see <a 
href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a> 226 */ 227 public static boolean isCodeCType(final int code, final int ctype) { 228 int type; 229 switch (ctype) { 230 case CharacterType.NEWLINE: 231 return isNewLine(code); 232 case CharacterType.ALPHA: 233 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0; 234 case CharacterType.BLANK: 235 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR; 236 case CharacterType.CNTRL: 237 type = Character.getType(code); 238 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED; 239 case CharacterType.DIGIT: 240 return EncodingHelper.isDigit(code); 241 case CharacterType.GRAPH: 242 switch (code) { 243 case 0x09: 244 case 0x0a: 245 case 0x0b: 246 case 0x0c: 247 case 0x0d: 248 return false; 249 default: 250 type = Character.getType(code); 251 return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED; 252 } 253 case CharacterType.LOWER: 254 return Character.isLowerCase(code); 255 case CharacterType.PRINT: 256 type = Character.getType(code); 257 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED; 258 case CharacterType.PUNCT: 259 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0; 260 case CharacterType.SPACE: 261 // ECMA 7.2 and 7.3 262 switch (code) { 263 case 0x09: 264 case 0x0a: 265 case 0x0b: 266 case 0x0c: 267 case 0x0d: 268 return true; 269 default: 270 // true if Unicode separator or BOM 271 return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff; 272 } 273 case CharacterType.UPPER: 274 return Character.isUpperCase(code); 275 case CharacterType.XDIGIT: 276 return EncodingHelper.isXDigit(code); 277 case CharacterType.WORD: 278 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 279 case CharacterType.ALNUM: 280 return (1 << Character.getType(code) & 
CharacterType.ALNUM_MASK) != 0; 281 case CharacterType.ASCII: 282 return code < 0x80; 283 default: 284 throw new RuntimeException("illegal character type: " + ctype); 285 } 286 } 287} 288 289