Analyser.java revision 953:221a84ef44c0
1/* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 19 */ 20package jdk.nashorn.internal.runtime.regexp.joni; 21 22import static jdk.nashorn.internal.runtime.regexp.joni.BitStatus.bsAll; 23import static jdk.nashorn.internal.runtime.regexp.joni.BitStatus.bsAt; 24import static jdk.nashorn.internal.runtime.regexp.joni.BitStatus.bsOnAt; 25import static jdk.nashorn.internal.runtime.regexp.joni.Option.isFindCondition; 26import static jdk.nashorn.internal.runtime.regexp.joni.Option.isIgnoreCase; 27import static jdk.nashorn.internal.runtime.regexp.joni.Option.isMultiline; 28import static jdk.nashorn.internal.runtime.regexp.joni.ast.ConsAltNode.newAltNode; 29import static jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode.isRepeatInfinite; 30 31import java.util.HashSet; 32import jdk.nashorn.internal.runtime.regexp.joni.ast.AnchorNode; 33import jdk.nashorn.internal.runtime.regexp.joni.ast.BackRefNode; 34import jdk.nashorn.internal.runtime.regexp.joni.ast.CClassNode; 35import jdk.nashorn.internal.runtime.regexp.joni.ast.ConsAltNode; 36import jdk.nashorn.internal.runtime.regexp.joni.ast.EncloseNode; 37import jdk.nashorn.internal.runtime.regexp.joni.ast.Node; 38import jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode; 39import jdk.nashorn.internal.runtime.regexp.joni.ast.StringNode; 40import jdk.nashorn.internal.runtime.regexp.joni.constants.AnchorType; 41import jdk.nashorn.internal.runtime.regexp.joni.constants.EncloseType; 42import jdk.nashorn.internal.runtime.regexp.joni.constants.NodeType; 43import jdk.nashorn.internal.runtime.regexp.joni.constants.StackPopLevel; 44import jdk.nashorn.internal.runtime.regexp.joni.constants.TargetInfo; 45import jdk.nashorn.internal.runtime.regexp.joni.encoding.ObjPtr; 46import jdk.nashorn.internal.runtime.regexp.joni.exception.InternalException; 47import jdk.nashorn.internal.runtime.regexp.joni.exception.SyntaxException; 48import jdk.nashorn.internal.runtime.regexp.joni.exception.ValueException; 49 50final class Analyser extends Parser { 51 52 protected Analyser(final ScanEnvironment env, final char[] chars, final int p, final int end) { 53 super(env, chars, p, end); 54 } 55 56 protected final void compile() { 57 if (Config.DEBUG) { 58 Config.log.println(new String(chars, getBegin(), getEnd())); 59 } 60 61 reset(); 62 63 regex.numMem = 0; 64 regex.numRepeat = 0; 65 regex.numNullCheck = 0; 66 //regex.repeatRangeAlloc = 0; 67 regex.repeatRangeLo = null; 68 regex.repeatRangeHi = null; 69 70 parse(); 71 72 if (Config.DEBUG_PARSE_TREE_RAW && Config.DEBUG_PARSE_TREE) { 73 Config.log.println("<RAW TREE>"); 74 Config.log.println(root + "\n"); 75 } 76 77 root = setupTree(root, 0); 78 if (Config.DEBUG_PARSE_TREE) { 79 if (Config.DEBUG_PARSE_TREE_RAW) Config.log.println("<TREE>"); 80 root.verifyTree(new HashSet<Node>(), env.reg.warnings); 81 Config.log.println(root + "\n"); 82 } 83 84 regex.captureHistory = env.captureHistory; 85 regex.btMemStart = env.btMemStart; 86 regex.btMemEnd = env.btMemEnd; 87 88 if (isFindCondition(regex.options)) { 89 regex.btMemEnd = bsAll(); 90 } else { 91 regex.btMemEnd = env.btMemEnd; 92 regex.btMemEnd |= regex.captureHistory; 93 } 94 95 regex.clearOptimizeInfo(); 96 97 if (!Config.DONT_OPTIMIZE) setOptimizedInfoFromTree(root); 98 99 env.memNodes = null; 100 101 if (regex.numRepeat != 0 || regex.btMemEnd != 0) { 102 regex.stackPopLevel = StackPopLevel.ALL; 103 } else { 104 if (regex.btMemStart != 0) { 105 regex.stackPopLevel = StackPopLevel.MEM_START; 106 } else { 107 regex.stackPopLevel = StackPopLevel.FREE; 108 } 109 } 110 111 if (Config.DEBUG_COMPILE) { 112 Config.log.println("stack used: " + regex.stackNeeded); 113 if (Config.USE_STRING_TEMPLATES) Config.log.print("templates: " + regex.templateNum + "\n"); 114 Config.log.println(new ByteCodePrinter(regex).byteCodeListToString()); 115 116 } // DEBUG_COMPILE 117 } 118 119 private void swap(final Node a, final Node b) { 120 a.swap(b); 121 122 if (root == b) { 123 root = a; 124 } else if (root == a) { 125 root = b; 126 } 127 } 128 129 // USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK 130 private int quantifiersMemoryInfo(final Node node) { 131 int info = 0; 132 133 switch(node.getType()) { 134 case NodeType.LIST: 135 case NodeType.ALT: 136 ConsAltNode can = (ConsAltNode)node; 137 do { 138 final int v = quantifiersMemoryInfo(can.car); 139 if (v > info) info = v; 140 } while ((can = can.cdr) != null); 141 break; 142 143 case NodeType.QTFR: 144 final QuantifierNode qn = (QuantifierNode)node; 145 if (qn.upper != 0) { 146 info = quantifiersMemoryInfo(qn.target); 147 } 148 break; 149 150 case NodeType.ENCLOSE: 151 final EncloseNode en = (EncloseNode)node; 152 switch (en.type) { 153 case EncloseType.MEMORY: 154 return TargetInfo.IS_EMPTY_MEM; 155 156 case EncloseType.OPTION: 157 case EncloseNode.STOP_BACKTRACK: 158 info = quantifiersMemoryInfo(en.target); 159 break; 160 161 default: 162 break; 163 } // inner switch 164 break; 165 166 case NodeType.BREF: 167 case NodeType.STR: 168 case NodeType.CTYPE: 169 case NodeType.CCLASS: 170 case NodeType.CANY: 171 case NodeType.ANCHOR: 172 default: 173 break; 174 } // switch 175 176 return info; 177 } 178 179 private int getMinMatchLength(final Node node) { 180 int min = 0; 181 182 switch (node.getType()) { 183 case NodeType.BREF: 184 final BackRefNode br = (BackRefNode)node; 185 if (br.isRecursion()) break; 186 187 if (br.backRef > env.numMem) { 188 throw new ValueException(ERR_INVALID_BACKREF); 189 } 190 min = getMinMatchLength(env.memNodes[br.backRef]); 191 192 break; 193 194 case NodeType.LIST: 195 ConsAltNode can = (ConsAltNode)node; 196 do { 197 min += getMinMatchLength(can.car); 198 } while ((can = can.cdr) != null); 199 break; 200 201 case NodeType.ALT: 202 ConsAltNode y = (ConsAltNode)node; 203 do { 204 final Node x = y.car; 205 final int tmin = getMinMatchLength(x); 206 if (y == node) { 207 min = tmin; 208 } else if (min > tmin) { 209 min = tmin; 210 } 211 } while ((y = y.cdr) != null); 212 break; 213 214 case NodeType.STR: 215 min = ((StringNode)node).length(); 216 break; 217 218 case NodeType.CTYPE: 219 min = 1; 220 break; 221 222 case NodeType.CCLASS: 223 case NodeType.CANY: 224 min = 1; 225 break; 226 227 case NodeType.QTFR: 228 final QuantifierNode qn = (QuantifierNode)node; 229 if (qn.lower > 0) { 230 min = getMinMatchLength(qn.target); 231 min = MinMaxLen.distanceMultiply(min, qn.lower); 232 } 233 break; 234 235 case NodeType.ENCLOSE: 236 final EncloseNode en = (EncloseNode)node; 237 switch (en.type) { 238 case EncloseType.MEMORY: 239 if (en.isMinFixed()) { 240 min = en.minLength; 241 } else { 242 min = getMinMatchLength(en.target); 243 en.minLength = min; 244 en.setMinFixed(); 245 } 246 break; 247 248 case EncloseType.OPTION: 249 case EncloseType.STOP_BACKTRACK: 250 min = getMinMatchLength(en.target); 251 break; 252 } // inner switch 253 break; 254 255 case NodeType.ANCHOR: 256 default: 257 break; 258 } // switch 259 260 return min; 261 } 262 263 private int getMaxMatchLength(final Node node) { 264 int max = 0; 265 266 switch (node.getType()) { 267 case NodeType.LIST: 268 ConsAltNode ln = (ConsAltNode)node; 269 do { 270 final int tmax = getMaxMatchLength(ln.car); 271 max = MinMaxLen.distanceAdd(max, tmax); 272 } while ((ln = ln.cdr) != null); 273 break; 274 275 case NodeType.ALT: 276 ConsAltNode an = (ConsAltNode)node; 277 do { 278 final int tmax = getMaxMatchLength(an.car); 279 if (max < tmax) max = tmax; 280 } while ((an = an.cdr) != null); 281 break; 282 283 case NodeType.STR: 284 max = ((StringNode)node).length(); 285 break; 286 287 case NodeType.CTYPE: 288 max = 1; 289 break; 290 291 case NodeType.CCLASS: 292 case NodeType.CANY: 293 max = 1; 294 break; 295 296 case NodeType.BREF: 297 final BackRefNode br = (BackRefNode)node; 298 if (br.isRecursion()) { 299 max = MinMaxLen.INFINITE_DISTANCE; 300 break; 301 } 302 303 if (br.backRef > env.numMem) { 304 throw new ValueException(ERR_INVALID_BACKREF); 305 } 306 final int tmax = getMaxMatchLength(env.memNodes[br.backRef]); 307 if (max < tmax) max = tmax; 308 break; 309 310 case NodeType.QTFR: 311 final QuantifierNode qn = (QuantifierNode)node; 312 if (qn.upper != 0) { 313 max = getMaxMatchLength(qn.target); 314 if (max != 0) { 315 if (!isRepeatInfinite(qn.upper)) { 316 max = MinMaxLen.distanceMultiply(max, qn.upper); 317 } else { 318 max = MinMaxLen.INFINITE_DISTANCE; 319 } 320 } 321 } 322 break; 323 324 case NodeType.ENCLOSE: 325 final EncloseNode en = (EncloseNode)node; 326 switch (en.type) { 327 case EncloseType.MEMORY: 328 if (en.isMaxFixed()) { 329 max = en.maxLength; 330 } else { 331 max = getMaxMatchLength(en.target); 332 en.maxLength = max; 333 en.setMaxFixed(); 334 } 335 break; 336 337 case EncloseType.OPTION: 338 case EncloseType.STOP_BACKTRACK: 339 max = getMaxMatchLength(en.target); 340 break; 341 } // inner switch 342 break; 343 344 case NodeType.ANCHOR: 345 default: 346 break; 347 } // switch 348 349 return max; 350 } 351 352 private static final int GET_CHAR_LEN_VARLEN = -1; 353 private static final int GET_CHAR_LEN_TOP_ALT_VARLEN = -2; 354 protected final int getCharLengthTree(final Node node) { 355 return getCharLengthTree(node, 0); 356 } 357 358 private int getCharLengthTree(final Node node, int level) { 359 level++; 360 361 int len = 0; 362 returnCode = 0; 363 364 switch(node.getType()) { 365 case NodeType.LIST: 366 ConsAltNode ln = (ConsAltNode)node; 367 do { 368 final int tlen = getCharLengthTree(ln.car, level); 369 if (returnCode == 0) len = MinMaxLen.distanceAdd(len, tlen); 370 } while (returnCode == 0 && (ln = ln.cdr) != null); 371 break; 372 373 case NodeType.ALT: 374 ConsAltNode an = (ConsAltNode)node; 375 boolean varLen = false; 376 377 int tlen = getCharLengthTree(an.car, level); 378 while (returnCode == 0 && (an = an.cdr) != null) { 379 final int tlen2 = getCharLengthTree(an.car, level); 380 if (returnCode == 0) { 381 if (tlen != tlen2) varLen = true; 382 } 383 } 384 385 if (returnCode == 0) { 386 if (varLen) { 387 if (level == 1) { 388 returnCode = GET_CHAR_LEN_TOP_ALT_VARLEN; 389 } else { 390 returnCode = GET_CHAR_LEN_VARLEN; 391 } 392 } else { 393 len = tlen; 394 } 395 } 396 break; 397 398 case NodeType.STR: 399 final StringNode sn = (StringNode)node; 400 len = sn.length(); 401 break; 402 403 case NodeType.QTFR: 404 final QuantifierNode qn = (QuantifierNode)node; 405 if (qn.lower == qn.upper) { 406 tlen = getCharLengthTree(qn.target, level); 407 if (returnCode == 0) len = MinMaxLen.distanceMultiply(tlen, qn.lower); 408 } else { 409 returnCode = GET_CHAR_LEN_VARLEN; 410 } 411 break; 412 413 case NodeType.CTYPE: 414 case NodeType.CCLASS: 415 case NodeType.CANY: 416 len = 1; 417 break; 418 419 case NodeType.ENCLOSE: 420 final EncloseNode en = (EncloseNode)node; 421 switch(en.type) { 422 case EncloseType.MEMORY: 423 if (en.isCLenFixed()) { 424 len = en.charLength; 425 } else { 426 len = getCharLengthTree(en.target, level); 427 if (returnCode == 0) { 428 en.charLength = len; 429 en.setCLenFixed(); 430 } 431 } 432 break; 433 434 case EncloseType.OPTION: 435 case EncloseType.STOP_BACKTRACK: 436 len = getCharLengthTree(en.target, level); 437 break; 438 } // inner switch 439 break; 440 441 case NodeType.ANCHOR: 442 break; 443 444 default: 445 returnCode = GET_CHAR_LEN_VARLEN; 446 } // switch 447 return len; 448 } 449 450 /* x is not included y ==> 1 : 0 */ 451 private boolean isNotIncluded(Node x, Node y) { 452 Node tmp; 453 454 // !retry:! 455 retry: while(true) { 456 457 final int yType = y.getType(); 458 459 switch(x.getType()) { 460 case NodeType.CTYPE: 461 switch(yType) { 462 463 case NodeType.CCLASS: 464 // !swap:! 465 tmp = x; 466 x = y; 467 y = tmp; 468 // !goto retry;! 469 continue retry; 470 471 case NodeType.STR: 472 // !goto swap;! 473 tmp = x; 474 x = y; 475 y = tmp; 476 continue retry; 477 478 default: 479 break; 480 } // inner switch 481 break; 482 483 case NodeType.CCLASS: 484 final CClassNode xc = (CClassNode)x; 485 486 switch(yType) { 487 488 case NodeType.CCLASS: 489 final CClassNode yc = (CClassNode)y; 490 491 for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) { 492 boolean v = xc.bs.at(i); 493 if ((v && !xc.isNot()) || (!v && xc.isNot())) { 494 v = yc.bs.at(i); 495 if ((v && !yc.isNot()) || (!v && yc.isNot())) return false; 496 } 497 } 498 if ((xc.mbuf == null && !xc.isNot()) || yc.mbuf == null && !yc.isNot()) return true; 499 return false; 500 // break; not reached 501 502 case NodeType.STR: 503 // !goto swap;! 504 tmp = x; 505 x = y; 506 y = tmp; 507 continue retry; 508 509 default: 510 break; 511 512 } // inner switch 513 break; // case NodeType.CCLASS 514 515 case NodeType.STR: 516 final StringNode xs = (StringNode)x; 517 if (xs.length() == 0) break; 518 519 switch (yType) { 520 521 case NodeType.CCLASS: 522 final CClassNode cc = (CClassNode)y; 523 final int code = xs.chars[xs.p]; 524 return !cc.isCodeInCC(code); 525 526 case NodeType.STR: 527 final StringNode ys = (StringNode)y; 528 int len = xs.length(); 529 if (len > ys.length()) len = ys.length(); 530 if (xs.isAmbig() || ys.isAmbig()) { 531 /* tiny version */ 532 return false; 533 } else { 534 for (int i=0, p=ys.p, q=xs.p; i<len; i++, p++, q++) { 535 if (ys.chars[p] != xs.chars[q]) return true; 536 } 537 } 538 break; 539 540 default: 541 break; 542 } // inner switch 543 544 break; // case NodeType.STR 545 546 } // switch 547 548 break; 549 } // retry: while 550 return false; 551 } 552 553 private Node getHeadValueNode(final Node node, final boolean exact) { 554 Node n = null; 555 556 switch(node.getType()) { 557 case NodeType.BREF: 558 case NodeType.ALT: 559 case NodeType.CANY: 560 break; 561 562 case NodeType.CTYPE: 563 case NodeType.CCLASS: 564 if (!exact) n = node; 565 break; 566 567 case NodeType.LIST: 568 n = getHeadValueNode(((ConsAltNode)node).car, exact); 569 break; 570 571 case NodeType.STR: 572 final StringNode sn = (StringNode)node; 573 if (sn.end <= sn.p) break; // ??? 574 575 if (exact && !sn.isRaw() && isIgnoreCase(regex.options)){ 576 // nothing 577 } else { 578 n = node; 579 } 580 break; 581 582 case NodeType.QTFR: 583 final QuantifierNode qn = (QuantifierNode)node; 584 if (qn.lower > 0) { 585 if (qn.headExact != null) { 586 n = qn.headExact; 587 } else { 588 n = getHeadValueNode(qn.target, exact); 589 } 590 } 591 break; 592 593 case NodeType.ENCLOSE: 594 final EncloseNode en = (EncloseNode)node; 595 596 switch (en.type) { 597 case EncloseType.OPTION: 598 final int options = regex.options; 599 regex.options = en.option; 600 n = getHeadValueNode(en.target, exact); 601 regex.options = options; 602 break; 603 604 case EncloseType.MEMORY: 605 case EncloseType.STOP_BACKTRACK: 606 n = getHeadValueNode(en.target, exact); 607 break; 608 } // inner switch 609 break; 610 611 case NodeType.ANCHOR: 612 final AnchorNode an = (AnchorNode)node; 613 if (an.type == AnchorType.PREC_READ) n = getHeadValueNode(an.target, exact); 614 break; 615 616 default: 617 break; 618 } // switch 619 620 return n; 621 } 622 623 // true: invalid 624 private boolean checkTypeTree(final Node node, final int typeMask, final int encloseMask, final int anchorMask) { 625 if ((node.getType2Bit() & typeMask) == 0) return true; 626 627 boolean invalid = false; 628 629 switch(node.getType()) { 630 case NodeType.LIST: 631 case NodeType.ALT: 632 ConsAltNode can = (ConsAltNode)node; 633 do { 634 invalid = checkTypeTree(can.car, typeMask, encloseMask, anchorMask); 635 } while (!invalid && (can = can.cdr) != null); 636 break; 637 638 case NodeType.QTFR: 639 invalid = checkTypeTree(((QuantifierNode)node).target, typeMask, encloseMask, anchorMask); 640 break; 641 642 case NodeType.ENCLOSE: 643 final EncloseNode en = (EncloseNode)node; 644 if ((en.type & encloseMask) == 0) return true; 645 invalid = checkTypeTree(en.target, typeMask, encloseMask, anchorMask); 646 break; 647 648 case NodeType.ANCHOR: 649 final AnchorNode an = (AnchorNode)node; 650 if ((an.type & anchorMask) == 0) return true; 651 652 if (an.target != null) invalid = checkTypeTree(an.target, typeMask, encloseMask, anchorMask); 653 break; 654 655 default: 656 break; 657 658 } // switch 659 660 return invalid; 661 } 662 663 /* divide different length alternatives in look-behind. 664 (?<=A|B) ==> (?<=A)|(?<=B) 665 (?<!A|B) ==> (?<!A)(?<!B) 666 */ 667 private Node divideLookBehindAlternatives(Node node) { 668 final AnchorNode an = (AnchorNode)node; 669 final int anchorType = an.type; 670 Node head = an.target; 671 Node np = ((ConsAltNode)head).car; 672 673 swap(node, head); 674 675 final Node tmp = node; 676 node = head; 677 head = tmp; 678 679 ((ConsAltNode)node).setCar(head); 680 ((AnchorNode)head).setTarget(np); 681 np = node; 682 683 while ((np = ((ConsAltNode)np).cdr) != null) { 684 final AnchorNode insert = new AnchorNode(anchorType); 685 insert.setTarget(((ConsAltNode)np).car); 686 ((ConsAltNode)np).setCar(insert); 687 } 688 689 if (anchorType == AnchorType.LOOK_BEHIND_NOT) { 690 np = node; 691 do { 692 ((ConsAltNode)np).toListNode(); /* alt -> list */ 693 } while ((np = ((ConsAltNode)np).cdr) != null); 694 } 695 696 return node; 697 } 698 699 private Node setupLookBehind(final Node node) { 700 final AnchorNode an = (AnchorNode)node; 701 final int len = getCharLengthTree(an.target); 702 switch(returnCode) { 703 case 0: 704 an.charLength = len; 705 break; 706 case GET_CHAR_LEN_VARLEN: 707 throw new SyntaxException(ERR_INVALID_LOOK_BEHIND_PATTERN); 708 case GET_CHAR_LEN_TOP_ALT_VARLEN: 709 if (syntax.differentLengthAltLookBehind()) { 710 return divideLookBehindAlternatives(node); 711 } else { 712 throw new SyntaxException(ERR_INVALID_LOOK_BEHIND_PATTERN); 713 } 714 } 715 return node; 716 } 717 718 private void nextSetup(Node node, final Node nextNode) { 719 // retry: 720 retry: while(true) { 721 722 final int type = node.getType(); 723 if (type == NodeType.QTFR) { 724 final QuantifierNode qn = (QuantifierNode)node; 725 if (qn.greedy && isRepeatInfinite(qn.upper)) { 726 if (Config.USE_QTFR_PEEK_NEXT) { 727 final StringNode n = (StringNode)getHeadValueNode(nextNode, true); 728 /* '\0': for UTF-16BE etc... */ 729 if (n != null && n.chars[n.p] != 0) { // ????????? 730 qn.nextHeadExact = n; 731 } 732 } // USE_QTFR_PEEK_NEXT 733 /* automatic posseivation a*b ==> (?>a*)b */ 734 if (qn.lower <= 1) { 735 if (qn.target.isSimple()) { 736 final Node x = getHeadValueNode(qn.target, false); 737 if (x != null) { 738 final Node y = getHeadValueNode(nextNode, false); 739 if (y != null && isNotIncluded(x, y)) { 740 final EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); //onig_node_new_enclose 741 en.setStopBtSimpleRepeat(); 742 //en.setTarget(qn.target); // optimize it ?? 743 swap(node, en); 744 745 en.setTarget(node); 746 } 747 } 748 } 749 } 750 } 751 } else if (type == NodeType.ENCLOSE) { 752 final EncloseNode en = (EncloseNode)node; 753 if (en.isMemory()) { 754 node = en.target; 755 // !goto retry;! 756 continue retry; 757 } 758 } 759 760 break; 761 } // while 762 } 763 764 private void updateStringNodeCaseFoldMultiByte(final StringNode sn) { 765 final char[] chars = sn.chars; 766 final int end = sn.end; 767 value = sn.p; 768 int sp = 0; 769 char buf; 770 771 while (value < end) { 772 final int ovalue = value; 773 buf = EncodingHelper.toLowerCase(chars[value++]); 774 775 if (chars[ovalue] != buf) { 776 777 char[] sbuf = new char[sn.length() << 1]; 778 System.arraycopy(chars, sn.p, sbuf, 0, ovalue - sn.p); 779 value = ovalue; 780 while (value < end) { 781 buf = EncodingHelper.toLowerCase(chars[value++]); 782 if (sp >= sbuf.length) { 783 final char[]tmp = new char[sbuf.length << 1]; 784 System.arraycopy(sbuf, 0, tmp, 0, sbuf.length); 785 sbuf = tmp; 786 } 787 sbuf[sp++] = buf; 788 } 789 sn.set(sbuf, 0, sp); 790 return; 791 } 792 sp++; 793 } 794 } 795 796 private void updateStringNodeCaseFold(final Node node) { 797 final StringNode sn = (StringNode)node; 798 updateStringNodeCaseFoldMultiByte(sn); 799 } 800 801 private Node expandCaseFoldMakeRemString(final char[] chars, final int p, final int end) { 802 final StringNode node = new StringNode(chars, p, end); 803 804 updateStringNodeCaseFold(node); 805 node.setAmbig(); 806 node.setDontGetOptInfo(); 807 return node; 808 } 809 810 private boolean expandCaseFoldStringAlt(final int itemNum, final char[] items, 811 final char[] chars, final int p, final int slen, final int end, final ObjPtr<Node> node) { 812 813 ConsAltNode altNode; 814 node.p = altNode = newAltNode(null, null); 815 816 StringNode snode = new StringNode(chars, p, p + slen); 817 altNode.setCar(snode); 818 819 for (int i=0; i<itemNum; i++) { 820 snode = new StringNode(); 821 822 snode.catCode(items[i]); 823 824 final ConsAltNode an = newAltNode(null, null); 825 an.setCar(snode); 826 altNode.setCdr(an); 827 altNode = an; 828 } 829 return false; 830 } 831 832 private static final int THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION = 8; 833 private Node expandCaseFoldString(final Node node) { 834 final StringNode sn = (StringNode)node; 835 836 if (sn.isAmbig() || sn.length() <= 0) return node; 837 838 final char[] chars = sn.chars; 839 int p = sn.p; 840 final int end = sn.end; 841 int altNum = 1; 842 843 ConsAltNode topRoot = null, root = null; 844 final ObjPtr<Node> prevNode = new ObjPtr<Node>(); 845 StringNode stringNode = null; 846 847 while (p < end) { 848 final char[] items = EncodingHelper.caseFoldCodesByString(regex.caseFoldFlag, chars[p]); 849 850 if (items.length == 0) { 851 if (stringNode == null) { 852 if (root == null && prevNode.p != null) { 853 topRoot = root = ConsAltNode.listAdd(null, prevNode.p); 854 } 855 856 prevNode.p = stringNode = new StringNode(); // onig_node_new_str(NULL, NULL); 857 858 if (root != null) ConsAltNode.listAdd(root, stringNode); 859 860 } 861 862 stringNode.cat(chars, p, p + 1); 863 } else { 864 altNum *= (items.length + 1); 865 if (altNum > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; 866 867 if (root == null && prevNode.p != null) { 868 topRoot = root = ConsAltNode.listAdd(null, prevNode.p); 869 } 870 871 expandCaseFoldStringAlt(items.length, items, chars, p, 1, end, prevNode); 872 if (root != null) ConsAltNode.listAdd(root, prevNode.p); 873 stringNode = null; 874 } 875 p++; 876 } 877 878 if (p < end) { 879 final Node srem = expandCaseFoldMakeRemString(chars, p, end); 880 881 if (prevNode.p != null && root == null) { 882 topRoot = root = ConsAltNode.listAdd(null, prevNode.p); 883 } 884 885 if (root == null) { 886 prevNode.p = srem; 887 } else { 888 ConsAltNode.listAdd(root, srem); 889 } 890 } 891 /* ending */ 892 final Node xnode = topRoot != null ? topRoot : prevNode.p; 893 894 swap(node, xnode); 895 return xnode; 896 } 897 898 private static final int IN_ALT = (1<<0); 899 private static final int IN_NOT = (1<<1); 900 private static final int IN_REPEAT = (1<<2); 901 private static final int IN_VAR_REPEAT = (1<<3); 902 private static final int EXPAND_STRING_MAX_LENGTH = 100; 903 904 /* setup_tree does the following work. 905 1. check empty loop. (set qn->target_empty_info) 906 2. expand ignore-case in char class. 907 3. set memory status bit flags. (reg->mem_stats) 908 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. 909 5. find invalid patterns in look-behind. 910 6. expand repeated string. 911 */ 912 protected final Node setupTree(Node node, int state) { 913 restart: while (true) { 914 switch (node.getType()) { 915 case NodeType.LIST: 916 ConsAltNode lin = (ConsAltNode)node; 917 Node prev = null; 918 do { 919 setupTree(lin.car, state); 920 if (prev != null) { 921 nextSetup(prev, lin.car); 922 } 923 prev = lin.car; 924 } while ((lin = lin.cdr) != null); 925 break; 926 927 case NodeType.ALT: 928 ConsAltNode aln = (ConsAltNode)node; 929 do { 930 setupTree(aln.car, (state | IN_ALT)); 931 } while ((aln = aln.cdr) != null); 932 break; 933 934 case NodeType.CCLASS: 935 break; 936 937 case NodeType.STR: 938 if (isIgnoreCase(regex.options) && !((StringNode)node).isRaw()) { 939 node = expandCaseFoldString(node); 940 } 941 break; 942 943 case NodeType.CTYPE: 944 case NodeType.CANY: 945 break; 946 947 case NodeType.BREF: 948 final BackRefNode br = (BackRefNode)node; 949 if (br.backRef > env.numMem) { 950 throw new ValueException(ERR_INVALID_BACKREF); 951 } 952 env.backrefedMem = bsOnAt(env.backrefedMem, br.backRef); 953 env.btMemStart = bsOnAt(env.btMemStart, br.backRef); 954 ((EncloseNode)env.memNodes[br.backRef]).setMemBackrefed(); 955 break; 956 957 case NodeType.QTFR: 958 final QuantifierNode qn = (QuantifierNode)node; 959 Node target = qn.target; 960 961 if ((state & IN_REPEAT) != 0) qn.setInRepeat(); 962 963 if (isRepeatInfinite(qn.upper) || qn.lower >= 1) { 964 final int d = getMinMatchLength(target); 965 if (d == 0) { 966 qn.targetEmptyInfo = TargetInfo.IS_EMPTY; 967 if (Config.USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT) { 968 final int info = quantifiersMemoryInfo(target); 969 if (info > 0) qn.targetEmptyInfo = info; 970 } // USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK 971 // strange stuff here (turned off) 972 } 973 } 974 975 state |= IN_REPEAT; 976 if (qn.lower != qn.upper) state |= IN_VAR_REPEAT; 977 978 target = setupTree(target, state); 979 980 /* expand string */ 981 if (target.getType() == NodeType.STR) { 982 if (!isRepeatInfinite(qn.lower) && qn.lower == qn.upper && 983 qn.lower > 1 && qn.lower <= EXPAND_STRING_MAX_LENGTH) { 984 final StringNode sn = (StringNode)target; 985 final int len = sn.length(); 986 987 if (len * qn.lower <= EXPAND_STRING_MAX_LENGTH) { 988 final StringNode str = qn.convertToString(sn.flag); 989 final int n = qn.lower; 990 for (int i = 0; i < n; i++) { 991 str.cat(sn.chars, sn.p, sn.end); 992 } 993 break; /* break case NT_QTFR: */ 994 } 995 996 } 997 } 998 if (Config.USE_OP_PUSH_OR_JUMP_EXACT) { 999 if (qn.greedy && qn.targetEmptyInfo != 0) { 1000 if (target.getType() == NodeType.QTFR) { 1001 final QuantifierNode tqn = (QuantifierNode)target; 1002 if (tqn.headExact != null) { 1003 qn.headExact = tqn.headExact; 1004 tqn.headExact = null; 1005 } 1006 } else { 1007 qn.headExact = getHeadValueNode(qn.target, true); 1008 } 1009 } 1010 } // USE_OP_PUSH_OR_JUMP_EXACT 1011 break; 1012 1013 case NodeType.ENCLOSE: 1014 final EncloseNode en = (EncloseNode)node; 1015 switch (en.type) { 1016 case EncloseType.OPTION: 1017 final int options = regex.options; 1018 regex.options = en.option; 1019 setupTree(en.target, state); 1020 regex.options = options; 1021 break; 1022 1023 case EncloseType.MEMORY: 1024 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { 1025 env.btMemStart = bsOnAt(env.btMemStart, en.regNum); 1026 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */ 1027 1028 } 1029 setupTree(en.target, state); 1030 break; 1031 1032 case EncloseType.STOP_BACKTRACK: 1033 setupTree(en.target, state); 1034 if (en.target.getType() == NodeType.QTFR) { 1035 final QuantifierNode tqn = (QuantifierNode)en.target; 1036 if (isRepeatInfinite(tqn.upper) && tqn.lower <= 1 && tqn.greedy) { 1037 /* (?>a*), a*+ etc... */ 1038 if (tqn.target.isSimple()) en.setStopBtSimpleRepeat(); 1039 } 1040 } 1041 break; 1042 1043 } // inner switch 1044 break; 1045 1046 case NodeType.ANCHOR: 1047 final AnchorNode an = (AnchorNode)node; 1048 switch (an.type) { 1049 case AnchorType.PREC_READ: 1050 setupTree(an.target, state); 1051 break; 1052 1053 case AnchorType.PREC_READ_NOT: 1054 setupTree(an.target, (state | IN_NOT)); 1055 break; 1056 1057 case AnchorType.LOOK_BEHIND: 1058 if (checkTypeTree(an.target, NodeType.ALLOWED_IN_LB, EncloseType.ALLOWED_IN_LB, AnchorType.ALLOWED_IN_LB)) { 1059 throw new SyntaxException(ERR_INVALID_LOOK_BEHIND_PATTERN); 1060 } 1061 node = setupLookBehind(node); 1062 if (node.getType() != NodeType.ANCHOR) continue restart; 1063 setupTree(((AnchorNode)node).target, state); 1064 break; 1065 1066 case AnchorType.LOOK_BEHIND_NOT: 1067 if (checkTypeTree(an.target, NodeType.ALLOWED_IN_LB, EncloseType.ALLOWED_IN_LB, AnchorType.ALLOWED_IN_LB)) { 1068 throw new SyntaxException(ERR_INVALID_LOOK_BEHIND_PATTERN); 1069 } 1070 node = setupLookBehind(node); 1071 if (node.getType() != NodeType.ANCHOR) continue restart; 1072 setupTree(((AnchorNode)node).target, (state | IN_NOT)); 1073 break; 1074 1075 } // inner switch 1076 break; 1077 } // switch 1078 return node; 1079 } // restart: while 1080 } 1081 1082 private static final int MAX_NODE_OPT_INFO_REF_COUNT = 5; 1083 private void optimizeNodeLeft(final Node node, final NodeOptInfo opt, final OptEnvironment oenv) { // oenv remove, pass mmd 1084 opt.clear(); 1085 opt.setBoundNode(oenv.mmd); 1086 1087 switch (node.getType()) { 1088 case NodeType.LIST: { 1089 final OptEnvironment nenv = new OptEnvironment(); 1090 final NodeOptInfo nopt = new NodeOptInfo(); 1091 nenv.copy(oenv); 1092 ConsAltNode lin = (ConsAltNode)node; 1093 do { 1094 optimizeNodeLeft(lin.car, nopt, nenv); 1095 nenv.mmd.add(nopt.length); 1096 opt.concatLeftNode(nopt); 1097 } while ((lin = lin.cdr) != null); 1098 break; 1099 } 1100 1101 case NodeType.ALT: { 1102 final NodeOptInfo nopt = new NodeOptInfo(); 1103 ConsAltNode aln = (ConsAltNode)node; 1104 do { 1105 optimizeNodeLeft(aln.car, nopt, oenv); 1106 if (aln == node) { 1107 opt.copy(nopt); 1108 } else { 1109 opt.altMerge(nopt, oenv); 1110 } 1111 } while ((aln = aln.cdr) != null); 1112 break; 1113 } 1114 1115 case NodeType.STR: { 1116 final StringNode sn = (StringNode)node; 1117 1118 final int slen = sn.length(); 1119 1120 if (!sn.isAmbig()) { 1121 opt.exb.concatStr(sn.chars, sn.p, sn.end, sn.isRaw()); 1122 1123 if (slen > 0) { 1124 opt.map.addChar(sn.chars[sn.p]); 1125 } 1126 1127 opt.length.set(slen, slen); 1128 } else { 1129 int max; 1130 if (sn.isDontGetOptInfo()) { 1131 max = sn.length(); 1132 } else { 1133 opt.exb.concatStr(sn.chars, sn.p, sn.end, sn.isRaw()); 1134 opt.exb.ignoreCase = true; 1135 1136 if (slen > 0) { 1137 opt.map.addCharAmb(sn.chars, sn.p, sn.end, oenv.caseFoldFlag); 1138 } 1139 1140 max = slen; 1141 } 1142 opt.length.set(slen, max); 1143 } 1144 1145 if (opt.exb.length == slen) { 1146 opt.exb.reachEnd = true; 1147 } 1148 break; 1149 } 1150 1151 case NodeType.CCLASS: { 1152 final CClassNode cc = (CClassNode)node; 1153 /* no need to check ignore case. (setted in setup_tree()) */ 1154 if (cc.mbuf != null || cc.isNot()) { 1155 opt.length.set(1, 1); 1156 } else { 1157 for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) { 1158 final boolean z = cc.bs.at(i); 1159 if ((z && !cc.isNot()) || (!z && cc.isNot())) { 1160 opt.map.addChar(i); 1161 } 1162 } 1163 opt.length.set(1, 1); 1164 } 1165 break; 1166 } 1167 1168 case NodeType.CANY: { 1169 opt.length.set(1, 1); 1170 break; 1171 } 1172 1173 case NodeType.ANCHOR: { 1174 final AnchorNode an = (AnchorNode)node; 1175 switch (an.type) { 1176 case AnchorType.BEGIN_BUF: 1177 case AnchorType.BEGIN_POSITION: 1178 case AnchorType.BEGIN_LINE: 1179 case AnchorType.END_BUF: 1180 case AnchorType.SEMI_END_BUF: 1181 case AnchorType.END_LINE: 1182 opt.anchor.add(an.type); 1183 break; 1184 1185 case AnchorType.PREC_READ: 1186 final NodeOptInfo nopt = new NodeOptInfo(); 1187 optimizeNodeLeft(an.target, nopt, oenv); 1188 if (nopt.exb.length > 0) { 1189 opt.expr.copy(nopt.exb); 1190 } else if (nopt.exm.length > 0) { 1191 opt.expr.copy(nopt.exm); 1192 } 1193 opt.expr.reachEnd = false; 1194 if (nopt.map.value > 0) opt.map.copy(nopt.map); 1195 break; 1196 1197 case AnchorType.PREC_READ_NOT: 1198 case AnchorType.LOOK_BEHIND: /* Sorry, I can't make use of it. */ 1199 case AnchorType.LOOK_BEHIND_NOT: 1200 break; 1201 1202 } // inner switch 1203 break; 1204 } 1205 1206 case NodeType.BREF: { 1207 final BackRefNode br = (BackRefNode)node; 1208 1209 if (br.isRecursion()) { 1210 opt.length.set(0, MinMaxLen.INFINITE_DISTANCE); 1211 break; 1212 } 1213 1214 final Node[]nodes = oenv.scanEnv.memNodes; 1215 1216 final int min = getMinMatchLength(nodes[br.backRef]); 1217 final int max = getMaxMatchLength(nodes[br.backRef]); 1218 1219 opt.length.set(min, max); 1220 break; 1221 } 1222 1223 1224 case NodeType.QTFR: { 1225 final NodeOptInfo nopt = new NodeOptInfo(); 1226 final QuantifierNode qn = (QuantifierNode)node; 1227 optimizeNodeLeft(qn.target, nopt, oenv); 1228 if (qn.lower == 0 && isRepeatInfinite(qn.upper)) { 1229 if (oenv.mmd.max == 0 && qn.target.getType() == NodeType.CANY && qn.greedy) { 1230 if (isMultiline(oenv.options)) { 1231 opt.anchor.add(AnchorType.ANYCHAR_STAR_ML); 1232 } else { 1233 opt.anchor.add(AnchorType.ANYCHAR_STAR); 1234 } 1235 } 1236 } else { 1237 if (qn.lower > 0) { 1238 opt.copy(nopt); 1239 if (nopt.exb.length > 0) { 1240 if (nopt.exb.reachEnd) { 1241 int i; 1242 for (i = 2; i <= qn.lower && !opt.exb.isFull(); i++) { 1243 opt.exb.concat(nopt.exb); 1244 } 1245 if (i < qn.lower) { 1246 opt.exb.reachEnd = false; 1247 } 1248 } 1249 } 1250 if (qn.lower != qn.upper) { 1251 opt.exb.reachEnd = false; 1252 opt.exm.reachEnd = false; 1253 } 1254 if (qn.lower > 1) { 1255 opt.exm.reachEnd = false; 1256 } 1257 1258 } 1259 } 1260 final int min = MinMaxLen.distanceMultiply(nopt.length.min, qn.lower); 1261 int max; 1262 if (isRepeatInfinite(qn.upper)) { 1263 max = nopt.length.max > 0 ? MinMaxLen.INFINITE_DISTANCE : 0; 1264 } else { 1265 max = MinMaxLen.distanceMultiply(nopt.length.max, qn.upper); 1266 } 1267 opt.length.set(min, max); 1268 break; 1269 } 1270 1271 case NodeType.ENCLOSE: { 1272 final EncloseNode en = (EncloseNode)node; 1273 switch (en.type) { 1274 case EncloseType.OPTION: 1275 final int save = oenv.options; 1276 oenv.options = en.option; 1277 optimizeNodeLeft(en.target, opt, oenv); 1278 oenv.options = save; 1279 break; 1280 1281 case EncloseType.MEMORY: 1282 if (++en.optCount > MAX_NODE_OPT_INFO_REF_COUNT) { 1283 int min = 0; 1284 int max = MinMaxLen.INFINITE_DISTANCE; 1285 if (en.isMinFixed()) min = en.minLength; 1286 if (en.isMaxFixed()) max = en.maxLength; 1287 opt.length.set(min, max); 1288 } else { // USE_SUBEXP_CALL 1289 optimizeNodeLeft(en.target, opt, oenv); 1290 if (opt.anchor.isSet(AnchorType.ANYCHAR_STAR_MASK)) { 1291 if (bsAt(oenv.scanEnv.backrefedMem, en.regNum)) { 1292 opt.anchor.remove(AnchorType.ANYCHAR_STAR_MASK); 1293 } 1294 } 1295 } 1296 break; 1297 1298 case EncloseType.STOP_BACKTRACK: 1299 optimizeNodeLeft(en.target, opt, oenv); 1300 break; 1301 } // inner switch 1302 break; 1303 } 1304 1305 default: 1306 throw new InternalException(ERR_PARSER_BUG); 1307 } // switch 1308 } 1309 1310 protected final void setOptimizedInfoFromTree(final Node node) { 1311 final NodeOptInfo opt = new NodeOptInfo(); 1312 final OptEnvironment oenv = new OptEnvironment(); 1313 1314 oenv.options = regex.options; 1315 oenv.caseFoldFlag = regex.caseFoldFlag; 1316 oenv.scanEnv = env; 1317 oenv.mmd.clear(); // ?? 1318 1319 optimizeNodeLeft(node, opt, oenv); 1320 1321 regex.anchor = opt.anchor.leftAnchor & (AnchorType.BEGIN_BUF | 1322 AnchorType.BEGIN_POSITION | 1323 AnchorType.ANYCHAR_STAR | 1324 AnchorType.ANYCHAR_STAR_ML); 1325 1326 regex.anchor |= opt.anchor.rightAnchor & (AnchorType.END_BUF | 1327 AnchorType.SEMI_END_BUF); 1328 1329 if ((regex.anchor & (AnchorType.END_BUF | AnchorType.SEMI_END_BUF)) != 0) { 1330 regex.anchorDmin = opt.length.min; 1331 regex.anchorDmax = opt.length.max; 1332 } 1333 1334 if (opt.exb.length > 0 || opt.exm.length > 0) { 1335 opt.exb.select(opt.exm); 1336 if (opt.map.value > 0 && opt.exb.compare(opt.map) > 0) { 1337 // !goto set_map;! 1338 regex.setOptimizeMapInfo(opt.map); 1339 regex.setSubAnchor(opt.map.anchor); 1340 } else { 1341 regex.setExactInfo(opt.exb); 1342 regex.setSubAnchor(opt.exb.anchor); 1343 } 1344 } else if (opt.map.value > 0) { 1345 // !set_map:! 1346 regex.setOptimizeMapInfo(opt.map); 1347 regex.setSubAnchor(opt.map.anchor); 1348 } else { 1349 regex.subAnchor |= opt.anchor.leftAnchor & AnchorType.BEGIN_LINE; 1350 if (opt.length.max == 0) regex.subAnchor |= opt.anchor.rightAnchor & AnchorType.END_LINE; 1351 } 1352 1353 if (Config.DEBUG_COMPILE || Config.DEBUG_MATCH) { 1354 Config.log.println(regex.optimizeInfoToString()); 1355 } 1356 } 1357} 1358