1/* nfa - NFA construction routines */ 2 3/* Copyright (c) 1990 The Regents of the University of California. */ 4/* All rights reserved. */ 5 6/* This code is derived from software contributed to Berkeley by */ 7/* Vern Paxson. */ 8 9/* The United States Government has rights in this work pursuant */ 10/* to contract no. DE-AC03-76SF00098 between the United States */ 11/* Department of Energy and the University of California. */ 12 13/* This file is part of flex. */ 14 15/* Redistribution and use in source and binary forms, with or without */ 16/* modification, are permitted provided that the following conditions */ 17/* are met: */ 18 19/* 1. Redistributions of source code must retain the above copyright */ 20/* notice, this list of conditions and the following disclaimer. */ 21/* 2. Redistributions in binary form must reproduce the above copyright */ 22/* notice, this list of conditions and the following disclaimer in the */ 23/* documentation and/or other materials provided with the distribution. */ 24 25/* Neither the name of the University nor the names of its contributors */ 26/* may be used to endorse or promote products derived from this software */ 27/* without specific prior written permission. */ 28 29/* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 30/* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 31/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 32/* PURPOSE. */ 33 34#include "flexdef.h" 35 36 37/* declare functions that have forward references */ 38 39int dupmachine PROTO ((int)); 40void mkxtion PROTO ((int, int)); 41 42 43/* add_accept - add an accepting state to a machine 44 * 45 * accepting_number becomes mach's accepting number. 46 */ 47 48void add_accept (mach, accepting_number) 49 int mach, accepting_number; 50{ 51 /* Hang the accepting number off an epsilon state. if it is associated 52 * with a state that has a non-epsilon out-transition, then the state 53 * will accept BEFORE it makes that transition, i.e., one character 54 * too soon. 55 */ 56 57 if (transchar[finalst[mach]] == SYM_EPSILON) 58 accptnum[finalst[mach]] = accepting_number; 59 60 else { 61 int astate = mkstate (SYM_EPSILON); 62 63 accptnum[astate] = accepting_number; 64 (void) link_machines (mach, astate); 65 } 66} 67 68 69/* copysingl - make a given number of copies of a singleton machine 70 * 71 * synopsis 72 * 73 * newsng = copysingl( singl, num ); 74 * 75 * newsng - a new singleton composed of num copies of singl 76 * singl - a singleton machine 77 * num - the number of copies of singl to be present in newsng 78 */ 79 80int copysingl (singl, num) 81 int singl, num; 82{ 83 int copy, i; 84 85 copy = mkstate (SYM_EPSILON); 86 87 for (i = 1; i <= num; ++i) 88 copy = link_machines (copy, dupmachine (singl)); 89 90 return copy; 91} 92 93 94/* dumpnfa - debugging routine to write out an nfa */ 95 96void dumpnfa (state1) 97 int state1; 98 99{ 100 int sym, tsp1, tsp2, anum, ns; 101 102 fprintf (stderr, 103 _ 104 ("\n\n********** beginning dump of nfa with start state %d\n"), 105 state1); 106 107 /* We probably should loop starting at firstst[state1] and going to 108 * lastst[state1], but they're not maintained properly when we "or" 109 * all of the rules together. So we use our knowledge that the machine 110 * starts at state 1 and ends at lastnfa. 111 */ 112 113 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 114 for (ns = 1; ns <= lastnfa; ++ns) { 115 fprintf (stderr, _("state # %4d\t"), ns); 116 117 sym = transchar[ns]; 118 tsp1 = trans1[ns]; 119 tsp2 = trans2[ns]; 120 anum = accptnum[ns]; 121 122 fprintf (stderr, "%3d: %4d, %4d", sym, tsp1, tsp2); 123 124 if (anum != NIL) 125 fprintf (stderr, " [%d]", anum); 126 127 fprintf (stderr, "\n"); 128 } 129 130 fprintf (stderr, _("********** end of dump\n")); 131} 132 133 134/* dupmachine - make a duplicate of a given machine 135 * 136 * synopsis 137 * 138 * copy = dupmachine( mach ); 139 * 140 * copy - holds duplicate of mach 141 * mach - machine to be duplicated 142 * 143 * note that the copy of mach is NOT an exact duplicate; rather, all the 144 * transition states values are adjusted so that the copy is self-contained, 145 * as the original should have been. 146 * 147 * also note that the original MUST be contiguous, with its low and high 148 * states accessible by the arrays firstst and lastst 149 */ 150 151int dupmachine (mach) 152 int mach; 153{ 154 int i, init, state_offset; 155 int state = 0; 156 int last = lastst[mach]; 157 158 for (i = firstst[mach]; i <= last; ++i) { 159 state = mkstate (transchar[i]); 160 161 if (trans1[i] != NO_TRANSITION) { 162 mkxtion (finalst[state], trans1[i] + state - i); 163 164 if (transchar[i] == SYM_EPSILON && 165 trans2[i] != NO_TRANSITION) 166 mkxtion (finalst[state], 167 trans2[i] + state - i); 168 } 169 170 accptnum[state] = accptnum[i]; 171 } 172 173 if (state == 0) 174 flexfatal (_("empty machine in dupmachine()")); 175 176 state_offset = state - i + 1; 177 178 init = mach + state_offset; 179 firstst[init] = firstst[mach] + state_offset; 180 finalst[init] = finalst[mach] + state_offset; 181 lastst[init] = lastst[mach] + state_offset; 182 183 return init; 184} 185 186 187/* finish_rule - finish up the processing for a rule 188 * 189 * An accepting number is added to the given machine. If variable_trail_rule 190 * is true then the rule has trailing context and both the head and trail 191 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 192 * the machine recognizes a pattern with trailing context and headcnt is 193 * the number of characters in the matched part of the pattern, or zero 194 * if the matched part has variable length. trailcnt is the number of 195 * trailing context characters in the pattern, or zero if the trailing 196 * context has variable length. 197 */ 198 199void finish_rule (mach, variable_trail_rule, headcnt, trailcnt, 200 pcont_act) 201 int mach, variable_trail_rule, headcnt, trailcnt, pcont_act; 202{ 203 char action_text[MAXLINE]; 204 205 add_accept (mach, num_rules); 206 207 /* We did this in new_rule(), but it often gets the wrong 208 * number because we do it before we start parsing the current rule. 209 */ 210 rule_linenum[num_rules] = linenum; 211 212 /* If this is a continued action, then the line-number has already 213 * been updated, giving us the wrong number. 214 */ 215 if (continued_action) 216 --rule_linenum[num_rules]; 217 218 219 /* If the previous rule was continued action, then we inherit the 220 * previous newline flag, possibly overriding the current one. 221 */ 222 if (pcont_act && rule_has_nl[num_rules - 1]) 223 rule_has_nl[num_rules] = true; 224 225 snprintf (action_text, sizeof(action_text), "case %d:\n", num_rules); 226 add_action (action_text); 227 if (rule_has_nl[num_rules]) { 228 snprintf (action_text, sizeof(action_text), "/* rule %d can match eol */\n", 229 num_rules); 230 add_action (action_text); 231 } 232 233 234 if (variable_trail_rule) { 235 rule_type[num_rules] = RULE_VARIABLE; 236 237 if (performance_report > 0) 238 fprintf (stderr, 239 _ 240 ("Variable trailing context rule at line %d\n"), 241 rule_linenum[num_rules]); 242 243 variable_trailing_context_rules = true; 244 } 245 246 else { 247 rule_type[num_rules] = RULE_NORMAL; 248 249 if (headcnt > 0 || trailcnt > 0) { 250 /* Do trailing context magic to not match the trailing 251 * characters. 252 */ 253 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp"; 254 char *scanner_bp = "yy_bp"; 255 256 add_action 257 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n"); 258 259 if (headcnt > 0) { 260 snprintf (action_text, sizeof(action_text), "%s = %s + %d;\n", 261 scanner_cp, scanner_bp, headcnt); 262 add_action (action_text); 263 } 264 265 else { 266 snprintf (action_text, sizeof(action_text), "%s -= %d;\n", 267 scanner_cp, trailcnt); 268 add_action (action_text); 269 } 270 271 add_action 272 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n"); 273 } 274 } 275 276 /* Okay, in the action code at this point yytext and yyleng have 277 * their proper final values for this rule, so here's the point 278 * to do any user action. But don't do it for continued actions, 279 * as that'll result in multiple YY_RULE_SETUP's. 280 */ 281 if (!continued_action) 282 add_action ("YY_RULE_SETUP\n"); 283 284 line_directive_out ((FILE *) 0, 1); 285} 286 287 288/* link_machines - connect two machines together 289 * 290 * synopsis 291 * 292 * new = link_machines( first, last ); 293 * 294 * new - a machine constructed by connecting first to last 295 * first - the machine whose successor is to be last 296 * last - the machine whose predecessor is to be first 297 * 298 * note: this routine concatenates the machine first with the machine 299 * last to produce a machine new which will pattern-match first first 300 * and then last, and will fail if either of the sub-patterns fails. 301 * FIRST is set to new by the operation. last is unmolested. 302 */ 303 304int link_machines (first, last) 305 int first, last; 306{ 307 if (first == NIL) 308 return last; 309 310 else if (last == NIL) 311 return first; 312 313 else { 314 mkxtion (finalst[first], last); 315 finalst[first] = finalst[last]; 316 lastst[first] = MAX (lastst[first], lastst[last]); 317 firstst[first] = MIN (firstst[first], firstst[last]); 318 319 return first; 320 } 321} 322 323 324/* mark_beginning_as_normal - mark each "beginning" state in a machine 325 * as being a "normal" (i.e., not trailing context- 326 * associated) states 327 * 328 * The "beginning" states are the epsilon closure of the first state 329 */ 330 331void mark_beginning_as_normal (mach) 332 int mach; 333{ 334 switch (state_type[mach]) { 335 case STATE_NORMAL: 336 /* Oh, we've already visited here. */ 337 return; 338 339 case STATE_TRAILING_CONTEXT: 340 state_type[mach] = STATE_NORMAL; 341 342 if (transchar[mach] == SYM_EPSILON) { 343 if (trans1[mach] != NO_TRANSITION) 344 mark_beginning_as_normal (trans1[mach]); 345 346 if (trans2[mach] != NO_TRANSITION) 347 mark_beginning_as_normal (trans2[mach]); 348 } 349 break; 350 351 default: 352 flexerror (_ 353 ("bad state type in mark_beginning_as_normal()")); 354 break; 355 } 356} 357 358 359/* mkbranch - make a machine that branches to two machines 360 * 361 * synopsis 362 * 363 * branch = mkbranch( first, second ); 364 * 365 * branch - a machine which matches either first's pattern or second's 366 * first, second - machines whose patterns are to be or'ed (the | operator) 367 * 368 * Note that first and second are NEITHER destroyed by the operation. Also, 369 * the resulting machine CANNOT be used with any other "mk" operation except 370 * more mkbranch's. Compare with mkor() 371 */ 372 373int mkbranch (first, second) 374 int first, second; 375{ 376 int eps; 377 378 if (first == NO_TRANSITION) 379 return second; 380 381 else if (second == NO_TRANSITION) 382 return first; 383 384 eps = mkstate (SYM_EPSILON); 385 386 mkxtion (eps, first); 387 mkxtion (eps, second); 388 389 return eps; 390} 391 392 393/* mkclos - convert a machine into a closure 394 * 395 * synopsis 396 * new = mkclos( state ); 397 * 398 * new - a new state which matches the closure of "state" 399 */ 400 401int mkclos (state) 402 int state; 403{ 404 return mkopt (mkposcl (state)); 405} 406 407 408/* mkopt - make a machine optional 409 * 410 * synopsis 411 * 412 * new = mkopt( mach ); 413 * 414 * new - a machine which optionally matches whatever mach matched 415 * mach - the machine to make optional 416 * 417 * notes: 418 * 1. mach must be the last machine created 419 * 2. mach is destroyed by the call 420 */ 421 422int mkopt (mach) 423 int mach; 424{ 425 int eps; 426 427 if (!SUPER_FREE_EPSILON (finalst[mach])) { 428 eps = mkstate (SYM_EPSILON); 429 mach = link_machines (mach, eps); 430 } 431 432 /* Can't skimp on the following if FREE_EPSILON(mach) is true because 433 * some state interior to "mach" might point back to the beginning 434 * for a closure. 435 */ 436 eps = mkstate (SYM_EPSILON); 437 mach = link_machines (eps, mach); 438 439 mkxtion (mach, finalst[mach]); 440 441 return mach; 442} 443 444 445/* mkor - make a machine that matches either one of two machines 446 * 447 * synopsis 448 * 449 * new = mkor( first, second ); 450 * 451 * new - a machine which matches either first's pattern or second's 452 * first, second - machines whose patterns are to be or'ed (the | operator) 453 * 454 * note that first and second are both destroyed by the operation 455 * the code is rather convoluted because an attempt is made to minimize 456 * the number of epsilon states needed 457 */ 458 459int mkor (first, second) 460 int first, second; 461{ 462 int eps, orend; 463 464 if (first == NIL) 465 return second; 466 467 else if (second == NIL) 468 return first; 469 470 else { 471 /* See comment in mkopt() about why we can't use the first 472 * state of "first" or "second" if they satisfy "FREE_EPSILON". 473 */ 474 eps = mkstate (SYM_EPSILON); 475 476 first = link_machines (eps, first); 477 478 mkxtion (first, second); 479 480 if (SUPER_FREE_EPSILON (finalst[first]) && 481 accptnum[finalst[first]] == NIL) { 482 orend = finalst[first]; 483 mkxtion (finalst[second], orend); 484 } 485 486 else if (SUPER_FREE_EPSILON (finalst[second]) && 487 accptnum[finalst[second]] == NIL) { 488 orend = finalst[second]; 489 mkxtion (finalst[first], orend); 490 } 491 492 else { 493 eps = mkstate (SYM_EPSILON); 494 495 first = link_machines (first, eps); 496 orend = finalst[first]; 497 498 mkxtion (finalst[second], orend); 499 } 500 } 501 502 finalst[first] = orend; 503 return first; 504} 505 506 507/* mkposcl - convert a machine into a positive closure 508 * 509 * synopsis 510 * new = mkposcl( state ); 511 * 512 * new - a machine matching the positive closure of "state" 513 */ 514 515int mkposcl (state) 516 int state; 517{ 518 int eps; 519 520 if (SUPER_FREE_EPSILON (finalst[state])) { 521 mkxtion (finalst[state], state); 522 return state; 523 } 524 525 else { 526 eps = mkstate (SYM_EPSILON); 527 mkxtion (eps, state); 528 return link_machines (state, eps); 529 } 530} 531 532 533/* mkrep - make a replicated machine 534 * 535 * synopsis 536 * new = mkrep( mach, lb, ub ); 537 * 538 * new - a machine that matches whatever "mach" matched from "lb" 539 * number of times to "ub" number of times 540 * 541 * note 542 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach" 543 */ 544 545int mkrep (mach, lb, ub) 546 int mach, lb, ub; 547{ 548 int base_mach, tail, copy, i; 549 550 base_mach = copysingl (mach, lb - 1); 551 552 if (ub == INFINITE_REPEAT) { 553 copy = dupmachine (mach); 554 mach = link_machines (mach, 555 link_machines (base_mach, 556 mkclos (copy))); 557 } 558 559 else { 560 tail = mkstate (SYM_EPSILON); 561 562 for (i = lb; i < ub; ++i) { 563 copy = dupmachine (mach); 564 tail = mkopt (link_machines (copy, tail)); 565 } 566 567 mach = 568 link_machines (mach, 569 link_machines (base_mach, tail)); 570 } 571 572 return mach; 573} 574 575 576/* mkstate - create a state with a transition on a given symbol 577 * 578 * synopsis 579 * 580 * state = mkstate( sym ); 581 * 582 * state - a new state matching sym 583 * sym - the symbol the new state is to have an out-transition on 584 * 585 * note that this routine makes new states in ascending order through the 586 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 587 * relies on machines being made in ascending order and that they are 588 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 589 * that it admittedly is) 590 */ 591 592int mkstate (sym) 593 int sym; 594{ 595 if (++lastnfa >= current_mns) { 596 if ((current_mns += MNS_INCREMENT) >= maximum_mns) 597 lerrif (_ 598 ("input rules are too complicated (>= %d NFA states)"), 599current_mns); 600 601 ++num_reallocs; 602 603 firstst = reallocate_integer_array (firstst, current_mns); 604 lastst = reallocate_integer_array (lastst, current_mns); 605 finalst = reallocate_integer_array (finalst, current_mns); 606 transchar = 607 reallocate_integer_array (transchar, current_mns); 608 trans1 = reallocate_integer_array (trans1, current_mns); 609 trans2 = reallocate_integer_array (trans2, current_mns); 610 accptnum = 611 reallocate_integer_array (accptnum, current_mns); 612 assoc_rule = 613 reallocate_integer_array (assoc_rule, current_mns); 614 state_type = 615 reallocate_integer_array (state_type, current_mns); 616 } 617 618 firstst[lastnfa] = lastnfa; 619 finalst[lastnfa] = lastnfa; 620 lastst[lastnfa] = lastnfa; 621 transchar[lastnfa] = sym; 622 trans1[lastnfa] = NO_TRANSITION; 623 trans2[lastnfa] = NO_TRANSITION; 624 accptnum[lastnfa] = NIL; 625 assoc_rule[lastnfa] = num_rules; 626 state_type[lastnfa] = current_state_type; 627 628 /* Fix up equivalence classes base on this transition. Note that any 629 * character which has its own transition gets its own equivalence 630 * class. Thus only characters which are only in character classes 631 * have a chance at being in the same equivalence class. E.g. "a|b" 632 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 633 * puts them in the same equivalence class (barring other differences 634 * elsewhere in the input). 635 */ 636 637 if (sym < 0) { 638 /* We don't have to update the equivalence classes since 639 * that was already done when the ccl was created for the 640 * first time. 641 */ 642 } 643 644 else if (sym == SYM_EPSILON) 645 ++numeps; 646 647 else { 648 check_char (sym); 649 650 if (useecs) 651 /* Map NUL's to csize. */ 652 mkechar (sym ? sym : csize, nextecm, ecgroup); 653 } 654 655 return lastnfa; 656} 657 658 659/* mkxtion - make a transition from one state to another 660 * 661 * synopsis 662 * 663 * mkxtion( statefrom, stateto ); 664 * 665 * statefrom - the state from which the transition is to be made 666 * stateto - the state to which the transition is to be made 667 */ 668 669void mkxtion (statefrom, stateto) 670 int statefrom, stateto; 671{ 672 if (trans1[statefrom] == NO_TRANSITION) 673 trans1[statefrom] = stateto; 674 675 else if ((transchar[statefrom] != SYM_EPSILON) || 676 (trans2[statefrom] != NO_TRANSITION)) 677 flexfatal (_("found too many transitions in mkxtion()")); 678 679 else { /* second out-transition for an epsilon state */ 680 ++eps2; 681 trans2[statefrom] = stateto; 682 } 683} 684 685/* new_rule - initialize for a new rule */ 686 687void new_rule () 688{ 689 if (++num_rules >= current_max_rules) { 690 ++num_reallocs; 691 current_max_rules += MAX_RULES_INCREMENT; 692 rule_type = reallocate_integer_array (rule_type, 693 current_max_rules); 694 rule_linenum = reallocate_integer_array (rule_linenum, 695 current_max_rules); 696 rule_useful = reallocate_integer_array (rule_useful, 697 current_max_rules); 698 rule_has_nl = reallocate_bool_array (rule_has_nl, 699 current_max_rules); 700 } 701 702 if (num_rules > MAX_RULE) 703 lerrif (_("too many rules (> %d)!"), MAX_RULE); 704 705 rule_linenum[num_rules] = linenum; 706 rule_useful[num_rules] = false; 707 rule_has_nl[num_rules] = false; 708} 709