1/**************************************************************** 2Copyright (C) Lucent Technologies 1997 3All Rights Reserved 4 5Permission to use, copy, modify, and distribute this software and 6its documentation for any purpose and without fee is hereby 7granted, provided that the above copyright notice appear in all 8copies and that both that the copyright notice and this 9permission notice and warranty disclaimer appear in supporting 10documentation, and that the name Lucent Technologies or any of 11its entities not be used in advertising or publicity pertaining 12to distribution of the software without specific, written prior 13permission. 14 15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22THIS SOFTWARE. 23****************************************************************/ 24 25#include <stdio.h> 26#include <stdlib.h> 27#include <string.h> 28#include <ctype.h> 29#include "awk.h" 30#include "awkgram.tab.h" 31 32extern YYSTYPE yylval; 33extern bool infunc; 34 35int lineno = 1; 36int bracecnt = 0; 37int brackcnt = 0; 38int parencnt = 0; 39 40typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44} Keyword; 45 46const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "gensub", GENSUB, GENSUB }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "next", NEXT, NEXT }, 78 { "nextfile", NEXTFILE, NEXTFILE }, 79 { "or", FFOR, BLTIN }, 80 { "print", PRINT, PRINT }, 81 { "printf", PRINTF, PRINTF }, 82 { "rand", FRAND, BLTIN }, 83 { "return", RETURN, RETURN }, 84 { "rshift", FRSHIFT, BLTIN }, 85 { "sin", FSIN, BLTIN }, 86 { "split", SPLIT, SPLIT }, 87 { "sprintf", SPRINTF, SPRINTF }, 88 { "sqrt", FSQRT, BLTIN }, 89 { "srand", FSRAND, BLTIN }, 90 { "strftime", FSTRFTIME, BLTIN }, 91 { "sub", SUB, SUB }, 92 { "substr", SUBSTR, SUBSTR }, 93 { "system", FSYSTEM, BLTIN }, 94 { "systime", FSYSTIME, BLTIN }, 95 { "tolower", FTOLOWER, BLTIN }, 96 { "toupper", FTOUPPER, BLTIN }, 97 { "while", WHILE, WHILE }, 98 { "xor", FXOR, BLTIN }, 99}; 100 101#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 102 103static int peek(void) 104{ 105 int c = input(); 106 unput(c); 107 return c; 108} 109 110static int gettok(char **pbuf, int *psz) /* get next input token */ 111{ 112 int c, retc; 113 char *buf = *pbuf; 114 int sz = *psz; 115 char *bp = buf; 116 117 c = input(); 118 if (c == 0) 119 return 0; 120 buf[0] = c; 121 buf[1] = 0; 122 if (!isalnum(c) && c != '.' && c != '_') 123 return c; 124 125 *bp++ = c; 126 if (isalpha(c) || c == '_') { /* it's a varname */ 127 for ( ; (c = input()) != 0; ) { 128 if (bp-buf >= sz) 129 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 130 FATAL( "out of space for name %.10s...", buf ); 131 if (isalnum(c) || c == '_') 132 *bp++ = c; 133 else { 134 *bp = 0; 135 unput(c); 136 break; 137 } 138 } 139 *bp = 0; 140 retc = 'a'; /* alphanumeric */ 141 } else { /* maybe it's a number, but could be . */ 142 char *rem; 143 /* read input until can't be a number */ 144 for ( ; (c = input()) != 0; ) { 145 if (bp-buf >= sz) 146 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 147 FATAL( "out of space for number %.10s...", buf ); 148 if (isdigit(c) || c == 'e' || c == 'E' 149 || c == '.' || c == '+' || c == '-') 150 *bp++ = c; 151 else { 152 unput(c); 153 break; 154 } 155 } 156 *bp = 0; 157 strtod(buf, &rem); /* parse the number */ 158 if (rem == buf) { /* it wasn't a valid number at all */ 159 buf[1] = 0; /* return one character as token */ 160 retc = (uschar)buf[0]; /* character is its own type */ 161 unputstr(rem+1); /* put rest back for later */ 162 } else { /* some prefix was a number */ 163 unputstr(rem); /* put rest back for later */ 164 rem[0] = 0; /* truncate buf after number part */ 165 retc = '0'; /* type is number */ 166 } 167 } 168 *pbuf = buf; 169 *psz = sz; 170 return retc; 171} 172 173int word(char *); 174int string(void); 175int regexpr(void); 176bool sc = false; /* true => return a } right now */ 177bool reg = false; /* true => return a REGEXPR now */ 178 179int yylex(void) 180{ 181 int c; 182 static char *buf = NULL; 183 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 184 185 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 186 FATAL( "out of space in yylex" ); 187 if (sc) { 188 sc = false; 189 RET('}'); 190 } 191 if (reg) { 192 reg = false; 193 return regexpr(); 194 } 195 for (;;) { 196 c = gettok(&buf, &bufsize); 197 if (c == 0) 198 return 0; 199 if (isalpha(c) || c == '_') 200 return word(buf); 201 if (isdigit(c)) { 202 char *cp = tostring(buf); 203 double result; 204 205 if (is_number(cp, & result)) 206 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 207 else 208 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 209 free(cp); 210 /* should this also have STR set? */ 211 RET(NUMBER); 212 } 213 214 yylval.i = c; 215 switch (c) { 216 case '\n': /* {EOL} */ 217 lineno++; 218 RET(NL); 219 case '\r': /* assume \n is coming */ 220 case ' ': /* {WS}+ */ 221 case '\t': 222 break; 223 case '#': /* #.* strip comments */ 224 while ((c = input()) != '\n' && c != 0) 225 ; 226 unput(c); 227 /* 228 * Next line is a hack, itcompensates for 229 * unput's treatment of \n. 230 */ 231 lineno++; 232 break; 233 case ';': 234 RET(';'); 235 case '\\': 236 if (peek() == '\n') { 237 input(); 238 lineno++; 239 } else if (peek() == '\r') { 240 input(); input(); /* \n */ 241 lineno++; 242 } else { 243 RET(c); 244 } 245 break; 246 case '&': 247 if (peek() == '&') { 248 input(); RET(AND); 249 } else 250 RET('&'); 251 case '|': 252 if (peek() == '|') { 253 input(); RET(BOR); 254 } else 255 RET('|'); 256 case '!': 257 if (peek() == '=') { 258 input(); yylval.i = NE; RET(NE); 259 } else if (peek() == '~') { 260 input(); yylval.i = NOTMATCH; RET(MATCHOP); 261 } else 262 RET(NOT); 263 case '~': 264 yylval.i = MATCH; 265 RET(MATCHOP); 266 case '<': 267 if (peek() == '=') { 268 input(); yylval.i = LE; RET(LE); 269 } else { 270 yylval.i = LT; RET(LT); 271 } 272 case '=': 273 if (peek() == '=') { 274 input(); yylval.i = EQ; RET(EQ); 275 } else { 276 yylval.i = ASSIGN; RET(ASGNOP); 277 } 278 case '>': 279 if (peek() == '=') { 280 input(); yylval.i = GE; RET(GE); 281 } else if (peek() == '>') { 282 input(); yylval.i = APPEND; RET(APPEND); 283 } else { 284 yylval.i = GT; RET(GT); 285 } 286 case '+': 287 if (peek() == '+') { 288 input(); yylval.i = INCR; RET(INCR); 289 } else if (peek() == '=') { 290 input(); yylval.i = ADDEQ; RET(ASGNOP); 291 } else 292 RET('+'); 293 case '-': 294 if (peek() == '-') { 295 input(); yylval.i = DECR; RET(DECR); 296 } else if (peek() == '=') { 297 input(); yylval.i = SUBEQ; RET(ASGNOP); 298 } else 299 RET('-'); 300 case '*': 301 if (peek() == '=') { /* *= */ 302 input(); yylval.i = MULTEQ; RET(ASGNOP); 303 } else if (peek() == '*') { /* ** or **= */ 304 input(); /* eat 2nd * */ 305 if (peek() == '=') { 306 input(); yylval.i = POWEQ; RET(ASGNOP); 307 } else { 308 RET(POWER); 309 } 310 } else 311 RET('*'); 312 case '/': 313 RET('/'); 314 case '%': 315 if (peek() == '=') { 316 input(); yylval.i = MODEQ; RET(ASGNOP); 317 } else 318 RET('%'); 319 case '^': 320 if (peek() == '=') { 321 input(); yylval.i = POWEQ; RET(ASGNOP); 322 } else 323 RET(POWER); 324 325 case '$': 326 /* BUG: awkward, if not wrong */ 327 c = gettok(&buf, &bufsize); 328 if (isalpha(c)) { 329 if (strcmp(buf, "NF") == 0) { /* very special */ 330 unputstr("(NF)"); 331 RET(INDIRECT); 332 } 333 c = peek(); 334 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 335 unputstr(buf); 336 RET(INDIRECT); 337 } 338 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 339 RET(IVAR); 340 } else if (c == 0) { /* */ 341 SYNTAX( "unexpected end of input after $" ); 342 RET(';'); 343 } else { 344 unputstr(buf); 345 RET(INDIRECT); 346 } 347 348 case '}': 349 if (--bracecnt < 0) 350 SYNTAX( "extra }" ); 351 sc = true; 352 RET(';'); 353 case ']': 354 if (--brackcnt < 0) 355 SYNTAX( "extra ]" ); 356 RET(']'); 357 case ')': 358 if (--parencnt < 0) 359 SYNTAX( "extra )" ); 360 RET(')'); 361 case '{': 362 bracecnt++; 363 RET('{'); 364 case '[': 365 brackcnt++; 366 RET('['); 367 case '(': 368 parencnt++; 369 RET('('); 370 371 case '"': 372 return string(); /* BUG: should be like tran.c ? */ 373 374 default: 375 RET(c); 376 } 377 } 378} 379 380extern int runetochar(char *str, int c); 381 382int string(void) 383{ 384 int c, n; 385 char *s, *bp; 386 static char *buf = NULL; 387 static int bufsz = 500; 388 389 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 390 FATAL("out of space for strings"); 391 for (bp = buf; (c = input()) != '"'; ) { 392 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 393 FATAL("out of space for string %.10s...", buf); 394 switch (c) { 395 case '\n': 396 case '\r': 397 case 0: 398 *bp = '\0'; 399 SYNTAX( "non-terminated string %.10s...", buf ); 400 if (c == 0) /* hopeless */ 401 FATAL( "giving up" ); 402 lineno++; 403 break; 404 case '\\': 405 c = input(); 406 switch (c) { 407 case '\n': break; 408 case '"': *bp++ = '"'; break; 409 case 'n': *bp++ = '\n'; break; 410 case 't': *bp++ = '\t'; break; 411 case 'f': *bp++ = '\f'; break; 412 case 'r': *bp++ = '\r'; break; 413 case 'b': *bp++ = '\b'; break; 414 case 'v': *bp++ = '\v'; break; 415 case 'a': *bp++ = '\a'; break; 416 case '\\': *bp++ = '\\'; break; 417 418 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 419 case '3': case '4': case '5': case '6': case '7': 420 n = c - '0'; 421 if ((c = peek()) >= '0' && c < '8') { 422 n = 8 * n + input() - '0'; 423 if ((c = peek()) >= '0' && c < '8') 424 n = 8 * n + input() - '0'; 425 } 426 *bp++ = n; 427 break; 428 429 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 430 { 431 int i; 432 433 if (!isxdigit(peek())) { 434 unput(c); 435 break; 436 } 437 n = 0; 438 for (i = 0; i < 2; i++) { 439 c = input(); 440 if (c == 0) 441 break; 442 if (isxdigit(c)) { 443 c = tolower(c); 444 n *= 16; 445 if (isdigit(c)) 446 n += (c - '0'); 447 else 448 n += 10 + (c - 'a'); 449 } else { 450 unput(c); 451 break; 452 } 453 } 454 if (i) 455 *bp++ = n; 456 break; 457 } 458 459 case 'u': /* utf \u0-9a-fA-F (1..8) */ 460 { 461 int i; 462 463 n = 0; 464 for (i = 0; i < 8; i++) { 465 c = input(); 466 if (!isxdigit(c) || c == 0) 467 break; 468 c = tolower(c); 469 n *= 16; 470 if (isdigit(c)) 471 n += (c - '0'); 472 else 473 n += 10 + (c - 'a'); 474 } 475 unput(c); 476 bp += runetochar(bp, n); 477 break; 478 } 479 480 default: 481 *bp++ = c; 482 break; 483 } 484 break; 485 default: 486 *bp++ = c; 487 break; 488 } 489 } 490 *bp = 0; 491 s = tostring(buf); 492 *bp++ = ' '; *bp++ = '\0'; 493 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 494 free(s); 495 RET(STRING); 496} 497 498 499static int binsearch(char *w, const Keyword *kp, int n) 500{ 501 int cond, low, mid, high; 502 503 low = 0; 504 high = n - 1; 505 while (low <= high) { 506 mid = (low + high) / 2; 507 if ((cond = strcmp(w, kp[mid].word)) < 0) 508 high = mid - 1; 509 else if (cond > 0) 510 low = mid + 1; 511 else 512 return mid; 513 } 514 return -1; 515} 516 517int word(char *w) 518{ 519 const Keyword *kp; 520 int c, n; 521 522 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 523 if (n != -1) { /* found in table */ 524 kp = keywords + n; 525 yylval.i = kp->sub; 526 switch (kp->type) { /* special handling */ 527 case BLTIN: 528 if (kp->sub == FSYSTEM && safe) 529 SYNTAX( "system is unsafe" ); 530 RET(kp->type); 531 case FUNC: 532 if (infunc) 533 SYNTAX( "illegal nested function" ); 534 RET(kp->type); 535 case RETURN: 536 if (!infunc) 537 SYNTAX( "return not in function" ); 538 RET(kp->type); 539 case VARNF: 540 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 541 RET(VARNF); 542 default: 543 RET(kp->type); 544 } 545 } 546 c = peek(); /* look for '(' */ 547 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 548 yylval.i = n; 549 RET(ARG); 550 } else { 551 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 552 if (c == '(') { 553 RET(CALL); 554 } else { 555 RET(VAR); 556 } 557 } 558} 559 560void startreg(void) /* next call to yylex will return a regular expression */ 561{ 562 reg = true; 563} 564 565int regexpr(void) 566{ 567 int c; 568 static char *buf = NULL; 569 static int bufsz = 500; 570 char *bp; 571 572 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 573 FATAL("out of space for reg expr"); 574 bp = buf; 575 for ( ; (c = input()) != '/' && c != 0; ) { 576 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 577 FATAL("out of space for reg expr %.10s...", buf); 578 if (c == '\n') { 579 *bp = '\0'; 580 SYNTAX( "newline in regular expression %.10s...", buf ); 581 unput('\n'); 582 break; 583 } else if (c == '\\') { 584 *bp++ = '\\'; 585 *bp++ = input(); 586 } else { 587 *bp++ = c; 588 } 589 } 590 *bp = 0; 591 if (c == 0) 592 SYNTAX("non-terminated regular expression %.10s...", buf); 593 yylval.s = tostring(buf); 594 unput('/'); 595 RET(REGEXPR); 596} 597 598/* low-level lexical stuff, sort of inherited from lex */ 599 600char ebuf[300]; 601char *ep = ebuf; 602char yysbuf[100]; /* pushback buffer */ 603char *yysptr = yysbuf; 604FILE *yyin = NULL; 605 606int input(void) /* get next lexical input character */ 607{ 608 int c; 609 extern char *lexprog; 610 611 if (yysptr > yysbuf) 612 c = (uschar)*--yysptr; 613 else if (lexprog != NULL) { /* awk '...' */ 614 if ((c = (uschar)*lexprog) != 0) 615 lexprog++; 616 } else /* awk -f ... */ 617 c = pgetc(); 618 if (c == EOF) 619 c = 0; 620 if (ep >= ebuf + sizeof ebuf) 621 ep = ebuf; 622 *ep = c; 623 if (c != 0) { 624 ep++; 625 } 626 return (c); 627} 628 629void unput(int c) /* put lexical character back on input */ 630{ 631 if (c == '\n') 632 lineno--; 633 if (yysptr >= yysbuf + sizeof(yysbuf)) 634 FATAL("pushed back too much: %.20s...", yysbuf); 635 *yysptr++ = c; 636 if (--ep < ebuf) 637 ep = ebuf + sizeof(ebuf) - 1; 638} 639 640void unputstr(const char *s) /* put a string back on input */ 641{ 642 int i; 643 644 for (i = strlen(s)-1; i >= 0; i--) 645 unput(s[i]); 646} 647