1/* $NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $ */ 2 3/* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32/* 33 * Copyright (c) 1976 Board of Trustees of the University of Illinois. 34 * Copyright (c) 1985 Sun Microsystems, Inc. 35 * All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. All advertising materials mentioning features or use of this software 46 * must display the following acknowledgement: 47 * This product includes software developed by the University of 48 * California, Berkeley and its contributors. 49 * 4. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 */ 65 66#include <sys/cdefs.h> 67#ifndef lint 68#if 0 69static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 70#else 71__RCSID("$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $"); 72#endif 73#endif /* not lint */ 74 75/* 76 * Here we have the token scanner for indent. It scans off one token and puts 77 * it in the global variable "token". It returns a code, indicating the type 78 * of token scanned. 79 */ 80 81#include <stdio.h> 82#include <ctype.h> 83#include <stdlib.h> 84#include <string.h> 85#include "indent_globs.h" 86#include "indent_codes.h" 87 88#define alphanum 1 89#define opchar 3 90 91struct templ { 92 const char *rwd; 93 int rwcode; 94}; 95 96struct templ specials[1000] = 97{ 98 {"switch", 1}, 99 {"case", 2}, 100 {"break", 0}, 101 {"struct", 3}, 102 {"union", 3}, 103 {"enum", 3}, 104 {"default", 2}, 105 {"int", 4}, 106 {"char", 4}, 107 {"float", 4}, 108 {"double", 4}, 109 {"long", 4}, 110 {"short", 4}, 111 {"typdef", 4}, 112 {"unsigned", 4}, 113 {"register", 4}, 114 {"static", 4}, 115 {"global", 4}, 116 {"extern", 4}, 117 {"void", 4}, 118 {"goto", 0}, 119 {"return", 0}, 120 {"if", 5}, 121 {"while", 5}, 122 {"for", 5}, 123 {"else", 6}, 124 {"do", 6}, 125 {"sizeof", 7}, 126 {0, 0} 127}; 128 129char chartype[128] = 130{ /* this is used to facilitate the decision of 131 * what type (alphanumeric, operator) each 132 * character is */ 133 0, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 3, 0, 0, 1, 3, 3, 0, 138 0, 0, 3, 3, 0, 3, 0, 3, 139 1, 1, 1, 1, 1, 1, 1, 1, 140 1, 1, 0, 0, 3, 3, 3, 3, 141 0, 1, 1, 1, 1, 1, 1, 1, 142 1, 1, 1, 1, 1, 1, 1, 1, 143 1, 1, 1, 1, 1, 1, 1, 1, 144 1, 1, 1, 0, 0, 0, 3, 1, 145 0, 1, 1, 1, 1, 1, 1, 1, 146 1, 1, 1, 1, 1, 1, 1, 1, 147 1, 1, 1, 1, 1, 1, 1, 1, 148 1, 1, 1, 0, 3, 0, 3, 0 149}; 150 151 152 153 154int 155lexi(void) 156{ 157 int unary_delim; /* this is set to 1 if the current token 158 * 159 * forces a following operator to be unary */ 160 static int last_code; /* the last token type returned */ 161 static int l_struct; /* set to 1 if the last token was 'struct' */ 162 int code; /* internal code to be returned */ 163 char qchar; /* the delimiter character for a string */ 164 165 e_token = s_token; /* point to start of place to save token */ 166 unary_delim = false; 167 ps.col_1 = ps.last_nl; /* tell world that this token started in 168 * column 1 iff the last thing scanned was nl */ 169 ps.last_nl = false; 170 171 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 172 ps.col_1 = false; /* leading blanks imply token is not 173 * in column 1 */ 174 if (++buf_ptr >= buf_end) 175 fill_buffer(); 176 } 177 178 /* Scan an alphanumeric token */ 179 if (chartype[(int) *buf_ptr] == alphanum || 180 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 181 /* 182 * we have a character or number 183 */ 184 const char *j; /* used for searching thru list of 185 * reserved words */ 186 struct templ *p; 187 188 if (isdigit((unsigned char)*buf_ptr) || 189 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 190 int seendot = 0, seenexp = 0, seensfx = 0; 191 if (*buf_ptr == '0' && 192 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 193 *e_token++ = *buf_ptr++; 194 *e_token++ = *buf_ptr++; 195 while (isxdigit((unsigned char)*buf_ptr)) { 196 CHECK_SIZE_TOKEN; 197 *e_token++ = *buf_ptr++; 198 } 199 } else { 200 while (1) { 201 if (*buf_ptr == '.') { 202 if (seendot) 203 break; 204 else 205 seendot++; 206 } 207 CHECK_SIZE_TOKEN; 208 *e_token++ = *buf_ptr++; 209 if (!isdigit((unsigned char)*buf_ptr) 210 && *buf_ptr != '.') { 211 if ((*buf_ptr != 'E' 212 && *buf_ptr != 'e') || seenexp) 213 break; 214 else { 215 seenexp++; 216 seendot++; 217 CHECK_SIZE_TOKEN; 218 *e_token++ = *buf_ptr++; 219 if (*buf_ptr == '+' || *buf_ptr == '-') 220 *e_token++ = *buf_ptr++; 221 } 222 } 223 } 224 } 225 if (*buf_ptr == 'F' || *buf_ptr == 'f') { 226 /* float constant */ 227 *e_token++ = *buf_ptr++; 228 } else { 229 /* integer constant */ 230 while (1) { 231 if (!(seensfx & 1) && 232 (*buf_ptr == 'U' || 233 *buf_ptr == 'u')) { 234 CHECK_SIZE_TOKEN; 235 *e_token++ = *buf_ptr++; 236 seensfx |= 1; 237 continue; 238 } 239 if (!(seensfx & 2) && 240 (*buf_ptr == 'L' || 241 *buf_ptr == 'l')) { 242 CHECK_SIZE_TOKEN; 243 if (buf_ptr[1] == buf_ptr[0]) 244 *e_token++ = *buf_ptr++; 245 *e_token++ = *buf_ptr++; 246 seensfx |= 2; 247 continue; 248 } 249 break; 250 } 251 } 252 } else 253 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */ 254 CHECK_SIZE_TOKEN; 255 *e_token++ = *buf_ptr++; 256 if (buf_ptr >= buf_end) 257 fill_buffer(); 258 } 259 *e_token++ = '\0'; 260 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 261 if (++buf_ptr >= buf_end) 262 fill_buffer(); 263 } 264 ps.its_a_keyword = false; 265 ps.sizeof_keyword = false; 266 if (l_struct) { /* if last token was 'struct', then this token 267 * should be treated as a declaration */ 268 l_struct = false; 269 last_code = ident; 270 ps.last_u_d = true; 271 return (decl); 272 } 273 ps.last_u_d = false; /* Operator after indentifier is 274 * binary */ 275 last_code = ident; /* Remember that this is the code we 276 * will return */ 277 278 /* 279 * This loop will check if the token is a keyword. 280 */ 281 for (p = specials; (j = p->rwd) != 0; p++) { 282 char *pt = s_token; /* point at scanned token */ 283 if (*j++ != *pt++ || *j++ != *pt++) 284 continue; /* This test depends on the 285 * fact that identifiers are 286 * always at least 1 character 287 * long (ie. the first two 288 * bytes of the identifier are 289 * always meaningful) */ 290 if (pt[-1] == 0) 291 break; /* If its a one-character identifier */ 292 while (*pt++ == *j) 293 if (*j++ == 0) 294 goto found_keyword; /* I wish that C had a 295 * multi-level break... */ 296 } 297 if (p->rwd) { /* we have a keyword */ 298 found_keyword: 299 ps.its_a_keyword = true; 300 ps.last_u_d = true; 301 switch (p->rwcode) { 302 case 1:/* it is a switch */ 303 return (swstmt); 304 case 2:/* a case or default */ 305 return (casestmt); 306 307 case 3:/* a "struct" */ 308 if (ps.p_l_follow) 309 break; /* inside parens: cast */ 310 l_struct = true; 311 312 /* 313 * Next time around, we will want to know that we have had a 314 * 'struct' 315 */ 316 case 4:/* one of the declaration keywords */ 317 if (ps.p_l_follow) { 318 ps.cast_mask |= 1 << ps.p_l_follow; 319 break; /* inside parens: cast */ 320 } 321 last_code = decl; 322 return (decl); 323 324 case 5:/* if, while, for */ 325 return (sp_paren); 326 327 case 6:/* do, else */ 328 return (sp_nparen); 329 330 case 7: 331 ps.sizeof_keyword = true; 332 default: /* all others are treated like any 333 * other identifier */ 334 return (ident); 335 } /* end of switch */ 336 } /* end of if (found_it) */ 337 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 338 char *tp = buf_ptr; 339 while (tp < buf_end) 340 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 341 goto not_proc; 342 strncpy(ps.procname, token, sizeof ps.procname - 1); 343 ps.in_parameter_declaration = 1; 344 rparen_count = 1; 345 not_proc: ; 346 } 347 /* 348 * The following hack attempts to guess whether or not the current 349 * token is in fact a declaration keyword -- one that has been 350 * typedefd 351 */ 352 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || 353 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_') 354 && !ps.p_l_follow 355 && !ps.block_init 356 && (ps.last_token == rparen || ps.last_token == semicolon || 357 ps.last_token == decl || 358 ps.last_token == lbrace || ps.last_token == rbrace)) { 359 ps.its_a_keyword = true; 360 ps.last_u_d = true; 361 last_code = decl; 362 return decl; 363 } 364 if (last_code == decl) /* if this is a declared variable, 365 * then following sign is unary */ 366 ps.last_u_d = true; /* will make "int a -1" work */ 367 last_code = ident; 368 return (ident); /* the ident is not in the list */ 369 } /* end of procesing for alpanum character */ 370 /* Scan a non-alphanumeric token */ 371 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 372 * moved here */ 373 *e_token = '\0'; 374 if (++buf_ptr >= buf_end) 375 fill_buffer(); 376 377 switch (*token) { 378 case '\n': 379 unary_delim = ps.last_u_d; 380 ps.last_nl = true; /* remember that we just had a newline */ 381 code = (had_eof ? 0 : newline); 382 383 /* 384 * if data has been exausted, the newline is a dummy, and we should 385 * return code to stop 386 */ 387 break; 388 389 case '\'': /* start of quoted character */ 390 case '"': /* start of string */ 391 qchar = *token; 392 if (troff) { 393 e_token[-1] = '`'; 394 if (qchar == '"') 395 *e_token++ = '`'; 396 e_token = chfont(&bodyf, &stringf, e_token); 397 } 398 do { /* copy the string */ 399 while (1) { /* move one character or 400 * [/<char>]<char> */ 401 if (*buf_ptr == '\n') { 402 printf("%d: Unterminated literal\n", line_no); 403 goto stop_lit; 404 } 405 CHECK_SIZE_TOKEN; /* Only have to do this 406 * once in this loop, 407 * since CHECK_SIZE 408 * guarantees that there 409 * are at least 5 410 * entries left */ 411 *e_token = *buf_ptr++; 412 if (buf_ptr >= buf_end) 413 fill_buffer(); 414 if (*e_token == BACKSLASH) { /* if escape, copy extra 415 * char */ 416 if (*buf_ptr == '\n') /* check for escaped 417 * newline */ 418 ++line_no; 419 if (troff) { 420 *++e_token = BACKSLASH; 421 if (*buf_ptr == BACKSLASH) 422 *++e_token = BACKSLASH; 423 } 424 *++e_token = *buf_ptr++; 425 ++e_token; /* we must increment 426 * this again because we 427 * copied two chars */ 428 if (buf_ptr >= buf_end) 429 fill_buffer(); 430 } else 431 break; /* we copied one character */ 432 } /* end of while (1) */ 433 } while (*e_token++ != qchar); 434 if (troff) { 435 e_token = chfont(&stringf, &bodyf, e_token - 1); 436 if (qchar == '"') 437 *e_token++ = '\''; 438 } 439stop_lit: 440 code = ident; 441 break; 442 443 case ('('): 444 case ('['): 445 unary_delim = true; 446 code = lparen; 447 break; 448 449 case (')'): 450 case (']'): 451 code = rparen; 452 break; 453 454 case '#': 455 unary_delim = ps.last_u_d; 456 code = preesc; 457 break; 458 459 case '?': 460 unary_delim = true; 461 code = question; 462 break; 463 464 case (':'): 465 code = colon; 466 unary_delim = true; 467 break; 468 469 case (';'): 470 unary_delim = true; 471 code = semicolon; 472 break; 473 474 case ('{'): 475 unary_delim = true; 476 477 /* 478 * if (ps.in_or_st) ps.block_init = 1; 479 */ 480 /* ? code = ps.block_init ? lparen : lbrace; */ 481 code = lbrace; 482 break; 483 484 case ('}'): 485 unary_delim = true; 486 /* ? code = ps.block_init ? rparen : rbrace; */ 487 code = rbrace; 488 break; 489 490 case 014: /* a form feed */ 491 unary_delim = ps.last_u_d; 492 ps.last_nl = true; /* remember this so we can set 493 * 'ps.col_1' right */ 494 code = form_feed; 495 break; 496 497 case (','): 498 unary_delim = true; 499 code = comma; 500 break; 501 502 case '.': 503 unary_delim = false; 504 code = period; 505 break; 506 507 case '-': 508 case '+': /* check for -, +, --, ++ */ 509 code = (ps.last_u_d ? unary_op : binary_op); 510 unary_delim = true; 511 512 if (*buf_ptr == token[0]) { 513 /* check for doubled character */ 514 *e_token++ = *buf_ptr++; 515 /* buffer overflow will be checked at end of loop */ 516 if (last_code == ident || last_code == rparen) { 517 code = (ps.last_u_d ? unary_op : postop); 518 /* check for following ++ or -- */ 519 unary_delim = false; 520 } 521 } else 522 if (*buf_ptr == '=') 523 /* check for operator += */ 524 *e_token++ = *buf_ptr++; 525 else 526 if (*buf_ptr == '>') { 527 /* check for operator -> */ 528 *e_token++ = *buf_ptr++; 529 if (!pointer_as_binop) { 530 unary_delim = false; 531 code = unary_op; 532 ps.want_blank = false; 533 } 534 } 535 break; /* buffer overflow will be checked at end of 536 * switch */ 537 538 case '=': 539 if (ps.in_or_st) 540 ps.block_init = 1; 541#ifdef undef 542 if (chartype[*buf_ptr] == opchar) { /* we have two char 543 * assignment */ 544 e_token[-1] = *buf_ptr++; 545 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 546 *e_token++ = *buf_ptr++; 547 *e_token++ = '='; /* Flip =+ to += */ 548 *e_token = 0; 549 } 550#else 551 if (*buf_ptr == '=') { /* == */ 552 *e_token++ = '='; /* Flip =+ to += */ 553 buf_ptr++; 554 *e_token = 0; 555 } 556#endif 557 code = binary_op; 558 unary_delim = true; 559 break; 560 /* can drop thru!!! */ 561 562 case '>': 563 case '<': 564 case '!': /* ops like <, <<, <=, !=, etc */ 565 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 566 *e_token++ = *buf_ptr; 567 if (++buf_ptr >= buf_end) 568 fill_buffer(); 569 } 570 if (*buf_ptr == '=') 571 *e_token++ = *buf_ptr++; 572 code = (ps.last_u_d ? unary_op : binary_op); 573 unary_delim = true; 574 break; 575 576 default: 577 if (token[0] == '/' && *buf_ptr == '*') { 578 /* it is start of comment */ 579 *e_token++ = '*'; 580 581 if (++buf_ptr >= buf_end) 582 fill_buffer(); 583 584 code = comment; 585 unary_delim = ps.last_u_d; 586 break; 587 } 588 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 589 /* 590 * handle ||, &&, etc, and also things as in int *****i 591 */ 592 *e_token++ = *buf_ptr; 593 if (++buf_ptr >= buf_end) 594 fill_buffer(); 595 } 596 code = (ps.last_u_d ? unary_op : binary_op); 597 unary_delim = true; 598 599 600 } /* end of switch */ 601 if (code != newline) { 602 l_struct = false; 603 last_code = code; 604 } 605 if (buf_ptr >= buf_end) /* check for input buffer empty */ 606 fill_buffer(); 607 ps.last_u_d = unary_delim; 608 *e_token = '\0'; /* null terminate the token */ 609 return (code); 610} 611/* 612 * Add the given keyword to the keyword table, using val as the keyword type 613 */ 614void 615addkey(char *key, int val) 616{ 617 struct templ *p = specials; 618 while (p->rwd) 619 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 620 return; 621 else 622 p++; 623 if (p >= specials + sizeof specials / sizeof specials[0]) 624 return; /* For now, table overflows are silently 625 * ignored */ 626 p->rwd = key; 627 p->rwcode = val; 628 p[1].rwd = 0; 629 p[1].rwcode = 0; 630} 631