1/*	$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $	*/
2
3/*
4 * Copyright (c) 1980, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
34 * Copyright (c) 1985 Sun Microsystems, Inc.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *	This product includes software developed by the University of
48 *	California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 */
65
66#include <sys/cdefs.h>
67#ifndef lint
68#if 0
69static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
70#else
71__RCSID("$NetBSD: lexi.c,v 1.12 2003/08/07 11:14:09 agc Exp $");
72#endif
73#endif				/* not lint */
74
75/*
76 * Here we have the token scanner for indent.  It scans off one token and puts
77 * it in the global variable "token".  It returns a code, indicating the type
78 * of token scanned.
79 */
80
81#include <stdio.h>
82#include <ctype.h>
83#include <stdlib.h>
84#include <string.h>
85#include "indent_globs.h"
86#include "indent_codes.h"
87
88#define alphanum 1
89#define opchar 3
90
/*
 * One entry of the reserved-word table: the spelling of a word together
 * with the class code that lexi() switches on once the word is recognised.
 */
struct templ {
	const char	*rwd;		/* keyword text; a null pointer marks
					 * the end of the table */
	int		rwcode;		/* keyword class, see below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an entry
 * whose rwd is null; the unused tail leaves room for addkey() to append
 * further words at run time.
 *
 * rwcode values, as interpreted by lexi():
 *	0	no special class; returned as a plain identifier
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum (a declaration keyword that also marks
 *		the next word as a tag)
 *	4	declaration keyword
 *	5	keyword followed by a parenthesised expression (if, while,
 *		for)
 *	6	keyword not followed by parentheses (else, do)
 *	7	sizeof
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword was never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
128
/*
 * Character class table, indexed by 7-bit ASCII code.  Each entry is
 *	1 (alphanum)	the character may be part of an identifier or number:
 *			letters, digits, '_' and '$'
 *	3 (opchar)	the character may be part of an operator
 *	0		anything else
 * Each row below covers eight consecutive codes; the trailing comment names
 * the first code of the row in octal and the characters it covers.
 */
char    chartype[128] =
{
	0, 0, 0, 0, 0, 0, 0, 0,		/* 000: control characters */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 010: control characters */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 020: control characters */
	0, 0, 0, 0, 0, 0, 0, 0,		/* 030: control characters */
	0, 3, 0, 0, 1, 3, 3, 0,		/* 040: space ! " # $ % & ' */
	0, 0, 3, 3, 0, 3, 0, 3,		/* 050: ( ) * + , - . / */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 060: 0 1 2 3 4 5 6 7 */
	1, 1, 0, 0, 3, 3, 3, 3,		/* 070: 8 9 : ; < = > ? */
	0, 1, 1, 1, 1, 1, 1, 1,		/* 100: @ A B C D E F G */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 110: H I J K L M N O */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 120: P Q R S T U V W */
	1, 1, 1, 0, 0, 0, 3, 1,		/* 130: X Y Z [ backslash ] ^ _ */
	0, 1, 1, 1, 1, 1, 1, 1,		/* 140: ` a b c d e f g */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 150: h i j k l m n o */
	1, 1, 1, 1, 1, 1, 1, 1,		/* 160: p q r s t u v w */
	1, 1, 1, 0, 3, 0, 3, 0		/* 170: x y z { | } ~ DEL */
};
150
151
152
153
154int
155lexi(void)
156{
157	int     unary_delim;	/* this is set to 1 if the current token
158				 *
159				 * forces a following operator to be unary */
160	static int last_code;	/* the last token type returned */
161	static int l_struct;	/* set to 1 if the last token was 'struct' */
162	int     code;		/* internal code to be returned */
163	char    qchar;		/* the delimiter character for a string */
164
165	e_token = s_token;	/* point to start of place to save token */
166	unary_delim = false;
167	ps.col_1 = ps.last_nl;	/* tell world that this token started in
168				 * column 1 iff the last thing scanned was nl */
169	ps.last_nl = false;
170
171	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
172		ps.col_1 = false;	/* leading blanks imply token is not
173					 * in column 1 */
174		if (++buf_ptr >= buf_end)
175			fill_buffer();
176	}
177
178	/* Scan an alphanumeric token */
179	if (chartype[(int) *buf_ptr] == alphanum ||
180	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
181		/*
182		 * we have a character or number
183		 */
184		const char *j;	/* used for searching thru list of
185				 * reserved words */
186		struct templ *p;
187
188		if (isdigit((unsigned char)*buf_ptr) ||
189		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
190			int     seendot = 0, seenexp = 0, seensfx = 0;
191			if (*buf_ptr == '0' &&
192			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
193				*e_token++ = *buf_ptr++;
194				*e_token++ = *buf_ptr++;
195				while (isxdigit((unsigned char)*buf_ptr)) {
196					CHECK_SIZE_TOKEN;
197					*e_token++ = *buf_ptr++;
198				}
199			} else {
200				while (1) {
201					if (*buf_ptr == '.') {
202						if (seendot)
203							break;
204						else
205							seendot++;
206					}
207					CHECK_SIZE_TOKEN;
208					*e_token++ = *buf_ptr++;
209					if (!isdigit((unsigned char)*buf_ptr)
210					&& *buf_ptr != '.') {
211						if ((*buf_ptr != 'E'
212						&& *buf_ptr != 'e') || seenexp)
213							break;
214						else {
215							seenexp++;
216							seendot++;
217							CHECK_SIZE_TOKEN;
218							*e_token++ = *buf_ptr++;
219							if (*buf_ptr == '+' || *buf_ptr == '-')
220								*e_token++ = *buf_ptr++;
221						}
222					}
223				}
224			}
225			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
226				/* float constant */
227				*e_token++ = *buf_ptr++;
228			} else {
229				/* integer constant */
230				while (1) {
231					if (!(seensfx & 1) &&
232					    (*buf_ptr == 'U' ||
233					     *buf_ptr == 'u')) {
234						CHECK_SIZE_TOKEN;
235						*e_token++ = *buf_ptr++;
236						seensfx |= 1;
237						continue;
238					}
239					if (!(seensfx & 2) &&
240					    (*buf_ptr == 'L' ||
241					     *buf_ptr == 'l')) {
242						CHECK_SIZE_TOKEN;
243						if (buf_ptr[1] == buf_ptr[0])
244							*e_token++ = *buf_ptr++;
245						*e_token++ = *buf_ptr++;
246						seensfx |= 2;
247						continue;
248					}
249					break;
250				}
251			}
252		} else
253			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
254				CHECK_SIZE_TOKEN;
255				*e_token++ = *buf_ptr++;
256				if (buf_ptr >= buf_end)
257					fill_buffer();
258			}
259		*e_token++ = '\0';
260		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
261			if (++buf_ptr >= buf_end)
262				fill_buffer();
263		}
264		ps.its_a_keyword = false;
265		ps.sizeof_keyword = false;
266		if (l_struct) {	/* if last token was 'struct', then this token
267				 * should be treated as a declaration */
268			l_struct = false;
269			last_code = ident;
270			ps.last_u_d = true;
271			return (decl);
272		}
273		ps.last_u_d = false;	/* Operator after indentifier is
274					 * binary */
275		last_code = ident;	/* Remember that this is the code we
276					 * will return */
277
278		/*
279		 * This loop will check if the token is a keyword.
280		 */
281		for (p = specials; (j = p->rwd) != 0; p++) {
282			char   *pt = s_token;	/* point at scanned token */
283			if (*j++ != *pt++ || *j++ != *pt++)
284				continue;	/* This test depends on the
285						 * fact that identifiers are
286						 * always at least 1 character
287						 * long (ie. the first two
288						 * bytes of the identifier are
289						 * always meaningful) */
290			if (pt[-1] == 0)
291				break;	/* If its a one-character identifier */
292			while (*pt++ == *j)
293				if (*j++ == 0)
294					goto found_keyword;	/* I wish that C had a
295								 * multi-level break... */
296		}
297		if (p->rwd) {	/* we have a keyword */
298	found_keyword:
299			ps.its_a_keyword = true;
300			ps.last_u_d = true;
301			switch (p->rwcode) {
302			case 1:/* it is a switch */
303				return (swstmt);
304			case 2:/* a case or default */
305				return (casestmt);
306
307			case 3:/* a "struct" */
308				if (ps.p_l_follow)
309					break;	/* inside parens: cast */
310				l_struct = true;
311
312				/*
313				 * Next time around, we will want to know that we have had a
314				 * 'struct'
315				 */
316			case 4:/* one of the declaration keywords */
317				if (ps.p_l_follow) {
318					ps.cast_mask |= 1 << ps.p_l_follow;
319					break;	/* inside parens: cast */
320				}
321				last_code = decl;
322				return (decl);
323
324			case 5:/* if, while, for */
325				return (sp_paren);
326
327			case 6:/* do, else */
328				return (sp_nparen);
329
330			case 7:
331				ps.sizeof_keyword = true;
332			default:	/* all others are treated like any
333					 * other identifier */
334				return (ident);
335			}	/* end of switch */
336		}		/* end of if (found_it) */
337		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
338			char   *tp = buf_ptr;
339			while (tp < buf_end)
340				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
341					goto not_proc;
342			strncpy(ps.procname, token, sizeof ps.procname - 1);
343			ps.in_parameter_declaration = 1;
344			rparen_count = 1;
345	not_proc:	;
346		}
347		/*
348		 * The following hack attempts to guess whether or not the current
349		 * token is in fact a declaration keyword -- one that has been
350		 * typedefd
351		 */
352		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
353		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
354		    && !ps.p_l_follow
355		    && !ps.block_init
356		    && (ps.last_token == rparen || ps.last_token == semicolon ||
357			ps.last_token == decl ||
358			ps.last_token == lbrace || ps.last_token == rbrace)) {
359			ps.its_a_keyword = true;
360			ps.last_u_d = true;
361			last_code = decl;
362			return decl;
363		}
364		if (last_code == decl)	/* if this is a declared variable,
365					 * then following sign is unary */
366			ps.last_u_d = true;	/* will make "int a -1" work */
367		last_code = ident;
368		return (ident);	/* the ident is not in the list */
369	}			/* end of procesing for alpanum character */
370	/* Scan a non-alphanumeric token */
371	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
372				 * moved here */
373	*e_token = '\0';
374	if (++buf_ptr >= buf_end)
375		fill_buffer();
376
377	switch (*token) {
378	case '\n':
379		unary_delim = ps.last_u_d;
380		ps.last_nl = true;	/* remember that we just had a newline */
381		code = (had_eof ? 0 : newline);
382
383		/*
384		 * if data has been exausted, the newline is a dummy, and we should
385		 * return code to stop
386		 */
387		break;
388
389	case '\'':		/* start of quoted character */
390	case '"':		/* start of string */
391		qchar = *token;
392		if (troff) {
393			e_token[-1] = '`';
394			if (qchar == '"')
395				*e_token++ = '`';
396			e_token = chfont(&bodyf, &stringf, e_token);
397		}
398		do {		/* copy the string */
399			while (1) {	/* move one character or
400					 * [/<char>]<char> */
401				if (*buf_ptr == '\n') {
402					printf("%d: Unterminated literal\n", line_no);
403					goto stop_lit;
404				}
405				CHECK_SIZE_TOKEN;	/* Only have to do this
406							 * once in this loop,
407							 * since CHECK_SIZE
408							 * guarantees that there
409							 * are at least 5
410							 * entries left */
411				*e_token = *buf_ptr++;
412				if (buf_ptr >= buf_end)
413					fill_buffer();
414				if (*e_token == BACKSLASH) {	/* if escape, copy extra
415								 * char */
416					if (*buf_ptr == '\n')	/* check for escaped
417								 * newline */
418						++line_no;
419					if (troff) {
420						*++e_token = BACKSLASH;
421						if (*buf_ptr == BACKSLASH)
422							*++e_token = BACKSLASH;
423					}
424					*++e_token = *buf_ptr++;
425					++e_token;	/* we must increment
426							 * this again because we
427							 * copied two chars */
428					if (buf_ptr >= buf_end)
429						fill_buffer();
430				} else
431					break;	/* we copied one character */
432			}	/* end of while (1) */
433		} while (*e_token++ != qchar);
434		if (troff) {
435			e_token = chfont(&stringf, &bodyf, e_token - 1);
436			if (qchar == '"')
437				*e_token++ = '\'';
438		}
439stop_lit:
440		code = ident;
441		break;
442
443	case ('('):
444	case ('['):
445		unary_delim = true;
446		code = lparen;
447		break;
448
449	case (')'):
450	case (']'):
451		code = rparen;
452		break;
453
454	case '#':
455		unary_delim = ps.last_u_d;
456		code = preesc;
457		break;
458
459	case '?':
460		unary_delim = true;
461		code = question;
462		break;
463
464	case (':'):
465		code = colon;
466		unary_delim = true;
467		break;
468
469	case (';'):
470		unary_delim = true;
471		code = semicolon;
472		break;
473
474	case ('{'):
475		unary_delim = true;
476
477		/*
478		 * if (ps.in_or_st) ps.block_init = 1;
479		 */
480		/* ?	code = ps.block_init ? lparen : lbrace; */
481		code = lbrace;
482		break;
483
484	case ('}'):
485		unary_delim = true;
486		/* ?	code = ps.block_init ? rparen : rbrace; */
487		code = rbrace;
488		break;
489
490	case 014:		/* a form feed */
491		unary_delim = ps.last_u_d;
492		ps.last_nl = true;	/* remember this so we can set
493					 * 'ps.col_1' right */
494		code = form_feed;
495		break;
496
497	case (','):
498		unary_delim = true;
499		code = comma;
500		break;
501
502	case '.':
503		unary_delim = false;
504		code = period;
505		break;
506
507	case '-':
508	case '+':		/* check for -, +, --, ++ */
509		code = (ps.last_u_d ? unary_op : binary_op);
510		unary_delim = true;
511
512		if (*buf_ptr == token[0]) {
513			/* check for doubled character */
514			*e_token++ = *buf_ptr++;
515			/* buffer overflow will be checked at end of loop */
516			if (last_code == ident || last_code == rparen) {
517				code = (ps.last_u_d ? unary_op : postop);
518				/* check for following ++ or -- */
519				unary_delim = false;
520			}
521		} else
522			if (*buf_ptr == '=')
523				/* check for operator += */
524				*e_token++ = *buf_ptr++;
525			else
526				if (*buf_ptr == '>') {
527					/* check for operator -> */
528					*e_token++ = *buf_ptr++;
529					if (!pointer_as_binop) {
530						unary_delim = false;
531						code = unary_op;
532						ps.want_blank = false;
533					}
534				}
535		break;		/* buffer overflow will be checked at end of
536				 * switch */
537
538	case '=':
539		if (ps.in_or_st)
540			ps.block_init = 1;
541#ifdef undef
542		if (chartype[*buf_ptr] == opchar) {	/* we have two char
543							 * assignment */
544			e_token[-1] = *buf_ptr++;
545			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
546				*e_token++ = *buf_ptr++;
547			*e_token++ = '=';	/* Flip =+ to += */
548			*e_token = 0;
549		}
550#else
551		if (*buf_ptr == '=') {	/* == */
552			*e_token++ = '=';	/* Flip =+ to += */
553			buf_ptr++;
554			*e_token = 0;
555		}
556#endif
557		code = binary_op;
558		unary_delim = true;
559		break;
560		/* can drop thru!!! */
561
562	case '>':
563	case '<':
564	case '!':		/* ops like <, <<, <=, !=, etc */
565		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
566			*e_token++ = *buf_ptr;
567			if (++buf_ptr >= buf_end)
568				fill_buffer();
569		}
570		if (*buf_ptr == '=')
571			*e_token++ = *buf_ptr++;
572		code = (ps.last_u_d ? unary_op : binary_op);
573		unary_delim = true;
574		break;
575
576	default:
577		if (token[0] == '/' && *buf_ptr == '*') {
578			/* it is start of comment */
579			*e_token++ = '*';
580
581			if (++buf_ptr >= buf_end)
582				fill_buffer();
583
584			code = comment;
585			unary_delim = ps.last_u_d;
586			break;
587		}
588		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
589			/*
590		         * handle ||, &&, etc, and also things as in int *****i
591		         */
592			*e_token++ = *buf_ptr;
593			if (++buf_ptr >= buf_end)
594				fill_buffer();
595		}
596		code = (ps.last_u_d ? unary_op : binary_op);
597		unary_delim = true;
598
599
600	}			/* end of switch */
601	if (code != newline) {
602		l_struct = false;
603		last_code = code;
604	}
605	if (buf_ptr >= buf_end)	/* check for input buffer empty */
606		fill_buffer();
607	ps.last_u_d = unary_delim;
608	*e_token = '\0';	/* null terminate the token */
609	return (code);
610}
611/*
612 * Add the given keyword to the keyword table, using val as the keyword type
613 */
614void
615addkey(char *key, int val)
616{
617	struct templ *p = specials;
618	while (p->rwd)
619		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
620			return;
621		else
622			p++;
623	if (p >= specials + sizeof specials / sizeof specials[0])
624		return;		/* For now, table overflows are silently
625				 * ignored */
626	p->rwd = key;
627	p->rwcode = val;
628	p[1].rwd = 0;
629	p[1].rwcode = 0;
630}
631