1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "awkgram.tab.h"
31
extern YYSTYPE	yylval;	/* value of the token being returned; filled in by yylex() and its helpers */
extern bool	infunc;	/* consulted by word() and by the '$' case of yylex() */

/*
 * Lexer bookkeeping.  lineno is bumped by yylex()/string() when they
 * consume a newline and decremented by unput() when one is pushed back.
 * Each bracket counter goes up on the opening character and down on the
 * closing one in yylex(), which reports a syntax error if it goes negative.
 */
int	lineno	= 1;
int	bracecnt = 0;
int	brackcnt  = 0;
int	parencnt = 0;
39
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling, compared with strcmp() in binsearch() */
	int	sub;		/* stored in yylval.i by word() when the keyword matches */
	int	type;		/* token type that word() returns for this keyword */
} Keyword;
45
/*
 * Reserved words and built-in function names recognized by word().
 * Lookup is done with binsearch(), so entries must remain in strcmp()
 * order (upper-case letters sort before lower-case ones).  Built-in
 * functions share the BLTIN token type and are told apart by `sub`.
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "gensub",	GENSUB,		GENSUB },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "strftime",	FSTRFTIME,	BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "systime",	FSYSTIME,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
100
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name (via tokname) when dbg is set.  The expansion is a bare { } block,
 * not do { } while (0), so the ';' written after RET(...) is a separate
 * empty statement: do not use it as the unbraced body of an `if` that is
 * followed by `else`.
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
102
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(); 0 is returned (and pushed back) at end of input.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
109
110static int gettok(char **pbuf, int *psz)	/* get next input token */
111{
112	int c, retc;
113	char *buf = *pbuf;
114	int sz = *psz;
115	char *bp = buf;
116
117	c = input();
118	if (c == 0)
119		return 0;
120	buf[0] = c;
121	buf[1] = 0;
122	if (!isalnum(c) && c != '.' && c != '_')
123		return c;
124
125	*bp++ = c;
126	if (isalpha(c) || c == '_') {	/* it's a varname */
127		for ( ; (c = input()) != 0; ) {
128			if (bp-buf >= sz)
129				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
130					FATAL( "out of space for name %.10s...", buf );
131			if (isalnum(c) || c == '_')
132				*bp++ = c;
133			else {
134				*bp = 0;
135				unput(c);
136				break;
137			}
138		}
139		*bp = 0;
140		retc = 'a';	/* alphanumeric */
141	} else {	/* maybe it's a number, but could be . */
142		char *rem;
143		/* read input until can't be a number */
144		for ( ; (c = input()) != 0; ) {
145			if (bp-buf >= sz)
146				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
147					FATAL( "out of space for number %.10s...", buf );
148			if (isdigit(c) || c == 'e' || c == 'E'
149			  || c == '.' || c == '+' || c == '-')
150				*bp++ = c;
151			else {
152				unput(c);
153				break;
154			}
155		}
156		*bp = 0;
157		strtod(buf, &rem);	/* parse the number */
158		if (rem == buf) {	/* it wasn't a valid number at all */
159			buf[1] = 0;	/* return one character as token */
160			retc = (uschar)buf[0];	/* character is its own type */
161			unputstr(rem+1); /* put rest back for later */
162		} else {	/* some prefix was a number */
163			unputstr(rem);	/* put rest back for later */
164			rem[0] = 0;	/* truncate buf after number part */
165			retc = '0';	/* type is number */
166		}
167	}
168	*pbuf = buf;
169	*psz = sz;
170	return retc;
171}
172
/* forward declarations of lexer helpers defined later in this file */
int	word(char *);
int	string(void);
int	regexpr(void);
/* one-shot flags, tested and cleared at the top of yylex() */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
178
/*
 * yylex: return the next token to the parser, or 0 at end of input.
 *
 * Pending one-shot states are served first: `sc` delivers the '}' that was
 * held back when a ';' was returned in its place, and `reg` (set by
 * startreg()) hands the call over to regexpr().  Otherwise raw tokens come
 * from gettok(): names are classified by word(), numbers are entered in
 * symtab and returned as NUMBER, and everything else is handled by the
 * switch on the token's first character.  yylval receives the token value
 * wherever one is assigned below.
 */
int yylex(void)
{
	int c;
	static char *buf = NULL;
	static int bufsize = 5; /* BUG: setting this small causes core dump! */

	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
		FATAL( "out of space in yylex" );
	if (sc) {
		sc = false;
		RET('}');
	}
	if (reg) {
		reg = false;
		return regexpr();
	}
	for (;;) {
		c = gettok(&buf, &bufsize);
		if (c == 0)
			return 0;
		if (isalpha(c) || c == '_')
			return word(buf);
		if (isdigit(c)) {
			char *cp = tostring(buf);
			double result;

			if (is_number(cp, & result))
				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
			else
				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
			free(cp);
			/* should this also have STR set? */
			RET(NUMBER);
		}

		yylval.i = c;
		switch (c) {
		case '\n':	/* {EOL} */
			lineno++;
			RET(NL);
		case '\r':	/* assume \n is coming */
		case ' ':	/* {WS}+ */
		case '\t':
			break;	/* nothing to return: loop for the next token */
		case '#':	/* #.* strip comments */
			while ((c = input()) != '\n' && c != 0)
				;
			unput(c);
			/*
			 * Next line is a hack, it compensates for
			 * unput's treatment of \n.
			 */
			lineno++;
			break;
		case ';':
			RET(';');
		case '\\':
			/* backslash-newline (or backslash-CR-LF) is a line continuation */
			if (peek() == '\n') {
				input();
				lineno++;
			} else if (peek() == '\r') {
				input(); input();	/* \n */
				lineno++;
			} else {
				RET(c);
			}
			break;
		case '&':
			if (peek() == '&') {
				input(); RET(AND);
			} else
				RET('&');
		case '|':
			if (peek() == '|') {
				input(); RET(BOR);
			} else
				RET('|');
		case '!':
			if (peek() == '=') {
				input(); yylval.i = NE; RET(NE);
			} else if (peek() == '~') {
				input(); yylval.i = NOTMATCH; RET(MATCHOP);
			} else
				RET(NOT);
		case '~':
			yylval.i = MATCH;
			RET(MATCHOP);
		case '<':
			if (peek() == '=') {
				input(); yylval.i = LE; RET(LE);
			} else {
				yylval.i = LT; RET(LT);
			}
		case '=':
			if (peek() == '=') {
				input(); yylval.i = EQ; RET(EQ);
			} else {
				yylval.i = ASSIGN; RET(ASGNOP);
			}
		case '>':
			if (peek() == '=') {
				input(); yylval.i = GE; RET(GE);
			} else if (peek() == '>') {
				input(); yylval.i = APPEND; RET(APPEND);
			} else {
				yylval.i = GT; RET(GT);
			}
		case '+':
			if (peek() == '+') {
				input(); yylval.i = INCR; RET(INCR);
			} else if (peek() == '=') {
				input(); yylval.i = ADDEQ; RET(ASGNOP);
			} else
				RET('+');
		case '-':
			if (peek() == '-') {
				input(); yylval.i = DECR; RET(DECR);
			} else if (peek() == '=') {
				input(); yylval.i = SUBEQ; RET(ASGNOP);
			} else
				RET('-');
		case '*':
			if (peek() == '=') {	/* *= */
				input(); yylval.i = MULTEQ; RET(ASGNOP);
			} else if (peek() == '*') {	/* ** or **= */
				input();	/* eat 2nd * */
				if (peek() == '=') {
					input(); yylval.i = POWEQ; RET(ASGNOP);
				} else {
					RET(POWER);
				}
			} else
				RET('*');
		case '/':
			RET('/');
		case '%':
			if (peek() == '=') {
				input(); yylval.i = MODEQ; RET(ASGNOP);
			} else
				RET('%');
		case '^':
			if (peek() == '=') {
				input(); yylval.i = POWEQ; RET(ASGNOP);
			} else
				RET(POWER);

		case '$':
			/* BUG: awkward, if not wrong */
			c = gettok(&buf, &bufsize);
			if (isalpha(c)) {
				if (strcmp(buf, "NF") == 0) {	/* very special */
					unputstr("(NF)");
					RET(INDIRECT);
				}
				c = peek();
				/* a call, subscript or function argument: re-scan the name after INDIRECT */
				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
					unputstr(buf);
					RET(INDIRECT);
				}
				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
				RET(IVAR);
			} else if (c == 0) {	/*  */
				SYNTAX( "unexpected end of input after $" );
				RET(';');
			} else {
				unputstr(buf);
				RET(INDIRECT);
			}

		case '}':
			if (--bracecnt < 0)
				SYNTAX( "extra }" );
			sc = true;	/* return ';' now; the '}' follows on the next call */
			RET(';');
		case ']':
			if (--brackcnt < 0)
				SYNTAX( "extra ]" );
			RET(']');
		case ')':
			if (--parencnt < 0)
				SYNTAX( "extra )" );
			RET(')');
		case '{':
			bracecnt++;
			RET('{');
		case '[':
			brackcnt++;
			RET('[');
		case '(':
			parencnt++;
			RET('(');

		case '"':
			return string();	/* BUG: should be like tran.c ? */

		default:
			RET(c);
		}
	}
}
379
/*
 * Defined outside this file.  string() calls it to store the value of a
 * \u escape at its write pointer and advances the pointer by the result.
 */
extern int runetochar(char *str, int c);
381
382int string(void)
383{
384	int c, n;
385	char *s, *bp;
386	static char *buf = NULL;
387	static int bufsz = 500;
388
389	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390		FATAL("out of space for strings");
391	for (bp = buf; (c = input()) != '"'; ) {
392		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393			FATAL("out of space for string %.10s...", buf);
394		switch (c) {
395		case '\n':
396		case '\r':
397		case 0:
398			*bp = '\0';
399			SYNTAX( "non-terminated string %.10s...", buf );
400			if (c == 0)	/* hopeless */
401				FATAL( "giving up" );
402			lineno++;
403			break;
404		case '\\':
405			c = input();
406			switch (c) {
407			case '\n': break;
408			case '"': *bp++ = '"'; break;
409			case 'n': *bp++ = '\n'; break;
410			case 't': *bp++ = '\t'; break;
411			case 'f': *bp++ = '\f'; break;
412			case 'r': *bp++ = '\r'; break;
413			case 'b': *bp++ = '\b'; break;
414			case 'v': *bp++ = '\v'; break;
415			case 'a': *bp++ = '\a'; break;
416			case '\\': *bp++ = '\\'; break;
417
418			case '0': case '1': case '2': /* octal: \d \dd \ddd */
419			case '3': case '4': case '5': case '6': case '7':
420				n = c - '0';
421				if ((c = peek()) >= '0' && c < '8') {
422					n = 8 * n + input() - '0';
423					if ((c = peek()) >= '0' && c < '8')
424						n = 8 * n + input() - '0';
425				}
426				*bp++ = n;
427				break;
428
429			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
430			    {
431				int i;
432
433				if (!isxdigit(peek())) {
434					unput(c);
435					break;
436				}
437				n = 0;
438				for (i = 0; i < 2; i++) {
439					c = input();
440					if (c == 0)
441						break;
442					if (isxdigit(c)) {
443						c = tolower(c);
444						n *= 16;
445						if (isdigit(c))
446							n += (c - '0');
447						else
448							n += 10 + (c - 'a');
449					} else {
450						unput(c);
451						break;
452					}
453				}
454				if (i)
455					*bp++ = n;
456				break;
457			    }
458
459			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
460			    {
461				int i;
462
463				n = 0;
464				for (i = 0; i < 8; i++) {
465					c = input();
466					if (!isxdigit(c) || c == 0)
467						break;
468					c = tolower(c);
469					n *= 16;
470					if (isdigit(c))
471						n += (c - '0');
472					else
473						n += 10 + (c - 'a');
474				}
475				unput(c);
476				bp += runetochar(bp, n);
477				break;
478			    }
479
480			default:
481				*bp++ = c;
482				break;
483			}
484			break;
485		default:
486			*bp++ = c;
487			break;
488		}
489	}
490	*bp = 0;
491	s = tostring(buf);
492	*bp++ = ' '; *bp++ = '\0';
493	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
494	free(s);
495	RET(STRING);
496}
497
498
499static int binsearch(char *w, const Keyword *kp, int n)
500{
501	int cond, low, mid, high;
502
503	low = 0;
504	high = n - 1;
505	while (low <= high) {
506		mid = (low + high) / 2;
507		if ((cond = strcmp(w, kp[mid].word)) < 0)
508			high = mid - 1;
509		else if (cond > 0)
510			low = mid + 1;
511		else
512			return mid;
513	}
514	return -1;
515}
516
517int word(char *w)
518{
519	const Keyword *kp;
520	int c, n;
521
522	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
523	if (n != -1) {	/* found in table */
524		kp = keywords + n;
525		yylval.i = kp->sub;
526		switch (kp->type) {	/* special handling */
527		case BLTIN:
528			if (kp->sub == FSYSTEM && safe)
529				SYNTAX( "system is unsafe" );
530			RET(kp->type);
531		case FUNC:
532			if (infunc)
533				SYNTAX( "illegal nested function" );
534			RET(kp->type);
535		case RETURN:
536			if (!infunc)
537				SYNTAX( "return not in function" );
538			RET(kp->type);
539		case VARNF:
540			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
541			RET(VARNF);
542		default:
543			RET(kp->type);
544		}
545	}
546	c = peek();	/* look for '(' */
547	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
548		yylval.i = n;
549		RET(ARG);
550	} else {
551		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
552		if (c == '(') {
553			RET(CALL);
554		} else {
555			RET(VAR);
556		}
557	}
558}
559
/*
 * startreg: arm the `reg` flag.  yylex() tests it on entry, clears it and
 * returns the result of regexpr() instead of scanning an ordinary token.
 */
void startreg(void)	/* next call to yylex will return a regular expression */
{
	reg = true;
}
564
565int regexpr(void)
566{
567	int c;
568	static char *buf = NULL;
569	static int bufsz = 500;
570	char *bp;
571
572	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
573		FATAL("out of space for reg expr");
574	bp = buf;
575	for ( ; (c = input()) != '/' && c != 0; ) {
576		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
577			FATAL("out of space for reg expr %.10s...", buf);
578		if (c == '\n') {
579			*bp = '\0';
580			SYNTAX( "newline in regular expression %.10s...", buf );
581			unput('\n');
582			break;
583		} else if (c == '\\') {
584			*bp++ = '\\';
585			*bp++ = input();
586		} else {
587			*bp++ = c;
588		}
589	}
590	*bp = 0;
591	if (c == 0)
592		SYNTAX("non-terminated regular expression %.10s...", buf);
593	yylval.s = tostring(buf);
594	unput('/');
595	RET(REGEXPR);
596}
597
598/* low-level lexical stuff, sort of inherited from lex */
599
char	ebuf[300];	/* circular record of characters returned by input() */
char	*ep = ebuf;	/* next slot in ebuf; advanced by input(), stepped back by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* not referenced by the functions in this part of the file */
605
606int input(void)	/* get next lexical input character */
607{
608	int c;
609	extern char *lexprog;
610
611	if (yysptr > yysbuf)
612		c = (uschar)*--yysptr;
613	else if (lexprog != NULL) {	/* awk '...' */
614		if ((c = (uschar)*lexprog) != 0)
615			lexprog++;
616	} else				/* awk -f ... */
617		c = pgetc();
618	if (c == EOF)
619		c = 0;
620	if (ep >= ebuf + sizeof ebuf)
621		ep = ebuf;
622	*ep = c;
623	if (c != 0) {
624		ep++;
625	}
626	return (c);
627}
628
629void unput(int c)	/* put lexical character back on input */
630{
631	if (c == '\n')
632		lineno--;
633	if (yysptr >= yysbuf + sizeof(yysbuf))
634		FATAL("pushed back too much: %.20s...", yysbuf);
635	*yysptr++ = c;
636	if (--ep < ebuf)
637		ep = ebuf + sizeof(ebuf) - 1;
638}
639
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * handed to unput() last-to-first so that input() yields them again in
 * their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
647