1/*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 *   list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 *   this list of conditions and the following disclaimer in the documentation
16 *   and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * The lexer for bc.
33 *
34 */
35
36#if BC_ENABLED
37
38#include <assert.h>
39#include <ctype.h>
40#include <string.h>
41
42#include <bc.h>
43#include <vm.h>
44
45/**
46 * Lexes an identifier, which may be a keyword.
47 * @param l  The lexer.
48 */
49static void
50bc_lex_identifier(BcLex* l)
51{
52	// We already passed the first character, so we need to be sure to include
53	// it.
54	const char* buf = l->buf + l->i - 1;
55	size_t i;
56
57	// This loop is simply checking for keywords.
58	for (i = 0; i < bc_lex_kws_len; ++i)
59	{
60		const BcLexKeyword* kw = bc_lex_kws + i;
61		size_t n = BC_LEX_KW_LEN(kw);
62
63		if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64		{
65			// If the keyword has been redefined, and redefinition is allowed
66			// (it is not allowed for builtin libraries), break out of the loop
67			// and use it as a name. This depends on the argument parser to
68			// ensure that only non-POSIX keywords get redefined.
69			if (!vm->no_redefine && vm->redefined_kws[i]) break;
70
71			l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72
73			// Warn or error, as appropriate for the mode, if the keyword is not
74			// in the POSIX standard.
75			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76
77			// We minus 1 because the index has already been incremented.
78			l->i += n - 1;
79
80			// Already have the token; bail.
81			return;
82		}
83	}
84
85	// If not a keyword, parse the name.
86	bc_lex_name(l);
87
88	// POSIX doesn't allow identifiers that are more than one character, so we
89	// might have to warn or error here too.
90	if (BC_ERR(l->str.len - 1 > 1))
91	{
92		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93	}
94}
95
96/**
97 * Parses a bc string. This is separate from dc strings because dc strings need
98 * to be balanced.
99 * @param l  The lexer.
100 */
101static void
102bc_lex_string(BcLex* l)
103{
104	// We need to keep track of newlines to increment them properly.
105	size_t len, nlines, i;
106	const char* buf;
107	char c;
108	bool got_more;
109
110	l->t = BC_LEX_STR;
111
112	do
113	{
114		nlines = 0;
115		buf = l->buf;
116		got_more = false;
117
118		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
119
120		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
121		// is '\q', which makes this loop simpler.
122		for (i = l->i; (c = buf[i]) && c != '"'; ++i)
123		{
124			nlines += (c == '\n');
125		}
126
127		if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
128		{
129			got_more = bc_lex_readLine(l);
130		}
131	}
132	while (got_more && c != '"');
133
134	// If the string did not end properly, barf.
135	if (c != '"')
136	{
137		l->i = i;
138		bc_lex_err(l, BC_ERR_PARSE_STRING);
139	}
140
141	// Set the temp string to the parsed string.
142	len = i - l->i;
143	bc_vec_string(&l->str, len, l->buf + l->i);
144
145	l->i = i + 1;
146	l->line += nlines;
147}
148
149/**
150 * This function takes a lexed operator and checks to see if it's the assignment
151 * version, setting the token appropriately.
152 * @param l        The lexer.
153 * @param with     The token to assign if it is an assignment operator.
154 * @param without  The token to assign if it is not an assignment operator.
155 */
156static void
157bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
158{
159	if (l->buf[l->i] == '=')
160	{
161		l->i += 1;
162		l->t = with;
163	}
164	else l->t = without;
165}
166
167void
168bc_lex_token(BcLex* l)
169{
170	// We increment here. This means that all lexing needs to take that into
171	// account, such as when parsing an identifier. If we don't, the first
172	// character of every identifier would be missing.
173	char c = l->buf[l->i++], c2;
174
175	BC_SIG_ASSERT_LOCKED;
176
177	// This is the workhorse of the lexer.
178	switch (c)
179	{
180		case '\0':
181		case '\n':
182		case '\t':
183		case '\v':
184		case '\f':
185		case '\r':
186		case ' ':
187		{
188			bc_lex_commonTokens(l, c);
189			break;
190		}
191
192		case '!':
193		{
194			// Even though it's not an assignment, we can use this.
195			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
196
197			// POSIX doesn't allow boolean not.
198			if (l->t == BC_LEX_OP_BOOL_NOT)
199			{
200				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
201			}
202
203			break;
204		}
205
206		case '"':
207		{
208			bc_lex_string(l);
209			break;
210		}
211
212		case '#':
213		{
214			// POSIX does not allow line comments.
215			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
216			bc_lex_lineComment(l);
217			break;
218		}
219
220		case '%':
221		{
222			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
223			break;
224		}
225
226		case '&':
227		{
228			c2 = l->buf[l->i];
229
230			// Either we have boolean and or an error. And boolean and is not
231			// allowed by POSIX.
232			if (BC_NO_ERR(c2 == '&'))
233			{
234				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
235
236				l->i += 1;
237				l->t = BC_LEX_OP_BOOL_AND;
238			}
239			else bc_lex_invalidChar(l, c);
240
241			break;
242		}
243#if BC_ENABLE_EXTRA_MATH
244		case '$':
245		{
246			l->t = BC_LEX_OP_TRUNC;
247			break;
248		}
249
250		case '@':
251		{
252			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
253			break;
254		}
255#endif // BC_ENABLE_EXTRA_MATH
256		case '(':
257		case ')':
258		{
259			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
260			break;
261		}
262
263		case '*':
264		{
265			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
266			break;
267		}
268
269		case '+':
270		{
271			c2 = l->buf[l->i];
272
273			// Have to check for increment first.
274			if (c2 == '+')
275			{
276				l->i += 1;
277				l->t = BC_LEX_OP_INC;
278			}
279			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
280			break;
281		}
282
283		case ',':
284		{
285			l->t = BC_LEX_COMMA;
286			break;
287		}
288
289		case '-':
290		{
291			c2 = l->buf[l->i];
292
293			// Have to check for decrement first.
294			if (c2 == '-')
295			{
296				l->i += 1;
297				l->t = BC_LEX_OP_DEC;
298			}
299			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
300			break;
301		}
302
303		case '.':
304		{
305			c2 = l->buf[l->i];
306
307			// If it's alone, it's an alias for last.
308			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
309			else
310			{
311				l->t = BC_LEX_KW_LAST;
312				bc_lex_err(l, BC_ERR_POSIX_DOT);
313			}
314
315			break;
316		}
317
318		case '/':
319		{
320			c2 = l->buf[l->i];
321			if (c2 == '*') bc_lex_comment(l);
322			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
323			break;
324		}
325
326		case '0':
327		case '1':
328		case '2':
329		case '3':
330		case '4':
331		case '5':
332		case '6':
333		case '7':
334		case '8':
335		case '9':
336		case 'A':
337		case 'B':
338		case 'C':
339		case 'D':
340		case 'E':
341		case 'F':
342		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
343		// a number. When single digits, they act like the ones above. When
344		// multi-digit, any letter above the input base is automatically set to
345		// the biggest allowable digit in the input base.
346		case 'G':
347		case 'H':
348		case 'I':
349		case 'J':
350		case 'K':
351		case 'L':
352		case 'M':
353		case 'N':
354		case 'O':
355		case 'P':
356		case 'Q':
357		case 'R':
358		case 'S':
359		case 'T':
360		case 'U':
361		case 'V':
362		case 'W':
363		case 'X':
364		case 'Y':
365		case 'Z':
366		{
367			bc_lex_number(l, c);
368			break;
369		}
370
371		case ';':
372		{
373			l->t = BC_LEX_SCOLON;
374			break;
375		}
376
377		case '<':
378		{
379#if BC_ENABLE_EXTRA_MATH
380			c2 = l->buf[l->i];
381
382			// Check for shift.
383			if (c2 == '<')
384			{
385				l->i += 1;
386				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
387				break;
388			}
389#endif // BC_ENABLE_EXTRA_MATH
390			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
391			break;
392		}
393
394		case '=':
395		{
396			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
397			break;
398		}
399
400		case '>':
401		{
402#if BC_ENABLE_EXTRA_MATH
403			c2 = l->buf[l->i];
404
405			// Check for shift.
406			if (c2 == '>')
407			{
408				l->i += 1;
409				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
410				break;
411			}
412#endif // BC_ENABLE_EXTRA_MATH
413			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
414			break;
415		}
416
417		case '[':
418		case ']':
419		{
420			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
421			break;
422		}
423
424		case '\\':
425		{
426			// In bc, a backslash+newline is whitespace.
427			if (BC_NO_ERR(l->buf[l->i] == '\n'))
428			{
429				l->i += 1;
430				l->t = BC_LEX_WHITESPACE;
431			}
432			else bc_lex_invalidChar(l, c);
433			break;
434		}
435
436		case '^':
437		{
438			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
439			break;
440		}
441
442		case 'a':
443		case 'b':
444		case 'c':
445		case 'd':
446		case 'e':
447		case 'f':
448		case 'g':
449		case 'h':
450		case 'i':
451		case 'j':
452		case 'k':
453		case 'l':
454		case 'm':
455		case 'n':
456		case 'o':
457		case 'p':
458		case 'q':
459		case 'r':
460		case 's':
461		case 't':
462		case 'u':
463		case 'v':
464		case 'w':
465		case 'x':
466		case 'y':
467		case 'z':
468		{
469			bc_lex_identifier(l);
470			break;
471		}
472
473		case '{':
474		case '}':
475		{
476			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
477			break;
478		}
479
480		case '|':
481		{
482			c2 = l->buf[l->i];
483
484			// Once again, boolean or is not allowed by POSIX.
485			if (BC_NO_ERR(c2 == '|'))
486			{
487				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
488
489				l->i += 1;
490				l->t = BC_LEX_OP_BOOL_OR;
491			}
492			else bc_lex_invalidChar(l, c);
493
494			break;
495		}
496
497		default:
498		{
499			bc_lex_invalidChar(l, c);
500		}
501	}
502}
503#endif // BC_ENABLED
504