1/*	$OpenBSD: tokenizer.c,v 1.21 2016/04/11 21:17:29 schwarze Exp $	*/
2/*	$NetBSD: tokenizer.c,v 1.28 2016/04/11 18:56:31 christos Exp $	*/
3
4/*-
5 * Copyright (c) 1992, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Christos Zoulas of Cornell University.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#include "config.h"
37
38/* We build this file twice, once as NARROW, once as WIDE. */
39/*
 * tokenizer.c: Bourne shell like tokenizer
41 */
42#include <stdlib.h>
43#include <string.h>
44
45#include "histedit.h"
46
/*
 * Quoting state of the tokenizer between two input characters.
 */
typedef enum {
	Q_none,		/* Not inside any quoting			 */
	Q_single,	/* Inside '...'					 */
	Q_double,	/* Inside "..."					 */
	Q_one,		/* Backslash seen while in Q_none		 */
	Q_doubleone	/* Backslash seen while in Q_double		 */
} quote_t;
50
/*
 * Bits stored in the tokenizer's flags member.
 *   TOK_KEEP	a quote or backslash character was seen since the last
 *		word was finished, so the word is added to argv even if
 *		it turns out to be empty
 *   TOK_EAT	the most recent character was a backslash-escaped newline
 *		that was dropped; if the input ends in this state the
 *		line tokenizer returns 3
 */
#define	TOK_KEEP	1
#define	TOK_EAT		2

/* Initial size and growth step of the word buffer, counted in Chars. */
#define	WINCR		20
/* Initial size and growth step of the argument vector, in entries. */
#define	AINCR		10

/* Separators used when the init function is handed a NULL ifs. */
#define	IFS		STR("\t \n")
58
/*
 * Character width selection.
 *   Char		character type the tokenizer works on
 *   FUN(p, r)		function name: p_r when narrow, p_wr when wide
 *   TYPE(t)		type name: t when narrow, tW when wide
 *   STR(x)		string literal, given an L prefix when wide
 *   Strchr		strchr() or wcschr()
 *   tok_strdup		strdup() or wcsdup()
 */
#ifdef NARROWCHAR
#define	Char			char
#define	FUN(prefix, rest)	prefix ## _ ## rest
#define	TYPE(type)		type
#define	STR(x)			x
#define	Strchr(s, c)		strchr(s, c)
#define	tok_strdup(s)		strdup(s)
#else
#define	Char			wchar_t
#define	FUN(prefix, rest)	prefix ## _w ## rest
#define	TYPE(type)		type ## W
#define	STR(x)			L ## x
#define	Strchr(s, c)		wcschr(s, c)
#define	tok_strdup(s)		wcsdup(s)
#endif
74
75struct TYPE(tokenizer) {
76	Char	*ifs;		/* In field separator			 */
77	int	 argc, amax;	/* Current and maximum number of args	 */
78	Char   **argv;		/* Argument list			 */
79	Char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
80	Char	*wstart;	/* Beginning of next word		 */
81	Char	*wspace;	/* Space of word buffer			 */
82	quote_t	 quote;		/* Quoting state			 */
83	int	 flags;		/* flags;				 */
84};
85
86
87static void FUN(tok,finish)(TYPE(Tokenizer) *);
88
89
90/* FUN(tok,finish)():
91 *	Finish a word in the tokenizer.
92 */
93static void
94FUN(tok,finish)(TYPE(Tokenizer) *tok)
95{
96
97	*tok->wptr = '\0';
98	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
99		tok->argv[tok->argc++] = tok->wstart;
100		tok->argv[tok->argc] = NULL;
101		tok->wstart = ++tok->wptr;
102	}
103	tok->flags &= ~TOK_KEEP;
104}
105
106
107/* FUN(tok,init)():
108 *	Initialize the tokenizer
109 */
110TYPE(Tokenizer) *
111FUN(tok,init)(const Char *ifs)
112{
113	TYPE(Tokenizer) *tok = malloc(sizeof(TYPE(Tokenizer)));
114
115	if (tok == NULL)
116		return NULL;
117	tok->ifs = tok_strdup(ifs ? ifs : IFS);
118	if (tok->ifs == NULL) {
119		free(tok);
120		return NULL;
121	}
122	tok->argc = 0;
123	tok->amax = AINCR;
124	tok->argv = reallocarray(NULL, tok->amax, sizeof(*tok->argv));
125	if (tok->argv == NULL) {
126		free(tok->ifs);
127		free(tok);
128		return NULL;
129	}
130	tok->argv[0] = NULL;
131	tok->wspace = reallocarray(NULL, WINCR, sizeof(*tok->wspace));
132	if (tok->wspace == NULL) {
133		free(tok->argv);
134		free(tok->ifs);
135		free(tok);
136		return NULL;
137	}
138	tok->wmax = tok->wspace + WINCR;
139	tok->wstart = tok->wspace;
140	tok->wptr = tok->wspace;
141	tok->flags = 0;
142	tok->quote = Q_none;
143
144	return tok;
145}
146
147
148/* FUN(tok,reset)():
149 *	Reset the tokenizer
150 */
151void
152FUN(tok,reset)(TYPE(Tokenizer) *tok)
153{
154
155	tok->argc = 0;
156	tok->wstart = tok->wspace;
157	tok->wptr = tok->wspace;
158	tok->flags = 0;
159	tok->quote = Q_none;
160}
161
162
163/* FUN(tok,end)():
164 *	Clean up
165 */
166void
167FUN(tok,end)(TYPE(Tokenizer) *tok)
168{
169
170	free(tok->ifs);
171	free(tok->wspace);
172	free(tok->argv);
173	free(tok);
174}
175
176
177
178/* FUN(tok,line)():
179 *	Bourne shell (sh(1)) like tokenizing
180 *	Arguments:
181 *		tok	current tokenizer state (setup with FUN(tok,init)())
182 *		line	line to parse
183 *	Returns:
184 *		-1	Internal error
185 *		 3	Quoted return
186 *		 2	Unmatched double quote
187 *		 1	Unmatched single quote
188 *		 0	Ok
189 *	Modifies (if return value is 0):
190 *		argc	number of arguments
191 *		argv	argument array
192 *		cursorc	if !NULL, argv element containing cursor
193 *		cursorv	if !NULL, offset in argv[cursorc] of cursor
194 */
195int
196FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
197    int *argc, const Char ***argv, int *cursorc, int *cursoro)
198{
199	const Char *ptr;
200	int cc, co;
201
202	cc = co = -1;
203	ptr = line->buffer;
204	for (ptr = line->buffer; ;ptr++) {
205		if (ptr >= line->lastchar)
206			ptr = STR("");
207		if (ptr == line->cursor) {
208			cc = tok->argc;
209			co = (int)(tok->wptr - tok->wstart);
210		}
211		switch (*ptr) {
212		case '\'':
213			tok->flags |= TOK_KEEP;
214			tok->flags &= ~TOK_EAT;
215			switch (tok->quote) {
216			case Q_none:
217				tok->quote = Q_single;	/* Enter single quote
218							 * mode */
219				break;
220
221			case Q_single:	/* Exit single quote mode */
222				tok->quote = Q_none;
223				break;
224
225			case Q_one:	/* Quote this ' */
226				tok->quote = Q_none;
227				*tok->wptr++ = *ptr;
228				break;
229
230			case Q_double:	/* Stay in double quote mode */
231				*tok->wptr++ = *ptr;
232				break;
233
234			case Q_doubleone:	/* Quote this ' */
235				tok->quote = Q_double;
236				*tok->wptr++ = *ptr;
237				break;
238
239			default:
240				return -1;
241			}
242			break;
243
244		case '"':
245			tok->flags &= ~TOK_EAT;
246			tok->flags |= TOK_KEEP;
247			switch (tok->quote) {
248			case Q_none:	/* Enter double quote mode */
249				tok->quote = Q_double;
250				break;
251
252			case Q_double:	/* Exit double quote mode */
253				tok->quote = Q_none;
254				break;
255
256			case Q_one:	/* Quote this " */
257				tok->quote = Q_none;
258				*tok->wptr++ = *ptr;
259				break;
260
261			case Q_single:	/* Stay in single quote mode */
262				*tok->wptr++ = *ptr;
263				break;
264
265			case Q_doubleone:	/* Quote this " */
266				tok->quote = Q_double;
267				*tok->wptr++ = *ptr;
268				break;
269
270			default:
271				return -1;
272			}
273			break;
274
275		case '\\':
276			tok->flags |= TOK_KEEP;
277			tok->flags &= ~TOK_EAT;
278			switch (tok->quote) {
279			case Q_none:	/* Quote next character */
280				tok->quote = Q_one;
281				break;
282
283			case Q_double:	/* Quote next character */
284				tok->quote = Q_doubleone;
285				break;
286
287			case Q_one:	/* Quote this, restore state */
288				*tok->wptr++ = *ptr;
289				tok->quote = Q_none;
290				break;
291
292			case Q_single:	/* Stay in single quote mode */
293				*tok->wptr++ = *ptr;
294				break;
295
296			case Q_doubleone:	/* Quote this \ */
297				tok->quote = Q_double;
298				*tok->wptr++ = *ptr;
299				break;
300
301			default:
302				return -1;
303			}
304			break;
305
306		case '\n':
307			tok->flags &= ~TOK_EAT;
308			switch (tok->quote) {
309			case Q_none:
310				goto tok_line_outok;
311
312			case Q_single:
313			case Q_double:
314				*tok->wptr++ = *ptr;	/* Add the return */
315				break;
316
317			case Q_doubleone:   /* Back to double, eat the '\n' */
318				tok->flags |= TOK_EAT;
319				tok->quote = Q_double;
320				break;
321
322			case Q_one:	/* No quote, more eat the '\n' */
323				tok->flags |= TOK_EAT;
324				tok->quote = Q_none;
325				break;
326
327			default:
328				return 0;
329			}
330			break;
331
332		case '\0':
333			switch (tok->quote) {
334			case Q_none:
335				/* Finish word and return */
336				if (tok->flags & TOK_EAT) {
337					tok->flags &= ~TOK_EAT;
338					return 3;
339				}
340				goto tok_line_outok;
341
342			case Q_single:
343				return 1;
344
345			case Q_double:
346				return 2;
347
348			case Q_doubleone:
349				tok->quote = Q_double;
350				*tok->wptr++ = *ptr;
351				break;
352
353			case Q_one:
354				tok->quote = Q_none;
355				*tok->wptr++ = *ptr;
356				break;
357
358			default:
359				return -1;
360			}
361			break;
362
363		default:
364			tok->flags &= ~TOK_EAT;
365			switch (tok->quote) {
366			case Q_none:
367				if (Strchr(tok->ifs, *ptr) != NULL)
368					FUN(tok,finish)(tok);
369				else
370					*tok->wptr++ = *ptr;
371				break;
372
373			case Q_single:
374			case Q_double:
375				*tok->wptr++ = *ptr;
376				break;
377
378
379			case Q_doubleone:
380				*tok->wptr++ = '\\';
381				tok->quote = Q_double;
382				*tok->wptr++ = *ptr;
383				break;
384
385			case Q_one:
386				tok->quote = Q_none;
387				*tok->wptr++ = *ptr;
388				break;
389
390			default:
391				return -1;
392
393			}
394			break;
395		}
396
397		if (tok->wptr >= tok->wmax - 4) {
398			size_t size = tok->wmax - tok->wspace + WINCR;
399			Char *s = reallocarray(tok->wspace, size, sizeof(*s));
400			if (s == NULL)
401				return -1;
402
403			if (s != tok->wspace) {
404				int i;
405				for (i = 0; i < tok->argc; i++) {
406				    tok->argv[i] =
407					(tok->argv[i] - tok->wspace) + s;
408				}
409				tok->wptr = (tok->wptr - tok->wspace) + s;
410				tok->wstart = (tok->wstart - tok->wspace) + s;
411				tok->wspace = s;
412			}
413			tok->wmax = s + size;
414		}
415		if (tok->argc >= tok->amax - 4) {
416			Char **p;
417			tok->amax += AINCR;
418			p = reallocarray(tok->argv, tok->amax, sizeof(*p));
419			if (p == NULL) {
420				tok->amax -= AINCR;
421				return -1;
422			}
423			tok->argv = p;
424		}
425	}
426 tok_line_outok:
427	if (cc == -1 && co == -1) {
428		cc = tok->argc;
429		co = (int)(tok->wptr - tok->wstart);
430	}
431	if (cursorc != NULL)
432		*cursorc = cc;
433	if (cursoro != NULL)
434		*cursoro = co;
435	FUN(tok,finish)(tok);
436	*argv = (const Char **)tok->argv;
437	*argc = tok->argc;
438	return 0;
439}
440
441/* FUN(tok,str)():
442 *	Simpler version of tok_line, taking a NUL terminated line
443 *	and splitting into words, ignoring cursor state.
444 */
445int
446FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
447    const Char ***argv)
448{
449	TYPE(LineInfo) li;
450
451	memset(&li, 0, sizeof(li));
452	li.buffer = line;
453	li.cursor = li.lastchar = Strchr(line, '\0');
454	return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL);
455}
456