11573Srgrimes/*- 21573Srgrimes * Copyright (c) 1992, 1993 31573Srgrimes * The Regents of the University of California. All rights reserved. 41573Srgrimes * 51573Srgrimes * This code is derived from software contributed to Berkeley by 61573Srgrimes * Christos Zoulas of Cornell University. 71573Srgrimes * 81573Srgrimes * Redistribution and use in source and binary forms, with or without 91573Srgrimes * modification, are permitted provided that the following conditions 101573Srgrimes * are met: 111573Srgrimes * 1. Redistributions of source code must retain the above copyright 121573Srgrimes * notice, this list of conditions and the following disclaimer. 131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141573Srgrimes * notice, this list of conditions and the following disclaimer in the 151573Srgrimes * documentation and/or other materials provided with the distribution. 16148834Sstefanf * 3. Neither the name of the University nor the names of its contributors 171573Srgrimes * may be used to endorse or promote products derived from this software 181573Srgrimes * without specific prior written permission. 191573Srgrimes * 201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301573Srgrimes * SUCH DAMAGE. 3184260Sobrien * 32238624Spfg * $NetBSD: tokenizer.c,v 1.15 2009/02/15 21:55:23 christos Exp $ 331573Srgrimes */ 341573Srgrimes 351573Srgrimes#if !defined(lint) && !defined(SCCSID) 361573Srgrimesstatic char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93"; 371573Srgrimes#endif /* not lint && not SCCSID */ 3884260Sobrien#include <sys/cdefs.h> 3984260Sobrien__FBSDID("$FreeBSD$"); 401573Srgrimes 411573Srgrimes/* 421573Srgrimes * tokenize.c: Bourne shell like tokenizer 431573Srgrimes */ 441573Srgrimes#include "sys.h" 451573Srgrimes#include <string.h> 461573Srgrimes#include <stdlib.h> 47148834Sstefanf#include "histedit.h" 481573Srgrimes 4984260Sobrientypedef enum { 5084260Sobrien Q_none, Q_single, Q_double, Q_one, Q_doubleone 5184260Sobrien} quote_t; 521573Srgrimes 5384260Sobrien#define IFS "\t \n" 541573Srgrimes 5584260Sobrien#define TOK_KEEP 1 5684260Sobrien#define TOK_EAT 2 571573Srgrimes 5884260Sobrien#define WINCR 20 5984260Sobrien#define AINCR 10 601573Srgrimes 61148834Sstefanf#define tok_strdup(a) strdup(a) 6284260Sobrien#define tok_malloc(a) malloc(a) 6384260Sobrien#define tok_free(a) free(a) 6484260Sobrien#define tok_realloc(a, b) realloc(a, b) 651573Srgrimes 661573Srgrimes 671573Srgrimesstruct tokenizer { 6884260Sobrien char *ifs; /* In field separator */ 6984260Sobrien int argc, amax; /* Current and maximum number of args */ 7084260Sobrien char **argv; /* Argument list */ 7184260Sobrien char *wptr, *wmax; /* Space and limit on the word buffer */ 7284260Sobrien char *wstart; /* Beginning of next word */ 7384260Sobrien char *wspace; /* Space of word buffer */ 7484260Sobrien quote_t quote; /* Quoting state */ 7584260Sobrien int flags; /* flags; */ 761573Srgrimes}; 771573Srgrimes 781573Srgrimes 7984260Sobrienprivate void tok_finish(Tokenizer *); 801573Srgrimes 811573Srgrimes 821573Srgrimes/* tok_finish(): 831573Srgrimes * Finish a word in the tokenizer. 841573Srgrimes */ 851573Srgrimesprivate void 8684260Sobrientok_finish(Tokenizer *tok) 871573Srgrimes{ 8884260Sobrien 8984260Sobrien *tok->wptr = '\0'; 9084260Sobrien if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 9184260Sobrien tok->argv[tok->argc++] = tok->wstart; 9284260Sobrien tok->argv[tok->argc] = NULL; 9384260Sobrien tok->wstart = ++tok->wptr; 9484260Sobrien } 9584260Sobrien tok->flags &= ~TOK_KEEP; 961573Srgrimes} 971573Srgrimes 981573Srgrimes 991573Srgrimes/* tok_init(): 1001573Srgrimes * Initialize the tokenizer 1011573Srgrimes */ 1021573Srgrimespublic Tokenizer * 10384260Sobrientok_init(const char *ifs) 1041573Srgrimes{ 10584260Sobrien Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer)); 1061573Srgrimes 107148834Sstefanf if (tok == NULL) 108148834Sstefanf return NULL; 109148834Sstefanf tok->ifs = tok_strdup(ifs ? ifs : IFS); 110148834Sstefanf if (tok->ifs == NULL) { 111148834Sstefanf tok_free((ptr_t)tok); 112148834Sstefanf return NULL; 113148834Sstefanf } 11484260Sobrien tok->argc = 0; 11584260Sobrien tok->amax = AINCR; 11684260Sobrien tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax); 117148834Sstefanf if (tok->argv == NULL) { 118148834Sstefanf tok_free((ptr_t)tok->ifs); 119148834Sstefanf tok_free((ptr_t)tok); 120148834Sstefanf return NULL; 121148834Sstefanf } 12284260Sobrien tok->argv[0] = NULL; 12384260Sobrien tok->wspace = (char *) tok_malloc(WINCR); 124148834Sstefanf if (tok->wspace == NULL) { 125148834Sstefanf tok_free((ptr_t)tok->argv); 126148834Sstefanf tok_free((ptr_t)tok->ifs); 127148834Sstefanf tok_free((ptr_t)tok); 128148834Sstefanf return NULL; 129148834Sstefanf } 13084260Sobrien tok->wmax = tok->wspace + WINCR; 13184260Sobrien tok->wstart = tok->wspace; 13284260Sobrien tok->wptr = tok->wspace; 13384260Sobrien tok->flags = 0; 13484260Sobrien tok->quote = Q_none; 1351573Srgrimes 13684260Sobrien return (tok); 1371573Srgrimes} 1381573Srgrimes 1391573Srgrimes 1401573Srgrimes/* tok_reset(): 1411573Srgrimes * Reset the tokenizer 1421573Srgrimes */ 1431573Srgrimespublic void 14484260Sobrientok_reset(Tokenizer *tok) 1451573Srgrimes{ 14684260Sobrien 14784260Sobrien tok->argc = 0; 14884260Sobrien tok->wstart = tok->wspace; 14984260Sobrien tok->wptr = tok->wspace; 15084260Sobrien tok->flags = 0; 15184260Sobrien tok->quote = Q_none; 1521573Srgrimes} 1531573Srgrimes 1541573Srgrimes 1551573Srgrimes/* tok_end(): 1561573Srgrimes * Clean up 1571573Srgrimes */ 1581573Srgrimespublic void 15984260Sobrientok_end(Tokenizer *tok) 1601573Srgrimes{ 16184260Sobrien 16284260Sobrien tok_free((ptr_t) tok->ifs); 16384260Sobrien tok_free((ptr_t) tok->wspace); 16484260Sobrien tok_free((ptr_t) tok->argv); 16584260Sobrien tok_free((ptr_t) tok); 1661573Srgrimes} 1671573Srgrimes 1681573Srgrimes 1691573Srgrimes 1701573Srgrimes/* tok_line(): 171148834Sstefanf * Bourne shell (sh(1)) like tokenizing 172148834Sstefanf * Arguments: 173148834Sstefanf * tok current tokenizer state (setup with tok_init()) 174148834Sstefanf * line line to parse 175148834Sstefanf * Returns: 176148834Sstefanf * -1 Internal error 177148834Sstefanf * 3 Quoted return 178148834Sstefanf * 2 Unmatched double quote 179148834Sstefanf * 1 Unmatched single quote 180148834Sstefanf * 0 Ok 181148834Sstefanf * Modifies (if return value is 0): 182148834Sstefanf * argc number of arguments 183148834Sstefanf * argv argument array 184148834Sstefanf * cursorc if !NULL, argv element containing cursor 185148834Sstefanf * cursorv if !NULL, offset in argv[cursorc] of cursor 1861573Srgrimes */ 1871573Srgrimespublic int 188148834Sstefanftok_line(Tokenizer *tok, const LineInfo *line, 189148834Sstefanf int *argc, const char ***argv, int *cursorc, int *cursoro) 1901573Srgrimes{ 19184260Sobrien const char *ptr; 192148834Sstefanf int cc, co; 1931573Srgrimes 194148834Sstefanf cc = co = -1; 195148834Sstefanf ptr = line->buffer; 196148834Sstefanf for (ptr = line->buffer; ;ptr++) { 197148834Sstefanf if (ptr >= line->lastchar) 198148834Sstefanf ptr = ""; 199148834Sstefanf if (ptr == line->cursor) { 200148834Sstefanf cc = tok->argc; 201238624Spfg co = (int)(tok->wptr - tok->wstart); 202148834Sstefanf } 203148834Sstefanf switch (*ptr) { 20484260Sobrien case '\'': 20584260Sobrien tok->flags |= TOK_KEEP; 20684260Sobrien tok->flags &= ~TOK_EAT; 20784260Sobrien switch (tok->quote) { 20884260Sobrien case Q_none: 20984260Sobrien tok->quote = Q_single; /* Enter single quote 21084260Sobrien * mode */ 21184260Sobrien break; 2121573Srgrimes 21384260Sobrien case Q_single: /* Exit single quote mode */ 21484260Sobrien tok->quote = Q_none; 21584260Sobrien break; 2161573Srgrimes 21784260Sobrien case Q_one: /* Quote this ' */ 21884260Sobrien tok->quote = Q_none; 21984260Sobrien *tok->wptr++ = *ptr; 22084260Sobrien break; 2211573Srgrimes 22284260Sobrien case Q_double: /* Stay in double quote mode */ 22384260Sobrien *tok->wptr++ = *ptr; 22484260Sobrien break; 2251573Srgrimes 22684260Sobrien case Q_doubleone: /* Quote this ' */ 22784260Sobrien tok->quote = Q_double; 22884260Sobrien *tok->wptr++ = *ptr; 22984260Sobrien break; 2301573Srgrimes 23184260Sobrien default: 23284260Sobrien return (-1); 23384260Sobrien } 23484260Sobrien break; 2351573Srgrimes 23684260Sobrien case '"': 23784260Sobrien tok->flags &= ~TOK_EAT; 23884260Sobrien tok->flags |= TOK_KEEP; 23984260Sobrien switch (tok->quote) { 24084260Sobrien case Q_none: /* Enter double quote mode */ 24184260Sobrien tok->quote = Q_double; 24284260Sobrien break; 2431573Srgrimes 24484260Sobrien case Q_double: /* Exit double quote mode */ 24584260Sobrien tok->quote = Q_none; 24684260Sobrien break; 2471573Srgrimes 24884260Sobrien case Q_one: /* Quote this " */ 24984260Sobrien tok->quote = Q_none; 25084260Sobrien *tok->wptr++ = *ptr; 25184260Sobrien break; 2521573Srgrimes 25384260Sobrien case Q_single: /* Stay in single quote mode */ 25484260Sobrien *tok->wptr++ = *ptr; 25584260Sobrien break; 2561573Srgrimes 25784260Sobrien case Q_doubleone: /* Quote this " */ 25884260Sobrien tok->quote = Q_double; 25984260Sobrien *tok->wptr++ = *ptr; 26084260Sobrien break; 2611573Srgrimes 26284260Sobrien default: 26384260Sobrien return (-1); 26484260Sobrien } 26584260Sobrien break; 2661573Srgrimes 26784260Sobrien case '\\': 26884260Sobrien tok->flags |= TOK_KEEP; 26984260Sobrien tok->flags &= ~TOK_EAT; 27084260Sobrien switch (tok->quote) { 27184260Sobrien case Q_none: /* Quote next character */ 27284260Sobrien tok->quote = Q_one; 27384260Sobrien break; 2741573Srgrimes 27584260Sobrien case Q_double: /* Quote next character */ 27684260Sobrien tok->quote = Q_doubleone; 27784260Sobrien break; 2781573Srgrimes 27984260Sobrien case Q_one: /* Quote this, restore state */ 28084260Sobrien *tok->wptr++ = *ptr; 28184260Sobrien tok->quote = Q_none; 28284260Sobrien break; 2831573Srgrimes 28484260Sobrien case Q_single: /* Stay in single quote mode */ 28584260Sobrien *tok->wptr++ = *ptr; 28684260Sobrien break; 2871573Srgrimes 28884260Sobrien case Q_doubleone: /* Quote this \ */ 28984260Sobrien tok->quote = Q_double; 29084260Sobrien *tok->wptr++ = *ptr; 29184260Sobrien break; 2921573Srgrimes 29384260Sobrien default: 29484260Sobrien return (-1); 29584260Sobrien } 29684260Sobrien break; 2971573Srgrimes 29884260Sobrien case '\n': 29984260Sobrien tok->flags &= ~TOK_EAT; 30084260Sobrien switch (tok->quote) { 30184260Sobrien case Q_none: 302148834Sstefanf goto tok_line_outok; 3031573Srgrimes 30484260Sobrien case Q_single: 30584260Sobrien case Q_double: 30684260Sobrien *tok->wptr++ = *ptr; /* Add the return */ 30784260Sobrien break; 3088870Srgrimes 30984260Sobrien case Q_doubleone: /* Back to double, eat the '\n' */ 31084260Sobrien tok->flags |= TOK_EAT; 31184260Sobrien tok->quote = Q_double; 31284260Sobrien break; 3131573Srgrimes 31484260Sobrien case Q_one: /* No quote, more eat the '\n' */ 31584260Sobrien tok->flags |= TOK_EAT; 31684260Sobrien tok->quote = Q_none; 31784260Sobrien break; 3181573Srgrimes 31984260Sobrien default: 32084260Sobrien return (0); 32184260Sobrien } 32284260Sobrien break; 3231573Srgrimes 32484260Sobrien case '\0': 32584260Sobrien switch (tok->quote) { 32684260Sobrien case Q_none: 32784260Sobrien /* Finish word and return */ 32884260Sobrien if (tok->flags & TOK_EAT) { 32984260Sobrien tok->flags &= ~TOK_EAT; 33084260Sobrien return (3); 33184260Sobrien } 332148834Sstefanf goto tok_line_outok; 3331573Srgrimes 33484260Sobrien case Q_single: 33584260Sobrien return (1); 3361573Srgrimes 33784260Sobrien case Q_double: 33884260Sobrien return (2); 3391573Srgrimes 34084260Sobrien case Q_doubleone: 34184260Sobrien tok->quote = Q_double; 34284260Sobrien *tok->wptr++ = *ptr; 34384260Sobrien break; 3441573Srgrimes 34584260Sobrien case Q_one: 34684260Sobrien tok->quote = Q_none; 34784260Sobrien *tok->wptr++ = *ptr; 34884260Sobrien break; 3491573Srgrimes 35084260Sobrien default: 35184260Sobrien return (-1); 35284260Sobrien } 35384260Sobrien break; 3541573Srgrimes 35584260Sobrien default: 35684260Sobrien tok->flags &= ~TOK_EAT; 35784260Sobrien switch (tok->quote) { 35884260Sobrien case Q_none: 35984260Sobrien if (strchr(tok->ifs, *ptr) != NULL) 36084260Sobrien tok_finish(tok); 36184260Sobrien else 36284260Sobrien *tok->wptr++ = *ptr; 36384260Sobrien break; 3641573Srgrimes 36584260Sobrien case Q_single: 36684260Sobrien case Q_double: 36784260Sobrien *tok->wptr++ = *ptr; 36884260Sobrien break; 3691573Srgrimes 3701573Srgrimes 37184260Sobrien case Q_doubleone: 37284260Sobrien *tok->wptr++ = '\\'; 37384260Sobrien tok->quote = Q_double; 37484260Sobrien *tok->wptr++ = *ptr; 37584260Sobrien break; 3761573Srgrimes 37784260Sobrien case Q_one: 37884260Sobrien tok->quote = Q_none; 37984260Sobrien *tok->wptr++ = *ptr; 38084260Sobrien break; 3811573Srgrimes 38284260Sobrien default: 38384260Sobrien return (-1); 3841573Srgrimes 38584260Sobrien } 38684260Sobrien break; 38784260Sobrien } 3881573Srgrimes 38984260Sobrien if (tok->wptr >= tok->wmax - 4) { 39084260Sobrien size_t size = tok->wmax - tok->wspace + WINCR; 39184260Sobrien char *s = (char *) tok_realloc(tok->wspace, size); 39284260Sobrien if (s == NULL) 39384260Sobrien return (-1); 3941573Srgrimes 395148834Sstefanf if (s != tok->wspace) { 39684260Sobrien int i; 397148834Sstefanf for (i = 0; i < tok->argc; i++) { 398148834Sstefanf tok->argv[i] = 399148834Sstefanf (tok->argv[i] - tok->wspace) + s; 400148834Sstefanf } 401148834Sstefanf tok->wptr = (tok->wptr - tok->wspace) + s; 402148834Sstefanf tok->wstart = (tok->wstart - tok->wspace) + s; 40384260Sobrien tok->wspace = s; 40484260Sobrien } 405148834Sstefanf tok->wmax = s + size; 40684260Sobrien } 40784260Sobrien if (tok->argc >= tok->amax - 4) { 40884260Sobrien char **p; 40984260Sobrien tok->amax += AINCR; 410148814Sstefanf p = (char **) tok_realloc(tok->argv, 41184260Sobrien tok->amax * sizeof(char *)); 41284260Sobrien if (p == NULL) 41384260Sobrien return (-1); 41484260Sobrien tok->argv = p; 41584260Sobrien } 4161573Srgrimes } 417148834Sstefanf tok_line_outok: 418148834Sstefanf if (cc == -1 && co == -1) { 419148834Sstefanf cc = tok->argc; 420238624Spfg co = (int)(tok->wptr - tok->wstart); 421148834Sstefanf } 422148834Sstefanf if (cursorc != NULL) 423148834Sstefanf *cursorc = cc; 424148834Sstefanf if (cursoro != NULL) 425148834Sstefanf *cursoro = co; 426148834Sstefanf tok_finish(tok); 427148834Sstefanf *argv = (const char **)tok->argv; 428148834Sstefanf *argc = tok->argc; 429148834Sstefanf return (0); 4301573Srgrimes} 431148834Sstefanf 432148834Sstefanf/* tok_str(): 433148834Sstefanf * Simpler version of tok_line, taking a NUL terminated line 434148834Sstefanf * and splitting into words, ignoring cursor state. 435148834Sstefanf */ 436148834Sstefanfpublic int 437148834Sstefanftok_str(Tokenizer *tok, const char *line, int *argc, const char ***argv) 438148834Sstefanf{ 439148834Sstefanf LineInfo li; 440148834Sstefanf 441148834Sstefanf memset(&li, 0, sizeof(li)); 442148834Sstefanf li.buffer = line; 443148834Sstefanf li.cursor = li.lastchar = strchr(line, '\0'); 444148834Sstefanf return (tok_line(tok, &li, argc, argv, NULL, NULL)); 445148834Sstefanf} 446