1228072Sbapt/* ccl - routines for character classes */
2228072Sbapt
3228072Sbapt/*  Copyright (c) 1990 The Regents of the University of California. */
4228072Sbapt/*  All rights reserved. */
5228072Sbapt
6228072Sbapt/*  This code is derived from software contributed to Berkeley by */
7228072Sbapt/*  Vern Paxson. */
8228072Sbapt
9228072Sbapt/*  The United States Government has rights in this work pursuant */
10228072Sbapt/*  to contract no. DE-AC03-76SF00098 between the United States */
11228072Sbapt /*  Department of Energy and the University of California. */
12228072Sbapt
13228072Sbapt/*  This file is part of flex. */
14228072Sbapt
15228072Sbapt/*  Redistribution and use in source and binary forms, with or without */
16228072Sbapt/*  modification, are permitted provided that the following conditions */
17228072Sbapt/*  are met: */
18228072Sbapt
19228072Sbapt/*  1. Redistributions of source code must retain the above copyright */
20228072Sbapt/*     notice, this list of conditions and the following disclaimer. */
21228072Sbapt/*  2. Redistributions in binary form must reproduce the above copyright */
22228072Sbapt/*     notice, this list of conditions and the following disclaimer in the */
23228072Sbapt/*     documentation and/or other materials provided with the distribution. */
24228072Sbapt
25228072Sbapt/*  Neither the name of the University nor the names of its contributors */
26228072Sbapt/*  may be used to endorse or promote products derived from this software */
27228072Sbapt/*  without specific prior written permission. */
28228072Sbapt
29228072Sbapt/*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
30228072Sbapt/*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
31228072Sbapt/*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
32228072Sbapt/*  PURPOSE. */
33228072Sbapt
34228072Sbapt#include "flexdef.h"
35228072Sbapt
36228072Sbapt/* return true if the chr is in the ccl. Takes negation into account. */
37228072Sbaptstatic bool
38228072Sbaptccl_contains (const int cclp, const int ch)
39228072Sbapt{
40228072Sbapt	int     ind, len, i;
41228072Sbapt
42228072Sbapt	len = ccllen[cclp];
43228072Sbapt	ind = cclmap[cclp];
44228072Sbapt
45228072Sbapt	for (i = 0; i < len; ++i)
46228072Sbapt		if (ccltbl[ind + i] == ch)
47228072Sbapt			return !cclng[cclp];
48228072Sbapt
49228072Sbapt    return cclng[cclp];
50228072Sbapt}
51228072Sbapt
52228072Sbapt
53228072Sbapt/* ccladd - add a single character to a ccl */
54228072Sbapt
55228072Sbaptvoid    ccladd (cclp, ch)
56228072Sbapt     int     cclp;
57228072Sbapt     int     ch;
58228072Sbapt{
59228072Sbapt	int     ind, len, newpos, i;
60228072Sbapt
61228072Sbapt	check_char (ch);
62228072Sbapt
63228072Sbapt	len = ccllen[cclp];
64228072Sbapt	ind = cclmap[cclp];
65228072Sbapt
66228072Sbapt	/* check to see if the character is already in the ccl */
67228072Sbapt
68228072Sbapt	for (i = 0; i < len; ++i)
69228072Sbapt		if (ccltbl[ind + i] == ch)
70228072Sbapt			return;
71228072Sbapt
72228072Sbapt	/* mark newlines */
73228072Sbapt	if (ch == nlch)
74228072Sbapt		ccl_has_nl[cclp] = true;
75228072Sbapt
76228072Sbapt	newpos = ind + len;
77228072Sbapt
78228072Sbapt	if (newpos >= current_max_ccl_tbl_size) {
79228072Sbapt		current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
80228072Sbapt
81228072Sbapt		++num_reallocs;
82228072Sbapt
83228072Sbapt		ccltbl = reallocate_Character_array (ccltbl,
84228072Sbapt						     current_max_ccl_tbl_size);
85228072Sbapt	}
86228072Sbapt
87228072Sbapt	ccllen[cclp] = len + 1;
88228072Sbapt	ccltbl[newpos] = ch;
89228072Sbapt}
90228072Sbapt
91228072Sbapt/* dump_cclp - same thing as list_character_set, but for cclps.  */
92228072Sbapt
93228072Sbaptstatic void    dump_cclp (FILE* file, int cclp)
94228072Sbapt{
95250874Sjkim	int i;
96228072Sbapt
97228072Sbapt	putc ('[', file);
98228072Sbapt
99228072Sbapt	for (i = 0; i < csize; ++i) {
100228072Sbapt		if (ccl_contains(cclp, i)){
101250874Sjkim			int start_char = i;
102228072Sbapt
103228072Sbapt			putc (' ', file);
104228072Sbapt
105228072Sbapt			fputs (readable_form (i), file);
106228072Sbapt
107228072Sbapt			while (++i < csize && ccl_contains(cclp,i)) ;
108228072Sbapt
109228072Sbapt			if (i - 1 > start_char)
110228072Sbapt				/* this was a run */
111228072Sbapt				fprintf (file, "-%s",
112228072Sbapt					 readable_form (i - 1));
113228072Sbapt
114228072Sbapt			putc (' ', file);
115228072Sbapt		}
116228072Sbapt	}
117228072Sbapt
118228072Sbapt	putc (']', file);
119228072Sbapt}
120228072Sbapt
121228072Sbapt
122228072Sbapt
123228072Sbapt/* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
124228072Sbaptint
125228072Sbaptccl_set_diff (int a, int b)
126228072Sbapt{
127228072Sbapt    int  d, ch;
128228072Sbapt
129228072Sbapt    /* create new class  */
130228072Sbapt    d = cclinit();
131228072Sbapt
132228072Sbapt    /* In order to handle negation, we spin through all possible chars,
133228072Sbapt     * addding each char in a that is not in b.
134228072Sbapt     * (This could be O(n^2), but n is small and bounded.)
135228072Sbapt     */
136228072Sbapt	for ( ch = 0; ch < csize; ++ch )
137228072Sbapt        if (ccl_contains (a, ch) && !ccl_contains(b, ch))
138228072Sbapt            ccladd (d, ch);
139228072Sbapt
140228072Sbapt    /* debug */
141228072Sbapt    if (0){
142228072Sbapt        fprintf(stderr, "ccl_set_diff (");
143228072Sbapt            fprintf(stderr, "\n    ");
144228072Sbapt            dump_cclp (stderr, a);
145228072Sbapt            fprintf(stderr, "\n    ");
146228072Sbapt            dump_cclp (stderr, b);
147228072Sbapt            fprintf(stderr, "\n    ");
148228072Sbapt            dump_cclp (stderr, d);
149228072Sbapt        fprintf(stderr, "\n)\n");
150228072Sbapt    }
151228072Sbapt    return d;
152228072Sbapt}
153228072Sbapt
154228072Sbapt/* ccl_set_union - create a new ccl as the set union of the two given ccls. */
155228072Sbaptint
156228072Sbaptccl_set_union (int a, int b)
157228072Sbapt{
158228072Sbapt    int  d, i;
159228072Sbapt
160228072Sbapt    /* create new class  */
161228072Sbapt    d = cclinit();
162228072Sbapt
163228072Sbapt    /* Add all of a */
164228072Sbapt    for (i = 0; i < ccllen[a]; ++i)
165228072Sbapt		ccladd (d, ccltbl[cclmap[a] + i]);
166228072Sbapt
167228072Sbapt    /* Add all of b */
168228072Sbapt    for (i = 0; i < ccllen[b]; ++i)
169228072Sbapt		ccladd (d, ccltbl[cclmap[b] + i]);
170228072Sbapt
171228072Sbapt    /* debug */
172228072Sbapt    if (0){
173228072Sbapt        fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
174228072Sbapt            fprintf(stderr, "\n    ");
175228072Sbapt            dump_cclp (stderr, a);
176228072Sbapt            fprintf(stderr, "\n    ");
177228072Sbapt            dump_cclp (stderr, b);
178228072Sbapt            fprintf(stderr, "\n    ");
179228072Sbapt            dump_cclp (stderr, d);
180228072Sbapt        fprintf(stderr, "\n)\n");
181228072Sbapt    }
182228072Sbapt    return d;
183228072Sbapt}
184228072Sbapt
185228072Sbapt
186228072Sbapt/* cclinit - return an empty ccl */
187228072Sbapt
188228072Sbaptint     cclinit ()
189228072Sbapt{
190228072Sbapt	if (++lastccl >= current_maxccls) {
191228072Sbapt		current_maxccls += MAX_CCLS_INCREMENT;
192228072Sbapt
193228072Sbapt		++num_reallocs;
194228072Sbapt
195228072Sbapt		cclmap =
196228072Sbapt			reallocate_integer_array (cclmap, current_maxccls);
197228072Sbapt		ccllen =
198228072Sbapt			reallocate_integer_array (ccllen, current_maxccls);
199228072Sbapt		cclng = reallocate_integer_array (cclng, current_maxccls);
200228072Sbapt		ccl_has_nl =
201228072Sbapt			reallocate_bool_array (ccl_has_nl,
202228072Sbapt					       current_maxccls);
203228072Sbapt	}
204228072Sbapt
205228072Sbapt	if (lastccl == 1)
206228072Sbapt		/* we're making the first ccl */
207228072Sbapt		cclmap[lastccl] = 0;
208228072Sbapt
209228072Sbapt	else
210228072Sbapt		/* The new pointer is just past the end of the last ccl.
211228072Sbapt		 * Since the cclmap points to the \first/ character of a
212228072Sbapt		 * ccl, adding the length of the ccl to the cclmap pointer
213228072Sbapt		 * will produce a cursor to the first free space.
214228072Sbapt		 */
215228072Sbapt		cclmap[lastccl] =
216228072Sbapt			cclmap[lastccl - 1] + ccllen[lastccl - 1];
217228072Sbapt
218228072Sbapt	ccllen[lastccl] = 0;
219228072Sbapt	cclng[lastccl] = 0;	/* ccl's start out life un-negated */
220228072Sbapt	ccl_has_nl[lastccl] = false;
221228072Sbapt
222228072Sbapt	return lastccl;
223228072Sbapt}
224228072Sbapt
225228072Sbapt
226228072Sbapt/* cclnegate - negate the given ccl */
227228072Sbapt
228228072Sbaptvoid    cclnegate (cclp)
229228072Sbapt     int     cclp;
230228072Sbapt{
231228072Sbapt	cclng[cclp] = 1;
232228072Sbapt	ccl_has_nl[cclp] = !ccl_has_nl[cclp];
233228072Sbapt}
234228072Sbapt
235228072Sbapt
236228072Sbapt/* list_character_set - list the members of a set of characters in CCL form
237228072Sbapt *
238228072Sbapt * Writes to the given file a character-class representation of those
239228072Sbapt * characters present in the given CCL.  A character is present if it
240228072Sbapt * has a non-zero value in the cset array.
241228072Sbapt */
242228072Sbapt
243228072Sbaptvoid    list_character_set (file, cset)
244228072Sbapt     FILE   *file;
245228072Sbapt     int     cset[];
246228072Sbapt{
247250874Sjkim	int i;
248228072Sbapt
249228072Sbapt	putc ('[', file);
250228072Sbapt
251228072Sbapt	for (i = 0; i < csize; ++i) {
252228072Sbapt		if (cset[i]) {
253250874Sjkim			int start_char = i;
254228072Sbapt
255228072Sbapt			putc (' ', file);
256228072Sbapt
257228072Sbapt			fputs (readable_form (i), file);
258228072Sbapt
259228072Sbapt			while (++i < csize && cset[i]) ;
260228072Sbapt
261228072Sbapt			if (i - 1 > start_char)
262228072Sbapt				/* this was a run */
263228072Sbapt				fprintf (file, "-%s",
264228072Sbapt					 readable_form (i - 1));
265228072Sbapt
266228072Sbapt			putc (' ', file);
267228072Sbapt		}
268228072Sbapt	}
269228072Sbapt
270228072Sbapt	putc (']', file);
271228072Sbapt}
272228072Sbapt
273228072Sbapt/** Determines if the range [c1-c2] is unambiguous in a case-insensitive
274228072Sbapt * scanner.  Specifically, if a lowercase or uppercase character, x, is in the
275228072Sbapt * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
276228072Sbapt * be in the range. If not, then this range is ambiguous, and the function
277228072Sbapt * returns false.  For example, [@-_] spans [a-z] but not [A-Z].  Beware that
278228072Sbapt * [a-z] will be labeled ambiguous because it does not include [A-Z].
279228072Sbapt *
280228072Sbapt * @param c1 the lower end of the range
281228072Sbapt * @param c2 the upper end of the range
282228072Sbapt * @return true if [c1-c2] is not ambiguous for a caseless scanner.
283228072Sbapt */
284228072Sbaptbool range_covers_case (int c1, int c2)
285228072Sbapt{
286228072Sbapt	int     i, o;
287228072Sbapt
288228072Sbapt	for (i = c1; i <= c2; i++) {
289228072Sbapt		if (has_case (i)) {
290228072Sbapt			o = reverse_case (i);
291228072Sbapt			if (o < c1 || c2 < o)
292228072Sbapt				return false;
293228072Sbapt		}
294228072Sbapt	}
295228072Sbapt	return true;
296228072Sbapt}
297228072Sbapt
298228072Sbapt/** Reverse the case of a character, if possible.
299228072Sbapt * @return c if case-reversal does not apply.
300228072Sbapt */
301228072Sbaptint reverse_case (int c)
302228072Sbapt{
303228072Sbapt	return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c);
304228072Sbapt}
305228072Sbapt
306228072Sbapt/** Return true if c is uppercase or lowercase. */
307228072Sbaptbool has_case (int c)
308228072Sbapt{
309228072Sbapt	return (isupper (c) || islower (c)) ? true : false;
310228072Sbapt}
311