1/*-
2 * Copyright (c) 2004 Tim J. Robbins.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26/*
27 * "Set of characters" ADT implemented as a splay tree of extents, with
28 * a lookup table cache to simplify looking up the first bunch of
29 * characters (which are presumably more common than others).
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include <assert.h>
36#include <stdbool.h>
37#include <stdlib.h>
38#include <wchar.h>
39#include <wctype.h>
40#include "cset.h"
41
42static struct csnode *	cset_delete(struct csnode *, wchar_t);
43static __inline int	cset_rangecmp(struct csnode *, wchar_t);
44static struct csnode *	cset_splay(struct csnode *, wchar_t);
45
46/*
47 * cset_alloc --
48 *	Allocate a set of characters.
49 */
50struct cset *
51cset_alloc(void)
52{
53	struct cset *cs;
54
55	if ((cs = malloc(sizeof(*cs))) == NULL)
56		return (NULL);
57	cs->cs_root = NULL;
58	cs->cs_classes = NULL;
59	cs->cs_havecache = false;
60	cs->cs_invert = false;
61	return (cs);
62}
63
64/*
65 * cset_add --
66 *	Add a character to the set.
67 */
68bool
69cset_add(struct cset *cs, wchar_t ch)
70{
71	struct csnode *csn, *ncsn;
72	wchar_t oval;
73
74	cs->cs_havecache = false;
75
76	/*
77	 * Inserting into empty tree; new item becomes the root.
78	 */
79	if (cs->cs_root == NULL) {
80		csn = malloc(sizeof(*cs->cs_root));
81		if (csn == NULL)
82			return (false);
83		csn->csn_left = csn->csn_right = NULL;
84		csn->csn_min = csn->csn_max = ch;
85		cs->cs_root = csn;
86		return (true);
87	}
88
89	/*
90	 * Splay to check whether the item already exists, and otherwise,
91	 * where we should put it.
92	 */
93	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
94
95	/*
96	 * Avoid adding duplicate nodes.
97	 */
98	if (cset_rangecmp(csn, ch) == 0)
99		return (true);
100
101	/*
102	 * Allocate a new node and make it the new root.
103	 */
104	ncsn = malloc(sizeof(*ncsn));
105	if (ncsn == NULL)
106		return (false);
107	ncsn->csn_min = ncsn->csn_max = ch;
108	if (cset_rangecmp(csn, ch) < 0) {
109		ncsn->csn_left = csn->csn_left;
110		ncsn->csn_right = csn;
111		csn->csn_left = NULL;
112	} else {
113		ncsn->csn_right = csn->csn_right;
114		ncsn->csn_left = csn;
115		csn->csn_right = NULL;
116	}
117	cs->cs_root = ncsn;
118
119	/*
120	 * Coalesce with left and right neighbours if possible.
121	 */
122	if (ncsn->csn_left != NULL) {
123		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
124		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
125			oval = ncsn->csn_left->csn_min;
126			ncsn->csn_left = cset_delete(ncsn->csn_left,
127			    ncsn->csn_left->csn_min);
128			ncsn->csn_min = oval;
129		}
130	}
131	if (ncsn->csn_right != NULL) {
132		ncsn->csn_right = cset_splay(ncsn->csn_right,
133		    ncsn->csn_max + 1);
134		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
135			oval = ncsn->csn_right->csn_max;
136			ncsn->csn_right = cset_delete(ncsn->csn_right,
137			    ncsn->csn_right->csn_min);
138			ncsn->csn_max = oval;
139		}
140	}
141
142	return (true);
143}
144
145/*
146 * cset_in_hard --
147 *	Determine whether a character is in the set without using
148 *	the cache.
149 */
150bool
151cset_in_hard(struct cset *cs, wchar_t ch)
152{
153	struct csclass *csc;
154
155	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
156		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
157			return (cs->cs_invert ^ true);
158	if (cs->cs_root != NULL) {
159		cs->cs_root = cset_splay(cs->cs_root, ch);
160		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
161	}
162	return (cs->cs_invert ^ false);
163}
164
165/*
166 * cset_cache --
167 *	Update the cache.
168 */
169void
170cset_cache(struct cset *cs)
171{
172	wchar_t i;
173
174	for (i = 0; i < CS_CACHE_SIZE; i++)
175		cs->cs_cache[i] = cset_in_hard(cs, i);
176
177	cs->cs_havecache = true;
178}
179
180/*
181 * cset_invert --
182 *	Invert the character set.
183 */
184void
185cset_invert(struct cset *cs)
186{
187
188	cs->cs_invert ^= true;
189	cs->cs_havecache = false;
190}
191
192/*
193 * cset_addclass --
194 *	Add a wctype()-style character class to the set, optionally
195 *	inverting it.
196 */
197bool
198cset_addclass(struct cset *cs, wctype_t type, bool invert)
199{
200	struct csclass *csc;
201
202	csc = malloc(sizeof(*csc));
203	if (csc == NULL)
204		return (false);
205	csc->csc_type = type;
206	csc->csc_invert = invert;
207	csc->csc_next = cs->cs_classes;
208	cs->cs_classes = csc;
209	cs->cs_havecache = false;
210	return (true);
211}
212
213static __inline int
214cset_rangecmp(struct csnode *t, wchar_t ch)
215{
216
217	if (ch < t->csn_min)
218		return (-1);
219	if (ch > t->csn_max)
220		return (1);
221	return (0);
222}
223
224static struct csnode *
225cset_splay(struct csnode *t, wchar_t ch)
226{
227	struct csnode N, *l, *r, *y;
228
229	/*
230	 * Based on public domain code from Sleator.
231	 */
232
233	assert(t != NULL);
234
235	N.csn_left = N.csn_right = NULL;
236	l = r = &N;
237	for (;;) {
238		if (cset_rangecmp(t, ch) < 0) {
239			if (t->csn_left != NULL &&
240			    cset_rangecmp(t->csn_left, ch) < 0) {
241				y = t->csn_left;
242				t->csn_left = y->csn_right;
243				y->csn_right = t;
244				t = y;
245			}
246			if (t->csn_left == NULL)
247				break;
248			r->csn_left = t;
249			r = t;
250			t = t->csn_left;
251		} else if (cset_rangecmp(t, ch) > 0) {
252			if (t->csn_right != NULL &&
253			    cset_rangecmp(t->csn_right, ch) > 0) {
254				y = t->csn_right;
255				t->csn_right = y->csn_left;
256				y->csn_left = t;
257				t = y;
258			}
259			if (t->csn_right == NULL)
260				break;
261			l->csn_right = t;
262			l = t;
263			t = t->csn_right;
264		} else
265			break;
266	}
267	l->csn_right = t->csn_left;
268	r->csn_left = t->csn_right;
269	t->csn_left = N.csn_right;
270	t->csn_right = N.csn_left;
271	return (t);
272}
273
274static struct csnode *
275cset_delete(struct csnode *t, wchar_t ch)
276{
277	struct csnode *x;
278
279	assert(t != NULL);
280	t = cset_splay(t, ch);
281	assert(cset_rangecmp(t, ch) == 0);
282	if (t->csn_left == NULL)
283		x = t->csn_right;
284	else {
285		x = cset_splay(t->csn_left, ch);
286		x->csn_right = t->csn_right;
287	}
288	free(t);
289	return x;
290}
291