1131846Stjr/*-
2131846Stjr * Copyright (c) 2004 Tim J. Robbins.
3131846Stjr * All rights reserved.
4131846Stjr *
5131846Stjr * Redistribution and use in source and binary forms, with or without
6131846Stjr * modification, are permitted provided that the following conditions
7131846Stjr * are met:
8131846Stjr * 1. Redistributions of source code must retain the above copyright
9131846Stjr *    notice, this list of conditions and the following disclaimer.
10131846Stjr * 2. Redistributions in binary form must reproduce the above copyright
11131846Stjr *    notice, this list of conditions and the following disclaimer in the
12131846Stjr *    documentation and/or other materials provided with the distribution.
13131846Stjr *
14131846Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15131846Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16131846Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17131846Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18131846Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19131846Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20131846Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21131846Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22131846Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23131846Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24131846Stjr * SUCH DAMAGE.
25131846Stjr */
26131846Stjr/*
27131846Stjr * "Set of characters" ADT implemented as a splay tree of extents, with
28131846Stjr * a lookup table cache to simplify looking up the first bunch of
29131846Stjr * characters (which are presumably more common than others).
30131846Stjr */
31131846Stjr
32131846Stjr#include <sys/cdefs.h>
33131846Stjr__FBSDID("$FreeBSD$");
34131846Stjr
35131846Stjr#include <assert.h>
36200462Sdelphij#include <stdbool.h>
37131846Stjr#include <stdlib.h>
38200462Sdelphij#include <wchar.h>
39200462Sdelphij#include <wctype.h>
40131846Stjr#include "cset.h"
41131846Stjr
42131846Stjrstatic struct csnode *	cset_delete(struct csnode *, wchar_t);
43131846Stjrstatic __inline int	cset_rangecmp(struct csnode *, wchar_t);
44131846Stjrstatic struct csnode *	cset_splay(struct csnode *, wchar_t);
45131846Stjr
46131846Stjr/*
47131846Stjr * cset_alloc --
48131846Stjr *	Allocate a set of characters.
49131846Stjr */
50131846Stjrstruct cset *
51131846Stjrcset_alloc(void)
52131846Stjr{
53131846Stjr	struct cset *cs;
54131846Stjr
55131846Stjr	if ((cs = malloc(sizeof(*cs))) == NULL)
56131846Stjr		return (NULL);
57131846Stjr	cs->cs_root = NULL;
58131846Stjr	cs->cs_classes = NULL;
59131846Stjr	cs->cs_havecache = false;
60131891Stjr	cs->cs_invert = false;
61131846Stjr	return (cs);
62131846Stjr}
63131846Stjr
64131846Stjr/*
65131846Stjr * cset_add --
66131846Stjr *	Add a character to the set.
67131846Stjr */
68131846Stjrbool
69131846Stjrcset_add(struct cset *cs, wchar_t ch)
70131846Stjr{
71131846Stjr	struct csnode *csn, *ncsn;
72131846Stjr	wchar_t oval;
73131846Stjr
74131846Stjr	cs->cs_havecache = false;
75131846Stjr
76131846Stjr	/*
77131846Stjr	 * Inserting into empty tree; new item becomes the root.
78131846Stjr	 */
79131846Stjr	if (cs->cs_root == NULL) {
80131846Stjr		csn = malloc(sizeof(*cs->cs_root));
81131846Stjr		if (csn == NULL)
82131846Stjr			return (false);
83131846Stjr		csn->csn_left = csn->csn_right = NULL;
84131846Stjr		csn->csn_min = csn->csn_max = ch;
85131846Stjr		cs->cs_root = csn;
86131846Stjr		return (true);
87131846Stjr	}
88131846Stjr
89131846Stjr	/*
90131846Stjr	 * Splay to check whether the item already exists, and otherwise,
91131846Stjr	 * where we should put it.
92131846Stjr	 */
93131846Stjr	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
94131846Stjr
95131846Stjr	/*
96132142Stjr	 * Avoid adding duplicate nodes.
97131846Stjr	 */
98131846Stjr	if (cset_rangecmp(csn, ch) == 0)
99131846Stjr		return (true);
100131846Stjr
101131846Stjr	/*
102132142Stjr	 * Allocate a new node and make it the new root.
103131846Stjr	 */
104131846Stjr	ncsn = malloc(sizeof(*ncsn));
105131846Stjr	if (ncsn == NULL)
106131846Stjr		return (false);
107131846Stjr	ncsn->csn_min = ncsn->csn_max = ch;
108131846Stjr	if (cset_rangecmp(csn, ch) < 0) {
109131846Stjr		ncsn->csn_left = csn->csn_left;
110131846Stjr		ncsn->csn_right = csn;
111131846Stjr		csn->csn_left = NULL;
112131846Stjr	} else {
113131846Stjr		ncsn->csn_right = csn->csn_right;
114131846Stjr		ncsn->csn_left = csn;
115131846Stjr		csn->csn_right = NULL;
116131846Stjr	}
117131846Stjr	cs->cs_root = ncsn;
118131846Stjr
119131846Stjr	/*
120132142Stjr	 * Coalesce with left and right neighbours if possible.
121131846Stjr	 */
122132142Stjr	if (ncsn->csn_left != NULL) {
123132142Stjr		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
124132142Stjr		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
125132142Stjr			oval = ncsn->csn_left->csn_min;
126132142Stjr			ncsn->csn_left = cset_delete(ncsn->csn_left,
127132142Stjr			    ncsn->csn_left->csn_min);
128132142Stjr			ncsn->csn_min = oval;
129132142Stjr		}
130131846Stjr	}
131132142Stjr	if (ncsn->csn_right != NULL) {
132132142Stjr		ncsn->csn_right = cset_splay(ncsn->csn_right,
133132142Stjr		    ncsn->csn_max + 1);
134132142Stjr		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
135132142Stjr			oval = ncsn->csn_right->csn_max;
136132142Stjr			ncsn->csn_right = cset_delete(ncsn->csn_right,
137132142Stjr			    ncsn->csn_right->csn_min);
138132142Stjr			ncsn->csn_max = oval;
139132142Stjr		}
140131846Stjr	}
141131846Stjr
142131846Stjr	return (true);
143131846Stjr}
144131846Stjr
145131846Stjr/*
146131846Stjr * cset_in_hard --
147131846Stjr *	Determine whether a character is in the set without using
148131846Stjr *	the cache.
149131846Stjr */
150131846Stjrbool
151131846Stjrcset_in_hard(struct cset *cs, wchar_t ch)
152131846Stjr{
153131846Stjr	struct csclass *csc;
154131846Stjr
155131846Stjr	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
156226363Sed		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
157131846Stjr			return (cs->cs_invert ^ true);
158131846Stjr	if (cs->cs_root != NULL) {
159131846Stjr		cs->cs_root = cset_splay(cs->cs_root, ch);
160226363Sed		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
161131846Stjr	}
162131846Stjr	return (cs->cs_invert ^ false);
163131846Stjr}
164131846Stjr
165131846Stjr/*
166131846Stjr * cset_cache --
167131846Stjr *	Update the cache.
168131846Stjr */
169131846Stjrvoid
170131846Stjrcset_cache(struct cset *cs)
171131846Stjr{
172131846Stjr	wchar_t i;
173131846Stjr
174131846Stjr	for (i = 0; i < CS_CACHE_SIZE; i++)
175131846Stjr		cs->cs_cache[i] = cset_in_hard(cs, i);
176131846Stjr
177131846Stjr	cs->cs_havecache = true;
178131846Stjr}
179131846Stjr
180131846Stjr/*
181131846Stjr * cset_invert --
182131846Stjr *	Invert the character set.
183131846Stjr */
184131846Stjrvoid
185131846Stjrcset_invert(struct cset *cs)
186131846Stjr{
187131846Stjr
188131846Stjr	cs->cs_invert ^= true;
189131846Stjr	cs->cs_havecache = false;
190131846Stjr}
191131846Stjr
192131846Stjr/*
193131846Stjr * cset_addclass --
194131846Stjr *	Add a wctype()-style character class to the set, optionally
195131846Stjr *	inverting it.
196131846Stjr */
197131846Stjrbool
198131846Stjrcset_addclass(struct cset *cs, wctype_t type, bool invert)
199131846Stjr{
200131846Stjr	struct csclass *csc;
201131846Stjr
202131846Stjr	csc = malloc(sizeof(*csc));
203131846Stjr	if (csc == NULL)
204131846Stjr		return (false);
205131846Stjr	csc->csc_type = type;
206131846Stjr	csc->csc_invert = invert;
207131846Stjr	csc->csc_next = cs->cs_classes;
208131846Stjr	cs->cs_classes = csc;
209131846Stjr	cs->cs_havecache = false;
210131846Stjr	return (true);
211131846Stjr}
212131846Stjr
213131846Stjrstatic __inline int
214131846Stjrcset_rangecmp(struct csnode *t, wchar_t ch)
215131846Stjr{
216131846Stjr
217131846Stjr	if (ch < t->csn_min)
218131846Stjr		return (-1);
219131846Stjr	if (ch > t->csn_max)
220131846Stjr		return (1);
221131846Stjr	return (0);
222131846Stjr}
223131846Stjr
224131846Stjrstatic struct csnode *
225131846Stjrcset_splay(struct csnode *t, wchar_t ch)
226131846Stjr{
227131846Stjr	struct csnode N, *l, *r, *y;
228131846Stjr
229131846Stjr	/*
230131846Stjr	 * Based on public domain code from Sleator.
231131846Stjr	 */
232131846Stjr
233131846Stjr	assert(t != NULL);
234131846Stjr
235131846Stjr	N.csn_left = N.csn_right = NULL;
236131846Stjr	l = r = &N;
237131846Stjr	for (;;) {
238131846Stjr		if (cset_rangecmp(t, ch) < 0) {
239131846Stjr			if (t->csn_left != NULL &&
240131846Stjr			    cset_rangecmp(t->csn_left, ch) < 0) {
241131846Stjr				y = t->csn_left;
242131846Stjr				t->csn_left = y->csn_right;
243131846Stjr				y->csn_right = t;
244131846Stjr				t = y;
245131846Stjr			}
246131846Stjr			if (t->csn_left == NULL)
247131846Stjr				break;
248131846Stjr			r->csn_left = t;
249131846Stjr			r = t;
250131846Stjr			t = t->csn_left;
251131846Stjr		} else if (cset_rangecmp(t, ch) > 0) {
252131846Stjr			if (t->csn_right != NULL &&
253131846Stjr			    cset_rangecmp(t->csn_right, ch) > 0) {
254131846Stjr				y = t->csn_right;
255131846Stjr				t->csn_right = y->csn_left;
256131846Stjr				y->csn_left = t;
257131846Stjr				t = y;
258131846Stjr			}
259131846Stjr			if (t->csn_right == NULL)
260131846Stjr				break;
261131846Stjr			l->csn_right = t;
262131846Stjr			l = t;
263131846Stjr			t = t->csn_right;
264131846Stjr		} else
265131846Stjr			break;
266131846Stjr	}
267131846Stjr	l->csn_right = t->csn_left;
268131846Stjr	r->csn_left = t->csn_right;
269131846Stjr	t->csn_left = N.csn_right;
270131846Stjr	t->csn_right = N.csn_left;
271131846Stjr	return (t);
272131846Stjr}
273131846Stjr
274131846Stjrstatic struct csnode *
275131846Stjrcset_delete(struct csnode *t, wchar_t ch)
276131846Stjr{
277131846Stjr	struct csnode *x;
278131846Stjr
279131846Stjr	assert(t != NULL);
280131846Stjr	t = cset_splay(t, ch);
281131846Stjr	assert(cset_rangecmp(t, ch) == 0);
282131846Stjr	if (t->csn_left == NULL)
283131846Stjr		x = t->csn_right;
284131846Stjr	else {
285131846Stjr		x = cset_splay(t->csn_left, ch);
286131846Stjr		x->csn_right = t->csn_right;
287131846Stjr	}
288131846Stjr	free(t);
289131846Stjr	return x;
290131846Stjr}
291