radix.c revision 178167
1/*-
2 * Copyright (c) 1988, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)radix.c	8.5 (Berkeley) 5/19/95
30 * $FreeBSD: head/sys/net/radix.c 178167 2008-04-13 05:45:14Z qingli $
31 */
32
33/*
34 * Routines to build and maintain radix trees for routing lookups.
35 */
36#ifndef _RADIX_H_
37#include <sys/param.h>
38#ifdef	_KERNEL
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/systm.h>
42#include <sys/malloc.h>
43#include <sys/domain.h>
44#else
45#include <stdlib.h>
46#endif
47#include <sys/syslog.h>
48#include <net/radix.h>
49#endif
50
51#include "opt_mpath.h"
52
53#ifdef RADIX_MPATH
54#include <net/radix_mpath.h>
55#endif
56
57
58static int	rn_walktree_from(struct radix_node_head *h, void *a, void *m,
59		    walktree_f_t *f, void *w);
60static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
61static struct radix_node
62	 *rn_insert(void *, struct radix_node_head *, int *,
63	     struct radix_node [2]),
64	 *rn_newpair(void *, int, struct radix_node[2]),
65	 *rn_search(void *, struct radix_node *),
66	 *rn_search_m(void *, struct radix_node *, void *);
67
68static int	max_keylen;
69static struct radix_mask *rn_mkfreelist;
70static struct radix_node_head *mask_rnhead;
71/*
72 * Work area -- the following point to 3 buffers of size max_keylen,
73 * allocated in this order in a block of memory malloc'ed by rn_init.
74 */
75static char *rn_zeros, *rn_ones, *addmask_key;
76
77#define MKGet(m) {						\
78	if (rn_mkfreelist) {					\
79		m = rn_mkfreelist;				\
80		rn_mkfreelist = (m)->rm_mklist;			\
81	} else							\
82		R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
83
84#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
85
86#define rn_masktop (mask_rnhead->rnh_treetop)
87
88static int	rn_lexobetter(void *m_arg, void *n_arg);
89static struct radix_mask *
90		rn_new_radix_mask(struct radix_node *tt,
91		    struct radix_mask *next);
92static int	rn_satisfies_leaf(char *trial, struct radix_node *leaf,
93		    int skip);
94
95/*
96 * The data structure for the keys is a radix tree with one way
97 * branching removed.  The index rn_bit at an internal node n represents a bit
98 * position to be tested.  The tree is arranged so that all descendants
99 * of a node n have keys whose bits all agree up to position rn_bit - 1.
100 * (We say the index of n is rn_bit.)
101 *
102 * There is at least one descendant which has a one bit at position rn_bit,
103 * and at least one with a zero there.
104 *
105 * A route is determined by a pair of key and mask.  We require that the
106 * bit-wise logical and of the key and mask to be the key.
107 * We define the index of a route to associated with the mask to be
108 * the first bit number in the mask where 0 occurs (with bit number 0
109 * representing the highest order bit).
110 *
111 * We say a mask is normal if every bit is 0, past the index of the mask.
112 * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
113 * and m is a normal mask, then the route applies to every descendant of n.
114 * If the index(m) < rn_bit, this implies the trailing last few bits of k
115 * before bit b are all 0, (and hence consequently true of every descendant
116 * of n), so the route applies to all descendants of the node as well.
117 *
118 * Similar logic shows that a non-normal mask m such that
119 * index(m) <= index(n) could potentially apply to many children of n.
120 * Thus, for each non-host route, we attach its mask to a list at an internal
121 * node as high in the tree as we can go.
122 *
123 * The present version of the code makes use of normal routes in short-
124 * circuiting an explict mask and compare operation when testing whether
125 * a key satisfies a normal route, and also in remembering the unique leaf
126 * that governs a subtree.
127 */
128
129/*
130 * Most of the functions in this code assume that the key/mask arguments
131 * are sockaddr-like structures, where the first byte is an u_char
132 * indicating the size of the entire structure.
133 *
134 * To make the assumption more explicit, we use the LEN() macro to access
135 * this field. It is safe to pass an expression with side effects
136 * to LEN() as the argument is evaluated only once.
137 */
138#define LEN(x) (*(const u_char *)(x))
139
140/*
141 * XXX THIS NEEDS TO BE FIXED
142 * In the code, pointers to keys and masks are passed as either
143 * 'void *' (because callers use to pass pointers of various kinds), or
144 * 'caddr_t' (which is fine for pointer arithmetics, but not very
145 * clean when you dereference it to access data). Furthermore, caddr_t
146 * is really 'char *', while the natural type to operate on keys and
147 * masks would be 'u_char'. This mismatch require a lot of casts and
148 * intermediate variables to adapt types that clutter the code.
149 */
150
151/*
152 * Search a node in the tree matching the key.
153 */
154static struct radix_node *
155rn_search(v_arg, head)
156	void *v_arg;
157	struct radix_node *head;
158{
159	register struct radix_node *x;
160	register caddr_t v;
161
162	for (x = head, v = v_arg; x->rn_bit >= 0;) {
163		if (x->rn_bmask & v[x->rn_offset])
164			x = x->rn_right;
165		else
166			x = x->rn_left;
167	}
168	return (x);
169}
170
171/*
172 * Same as above, but with an additional mask.
173 * XXX note this function is used only once.
174 */
175static struct radix_node *
176rn_search_m(v_arg, head, m_arg)
177	struct radix_node *head;
178	void *v_arg, *m_arg;
179{
180	register struct radix_node *x;
181	register caddr_t v = v_arg, m = m_arg;
182
183	for (x = head; x->rn_bit >= 0;) {
184		if ((x->rn_bmask & m[x->rn_offset]) &&
185		    (x->rn_bmask & v[x->rn_offset]))
186			x = x->rn_right;
187		else
188			x = x->rn_left;
189	}
190	return x;
191}
192
193int
194rn_refines(m_arg, n_arg)
195	void *m_arg, *n_arg;
196{
197	register caddr_t m = m_arg, n = n_arg;
198	register caddr_t lim, lim2 = lim = n + LEN(n);
199	int longer = LEN(n++) - (int)LEN(m++);
200	int masks_are_equal = 1;
201
202	if (longer > 0)
203		lim -= longer;
204	while (n < lim) {
205		if (*n & ~(*m))
206			return 0;
207		if (*n++ != *m++)
208			masks_are_equal = 0;
209	}
210	while (n < lim2)
211		if (*n++)
212			return 0;
213	if (masks_are_equal && (longer < 0))
214		for (lim2 = m - longer; m < lim2; )
215			if (*m++)
216				return 1;
217	return (!masks_are_equal);
218}
219
220struct radix_node *
221rn_lookup(v_arg, m_arg, head)
222	void *v_arg, *m_arg;
223	struct radix_node_head *head;
224{
225	register struct radix_node *x;
226	caddr_t netmask = 0;
227
228	if (m_arg) {
229		x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
230		if (x == 0)
231			return (0);
232		netmask = x->rn_key;
233	}
234	x = rn_match(v_arg, head);
235	if (x && netmask) {
236		while (x && x->rn_mask != netmask)
237			x = x->rn_dupedkey;
238	}
239	return x;
240}
241
242static int
243rn_satisfies_leaf(trial, leaf, skip)
244	char *trial;
245	register struct radix_node *leaf;
246	int skip;
247{
248	register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
249	char *cplim;
250	int length = min(LEN(cp), LEN(cp2));
251
252	if (cp3 == 0)
253		cp3 = rn_ones;
254	else
255		length = min(length, *(u_char *)cp3);
256	cplim = cp + length; cp3 += skip; cp2 += skip;
257	for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
258		if ((*cp ^ *cp2) & *cp3)
259			return 0;
260	return 1;
261}
262
263struct radix_node *
264rn_match(v_arg, head)
265	void *v_arg;
266	struct radix_node_head *head;
267{
268	caddr_t v = v_arg;
269	register struct radix_node *t = head->rnh_treetop, *x;
270	register caddr_t cp = v, cp2;
271	caddr_t cplim;
272	struct radix_node *saved_t, *top = t;
273	int off = t->rn_offset, vlen = LEN(cp), matched_off;
274	register int test, b, rn_bit;
275
276	/*
277	 * Open code rn_search(v, top) to avoid overhead of extra
278	 * subroutine call.
279	 */
280	for (; t->rn_bit >= 0; ) {
281		if (t->rn_bmask & cp[t->rn_offset])
282			t = t->rn_right;
283		else
284			t = t->rn_left;
285	}
286	/*
287	 * See if we match exactly as a host destination
288	 * or at least learn how many bits match, for normal mask finesse.
289	 *
290	 * It doesn't hurt us to limit how many bytes to check
291	 * to the length of the mask, since if it matches we had a genuine
292	 * match and the leaf we have is the most specific one anyway;
293	 * if it didn't match with a shorter length it would fail
294	 * with a long one.  This wins big for class B&C netmasks which
295	 * are probably the most common case...
296	 */
297	if (t->rn_mask)
298		vlen = *(u_char *)t->rn_mask;
299	cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
300	for (; cp < cplim; cp++, cp2++)
301		if (*cp != *cp2)
302			goto on1;
303	/*
304	 * This extra grot is in case we are explicitly asked
305	 * to look up the default.  Ugh!
306	 *
307	 * Never return the root node itself, it seems to cause a
308	 * lot of confusion.
309	 */
310	if (t->rn_flags & RNF_ROOT)
311		t = t->rn_dupedkey;
312	return t;
313on1:
314	test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
315	for (b = 7; (test >>= 1) > 0;)
316		b--;
317	matched_off = cp - v;
318	b += matched_off << 3;
319	rn_bit = -1 - b;
320	/*
321	 * If there is a host route in a duped-key chain, it will be first.
322	 */
323	if ((saved_t = t)->rn_mask == 0)
324		t = t->rn_dupedkey;
325	for (; t; t = t->rn_dupedkey)
326		/*
327		 * Even if we don't match exactly as a host,
328		 * we may match if the leaf we wound up at is
329		 * a route to a net.
330		 */
331		if (t->rn_flags & RNF_NORMAL) {
332			if (rn_bit <= t->rn_bit)
333				return t;
334		} else if (rn_satisfies_leaf(v, t, matched_off))
335				return t;
336	t = saved_t;
337	/* start searching up the tree */
338	do {
339		register struct radix_mask *m;
340		t = t->rn_parent;
341		m = t->rn_mklist;
342		/*
343		 * If non-contiguous masks ever become important
344		 * we can restore the masking and open coding of
345		 * the search and satisfaction test and put the
346		 * calculation of "off" back before the "do".
347		 */
348		while (m) {
349			if (m->rm_flags & RNF_NORMAL) {
350				if (rn_bit <= m->rm_bit)
351					return (m->rm_leaf);
352			} else {
353				off = min(t->rn_offset, matched_off);
354				x = rn_search_m(v, t, m->rm_mask);
355				while (x && x->rn_mask != m->rm_mask)
356					x = x->rn_dupedkey;
357				if (x && rn_satisfies_leaf(v, x, off))
358					return x;
359			}
360			m = m->rm_mklist;
361		}
362	} while (t != top);
363	return 0;
364}
365
366#ifdef RN_DEBUG
367int	rn_nodenum;
368struct	radix_node *rn_clist;
369int	rn_saveinfo;
370int	rn_debug =  1;
371#endif
372
373/*
374 * Whenever we add a new leaf to the tree, we also add a parent node,
375 * so we allocate them as an array of two elements: the first one must be
376 * the leaf (see RNTORT() in route.c), the second one is the parent.
377 * This routine initializes the relevant fields of the nodes, so that
378 * the leaf is the left child of the parent node, and both nodes have
379 * (almost) all all fields filled as appropriate.
380 * (XXX some fields are left unset, see the '#if 0' section).
381 * The function returns a pointer to the parent node.
382 */
383
384static struct radix_node *
385rn_newpair(v, b, nodes)
386	void *v;
387	int b;
388	struct radix_node nodes[2];
389{
390	register struct radix_node *tt = nodes, *t = tt + 1;
391	t->rn_bit = b;
392	t->rn_bmask = 0x80 >> (b & 7);
393	t->rn_left = tt;
394	t->rn_offset = b >> 3;
395
396#if 0  /* XXX perhaps we should fill these fields as well. */
397	t->rn_parent = t->rn_right = NULL;
398
399	tt->rn_mask = NULL;
400	tt->rn_dupedkey = NULL;
401	tt->rn_bmask = 0;
402#endif
403	tt->rn_bit = -1;
404	tt->rn_key = (caddr_t)v;
405	tt->rn_parent = t;
406	tt->rn_flags = t->rn_flags = RNF_ACTIVE;
407	tt->rn_mklist = t->rn_mklist = 0;
408#ifdef RN_DEBUG
409	tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
410	tt->rn_twin = t;
411	tt->rn_ybro = rn_clist;
412	rn_clist = tt;
413#endif
414	return t;
415}
416
417static struct radix_node *
418rn_insert(v_arg, head, dupentry, nodes)
419	void *v_arg;
420	struct radix_node_head *head;
421	int *dupentry;
422	struct radix_node nodes[2];
423{
424	caddr_t v = v_arg;
425	struct radix_node *top = head->rnh_treetop;
426	int head_off = top->rn_offset, vlen = (int)LEN(v);
427	register struct radix_node *t = rn_search(v_arg, top);
428	register caddr_t cp = v + head_off;
429	register int b;
430	struct radix_node *tt;
431    	/*
432	 * Find first bit at which v and t->rn_key differ
433	 */
434    {
435	register caddr_t cp2 = t->rn_key + head_off;
436	register int cmp_res;
437	caddr_t cplim = v + vlen;
438
439	while (cp < cplim)
440		if (*cp2++ != *cp++)
441			goto on1;
442	*dupentry = 1;
443	return t;
444on1:
445	*dupentry = 0;
446	cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
447	for (b = (cp - v) << 3; cmp_res; b--)
448		cmp_res >>= 1;
449    }
450    {
451	register struct radix_node *p, *x = top;
452	cp = v;
453	do {
454		p = x;
455		if (cp[x->rn_offset] & x->rn_bmask)
456			x = x->rn_right;
457		else
458			x = x->rn_left;
459	} while (b > (unsigned) x->rn_bit);
460				/* x->rn_bit < b && x->rn_bit >= 0 */
461#ifdef RN_DEBUG
462	if (rn_debug)
463		log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p);
464#endif
465	t = rn_newpair(v_arg, b, nodes);
466	tt = t->rn_left;
467	if ((cp[p->rn_offset] & p->rn_bmask) == 0)
468		p->rn_left = t;
469	else
470		p->rn_right = t;
471	x->rn_parent = t;
472	t->rn_parent = p; /* frees x, p as temp vars below */
473	if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
474		t->rn_right = x;
475	} else {
476		t->rn_right = tt;
477		t->rn_left = x;
478	}
479#ifdef RN_DEBUG
480	if (rn_debug)
481		log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p);
482#endif
483    }
484	return (tt);
485}
486
487struct radix_node *
488rn_addmask(n_arg, search, skip)
489	int search, skip;
490	void *n_arg;
491{
492	caddr_t netmask = (caddr_t)n_arg;
493	register struct radix_node *x;
494	register caddr_t cp, cplim;
495	register int b = 0, mlen, j;
496	int maskduplicated, m0, isnormal;
497	struct radix_node *saved_x;
498	static int last_zeroed = 0;
499
500	if ((mlen = LEN(netmask)) > max_keylen)
501		mlen = max_keylen;
502	if (skip == 0)
503		skip = 1;
504	if (mlen <= skip)
505		return (mask_rnhead->rnh_nodes);
506	if (skip > 1)
507		bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
508	if ((m0 = mlen) > skip)
509		bcopy(netmask + skip, addmask_key + skip, mlen - skip);
510	/*
511	 * Trim trailing zeroes.
512	 */
513	for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
514		cp--;
515	mlen = cp - addmask_key;
516	if (mlen <= skip) {
517		if (m0 >= last_zeroed)
518			last_zeroed = mlen;
519		return (mask_rnhead->rnh_nodes);
520	}
521	if (m0 < last_zeroed)
522		bzero(addmask_key + m0, last_zeroed - m0);
523	*addmask_key = last_zeroed = mlen;
524	x = rn_search(addmask_key, rn_masktop);
525	if (bcmp(addmask_key, x->rn_key, mlen) != 0)
526		x = 0;
527	if (x || search)
528		return (x);
529	R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
530	if ((saved_x = x) == 0)
531		return (0);
532	netmask = cp = (caddr_t)(x + 2);
533	bcopy(addmask_key, cp, mlen);
534	x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
535	if (maskduplicated) {
536		log(LOG_ERR, "rn_addmask: mask impossibly already in tree");
537		Free(saved_x);
538		return (x);
539	}
540	/*
541	 * Calculate index of mask, and check for normalcy.
542	 * First find the first byte with a 0 bit, then if there are
543	 * more bits left (remember we already trimmed the trailing 0's),
544	 * the pattern must be one of those in normal_chars[], or we have
545	 * a non-contiguous mask.
546	 */
547	cplim = netmask + mlen;
548	isnormal = 1;
549	for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;)
550		cp++;
551	if (cp != cplim) {
552		static char normal_chars[] = {
553			0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
554
555		for (j = 0x80; (j & *cp) != 0; j >>= 1)
556			b++;
557		if (*cp != normal_chars[b] || cp != (cplim - 1))
558			isnormal = 0;
559	}
560	b += (cp - netmask) << 3;
561	x->rn_bit = -1 - b;
562	if (isnormal)
563		x->rn_flags |= RNF_NORMAL;
564	return (x);
565}
566
567static int	/* XXX: arbitrary ordering for non-contiguous masks */
568rn_lexobetter(m_arg, n_arg)
569	void *m_arg, *n_arg;
570{
571	register u_char *mp = m_arg, *np = n_arg, *lim;
572
573	if (LEN(mp) > LEN(np))
574		return 1;  /* not really, but need to check longer one first */
575	if (LEN(mp) == LEN(np))
576		for (lim = mp + LEN(mp); mp < lim;)
577			if (*mp++ > *np++)
578				return 1;
579	return 0;
580}
581
582static struct radix_mask *
583rn_new_radix_mask(tt, next)
584	register struct radix_node *tt;
585	register struct radix_mask *next;
586{
587	register struct radix_mask *m;
588
589	MKGet(m);
590	if (m == 0) {
591		log(LOG_ERR, "Mask for route not entered\n");
592		return (0);
593	}
594	bzero(m, sizeof *m);
595	m->rm_bit = tt->rn_bit;
596	m->rm_flags = tt->rn_flags;
597	if (tt->rn_flags & RNF_NORMAL)
598		m->rm_leaf = tt;
599	else
600		m->rm_mask = tt->rn_mask;
601	m->rm_mklist = next;
602	tt->rn_mklist = m;
603	return m;
604}
605
606struct radix_node *
607rn_addroute(v_arg, n_arg, head, treenodes)
608	void *v_arg, *n_arg;
609	struct radix_node_head *head;
610	struct radix_node treenodes[2];
611{
612	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
613	register struct radix_node *t, *x = 0, *tt;
614	struct radix_node *saved_tt, *top = head->rnh_treetop;
615	short b = 0, b_leaf = 0;
616	int keyduplicated;
617	caddr_t mmask;
618	struct radix_mask *m, **mp;
619
620	/*
621	 * In dealing with non-contiguous masks, there may be
622	 * many different routes which have the same mask.
623	 * We will find it useful to have a unique pointer to
624	 * the mask to speed avoiding duplicate references at
625	 * nodes and possibly save time in calculating indices.
626	 */
627	if (netmask)  {
628		if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
629			return (0);
630		b_leaf = x->rn_bit;
631		b = -1 - x->rn_bit;
632		netmask = x->rn_key;
633	}
634	/*
635	 * Deal with duplicated keys: attach node to previous instance
636	 */
637	saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
638	if (keyduplicated) {
639		for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
640#ifdef RADIX_MPATH
641			/* permit multipath, if enabled for the family */
642			if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
643				/*
644				 * go down to the end of multipaths, so that
645				 * new entry goes into the end of rn_dupedkey
646				 * chain.
647				 */
648				do {
649					t = tt;
650					tt = tt->rn_dupedkey;
651				} while (tt && t->rn_mask == tt->rn_mask);
652				break;
653			}
654#endif
655			if (tt->rn_mask == netmask)
656				return (0);
657			if (netmask == 0 ||
658			    (tt->rn_mask &&
659			     ((b_leaf < tt->rn_bit) /* index(netmask) > node */
660			      || rn_refines(netmask, tt->rn_mask)
661			      || rn_lexobetter(netmask, tt->rn_mask))))
662				break;
663		}
664		/*
665		 * If the mask is not duplicated, we wouldn't
666		 * find it among possible duplicate key entries
667		 * anyway, so the above test doesn't hurt.
668		 *
669		 * We sort the masks for a duplicated key the same way as
670		 * in a masklist -- most specific to least specific.
671		 * This may require the unfortunate nuisance of relocating
672		 * the head of the list.
673		 *
674		 * We also reverse, or doubly link the list through the
675		 * parent pointer.
676		 */
677		if (tt == saved_tt) {
678			struct	radix_node *xx = x;
679			/* link in at head of list */
680			(tt = treenodes)->rn_dupedkey = t;
681			tt->rn_flags = t->rn_flags;
682			tt->rn_parent = x = t->rn_parent;
683			t->rn_parent = tt;	 		/* parent */
684			if (x->rn_left == t)
685				x->rn_left = tt;
686			else
687				x->rn_right = tt;
688			saved_tt = tt; x = xx;
689		} else {
690			(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
691			t->rn_dupedkey = tt;
692			tt->rn_parent = t;			/* parent */
693			if (tt->rn_dupedkey)			/* parent */
694				tt->rn_dupedkey->rn_parent = tt; /* parent */
695		}
696#ifdef RN_DEBUG
697		t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++;
698		tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt;
699#endif
700		tt->rn_key = (caddr_t) v;
701		tt->rn_bit = -1;
702		tt->rn_flags = RNF_ACTIVE;
703	}
704	/*
705	 * Put mask in tree.
706	 */
707	if (netmask) {
708		tt->rn_mask = netmask;
709		tt->rn_bit = x->rn_bit;
710		tt->rn_flags |= x->rn_flags & RNF_NORMAL;
711	}
712	t = saved_tt->rn_parent;
713	if (keyduplicated)
714		goto on2;
715	b_leaf = -1 - t->rn_bit;
716	if (t->rn_right == saved_tt)
717		x = t->rn_left;
718	else
719		x = t->rn_right;
720	/* Promote general routes from below */
721	if (x->rn_bit < 0) {
722	    for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
723		if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
724			*mp = m = rn_new_radix_mask(x, 0);
725			if (m)
726				mp = &m->rm_mklist;
727		}
728	} else if (x->rn_mklist) {
729		/*
730		 * Skip over masks whose index is > that of new node
731		 */
732		for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
733			if (m->rm_bit >= b_leaf)
734				break;
735		t->rn_mklist = m; *mp = 0;
736	}
737on2:
738	/* Add new route to highest possible ancestor's list */
739	if ((netmask == 0) || (b > t->rn_bit ))
740		return tt; /* can't lift at all */
741	b_leaf = tt->rn_bit;
742	do {
743		x = t;
744		t = t->rn_parent;
745	} while (b <= t->rn_bit && x != top);
746	/*
747	 * Search through routes associated with node to
748	 * insert new route according to index.
749	 * Need same criteria as when sorting dupedkeys to avoid
750	 * double loop on deletion.
751	 */
752	for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
753		if (m->rm_bit < b_leaf)
754			continue;
755		if (m->rm_bit > b_leaf)
756			break;
757		if (m->rm_flags & RNF_NORMAL) {
758			mmask = m->rm_leaf->rn_mask;
759			if (tt->rn_flags & RNF_NORMAL) {
760			    log(LOG_ERR,
761			        "Non-unique normal route, mask not entered\n");
762				return tt;
763			}
764		} else
765			mmask = m->rm_mask;
766		if (mmask == netmask) {
767			m->rm_refs++;
768			tt->rn_mklist = m;
769			return tt;
770		}
771		if (rn_refines(netmask, mmask)
772		    || rn_lexobetter(netmask, mmask))
773			break;
774	}
775	*mp = rn_new_radix_mask(tt, *mp);
776	return tt;
777}
778
779struct radix_node *
780rn_delete(v_arg, netmask_arg, head)
781	void *v_arg, *netmask_arg;
782	struct radix_node_head *head;
783{
784	register struct radix_node *t, *p, *x, *tt;
785	struct radix_mask *m, *saved_m, **mp;
786	struct radix_node *dupedkey, *saved_tt, *top;
787	caddr_t v, netmask;
788	int b, head_off, vlen;
789
790	v = v_arg;
791	netmask = netmask_arg;
792	x = head->rnh_treetop;
793	tt = rn_search(v, x);
794	head_off = x->rn_offset;
795	vlen =  LEN(v);
796	saved_tt = tt;
797	top = x;
798	if (tt == 0 ||
799	    bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
800		return (0);
801	/*
802	 * Delete our route from mask lists.
803	 */
804	if (netmask) {
805		if ((x = rn_addmask(netmask, 1, head_off)) == 0)
806			return (0);
807		netmask = x->rn_key;
808		while (tt->rn_mask != netmask)
809			if ((tt = tt->rn_dupedkey) == 0)
810				return (0);
811	}
812	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
813		goto on1;
814	if (tt->rn_flags & RNF_NORMAL) {
815		if (m->rm_leaf != tt || m->rm_refs > 0) {
816			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
817			return 0;  /* dangling ref could cause disaster */
818		}
819	} else {
820		if (m->rm_mask != tt->rn_mask) {
821			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
822			goto on1;
823		}
824		if (--m->rm_refs >= 0)
825			goto on1;
826	}
827	b = -1 - tt->rn_bit;
828	t = saved_tt->rn_parent;
829	if (b > t->rn_bit)
830		goto on1; /* Wasn't lifted at all */
831	do {
832		x = t;
833		t = t->rn_parent;
834	} while (b <= t->rn_bit && x != top);
835	for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
836		if (m == saved_m) {
837			*mp = m->rm_mklist;
838			MKFree(m);
839			break;
840		}
841	if (m == 0) {
842		log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
843		if (tt->rn_flags & RNF_NORMAL)
844			return (0); /* Dangling ref to us */
845	}
846on1:
847	/*
848	 * Eliminate us from tree
849	 */
850	if (tt->rn_flags & RNF_ROOT)
851		return (0);
852#ifdef RN_DEBUG
853	/* Get us out of the creation list */
854	for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {}
855	if (t) t->rn_ybro = tt->rn_ybro;
856#endif
857	t = tt->rn_parent;
858	dupedkey = saved_tt->rn_dupedkey;
859	if (dupedkey) {
860		/*
861		 * Here, tt is the deletion target and
862		 * saved_tt is the head of the dupekey chain.
863		 */
864		if (tt == saved_tt) {
865			/* remove from head of chain */
866			x = dupedkey; x->rn_parent = t;
867			if (t->rn_left == tt)
868				t->rn_left = x;
869			else
870				t->rn_right = x;
871		} else {
872			/* find node in front of tt on the chain */
873			for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
874				p = p->rn_dupedkey;
875			if (p) {
876				p->rn_dupedkey = tt->rn_dupedkey;
877				if (tt->rn_dupedkey)		/* parent */
878					tt->rn_dupedkey->rn_parent = p;
879								/* parent */
880			} else log(LOG_ERR, "rn_delete: couldn't find us\n");
881		}
882		t = tt + 1;
883		if  (t->rn_flags & RNF_ACTIVE) {
884#ifndef RN_DEBUG
885			*++x = *t;
886			p = t->rn_parent;
887#else
888			b = t->rn_info;
889			*++x = *t;
890			t->rn_info = b;
891			p = t->rn_parent;
892#endif
893			if (p->rn_left == t)
894				p->rn_left = x;
895			else
896				p->rn_right = x;
897			x->rn_left->rn_parent = x;
898			x->rn_right->rn_parent = x;
899		}
900		goto out;
901	}
902	if (t->rn_left == tt)
903		x = t->rn_right;
904	else
905		x = t->rn_left;
906	p = t->rn_parent;
907	if (p->rn_right == t)
908		p->rn_right = x;
909	else
910		p->rn_left = x;
911	x->rn_parent = p;
912	/*
913	 * Demote routes attached to us.
914	 */
915	if (t->rn_mklist) {
916		if (x->rn_bit >= 0) {
917			for (mp = &x->rn_mklist; (m = *mp);)
918				mp = &m->rm_mklist;
919			*mp = t->rn_mklist;
920		} else {
921			/* If there are any key,mask pairs in a sibling
922			   duped-key chain, some subset will appear sorted
923			   in the same order attached to our mklist */
924			for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
925				if (m == x->rn_mklist) {
926					struct radix_mask *mm = m->rm_mklist;
927					x->rn_mklist = 0;
928					if (--(m->rm_refs) < 0)
929						MKFree(m);
930					m = mm;
931				}
932			if (m)
933				log(LOG_ERR,
934				    "rn_delete: Orphaned Mask %p at %p\n",
935				    (void *)m, (void *)x);
936		}
937	}
938	/*
939	 * We may be holding an active internal node in the tree.
940	 */
941	x = tt + 1;
942	if (t != x) {
943#ifndef RN_DEBUG
944		*t = *x;
945#else
946		b = t->rn_info;
947		*t = *x;
948		t->rn_info = b;
949#endif
950		t->rn_left->rn_parent = t;
951		t->rn_right->rn_parent = t;
952		p = x->rn_parent;
953		if (p->rn_left == x)
954			p->rn_left = t;
955		else
956			p->rn_right = t;
957	}
958out:
959	tt->rn_flags &= ~RNF_ACTIVE;
960	tt[1].rn_flags &= ~RNF_ACTIVE;
961	return (tt);
962}
963
964/*
965 * This is the same as rn_walktree() except for the parameters and the
966 * exit.
967 */
968static int
969rn_walktree_from(h, a, m, f, w)
970	struct radix_node_head *h;
971	void *a, *m;
972	walktree_f_t *f;
973	void *w;
974{
975	int error;
976	struct radix_node *base, *next;
977	u_char *xa = (u_char *)a;
978	u_char *xm = (u_char *)m;
979	register struct radix_node *rn, *last = 0 /* shut up gcc */;
980	int stopping = 0;
981	int lastb;
982
983	/*
984	 * rn_search_m is sort-of-open-coded here. We cannot use the
985	 * function because we need to keep track of the last node seen.
986	 */
987	/* printf("about to search\n"); */
988	for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) {
989		last = rn;
990		/* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n",
991		       rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */
992		if (!(rn->rn_bmask & xm[rn->rn_offset])) {
993			break;
994		}
995		if (rn->rn_bmask & xa[rn->rn_offset]) {
996			rn = rn->rn_right;
997		} else {
998			rn = rn->rn_left;
999		}
1000	}
1001	/* printf("done searching\n"); */
1002
1003	/*
1004	 * Two cases: either we stepped off the end of our mask,
1005	 * in which case last == rn, or we reached a leaf, in which
1006	 * case we want to start from the last node we looked at.
1007	 * Either way, last is the node we want to start from.
1008	 */
1009	rn = last;
1010	lastb = rn->rn_bit;
1011
1012	/* printf("rn %p, lastb %d\n", rn, lastb);*/
1013
1014	/*
1015	 * This gets complicated because we may delete the node
1016	 * while applying the function f to it, so we need to calculate
1017	 * the successor node in advance.
1018	 */
1019	while (rn->rn_bit >= 0)
1020		rn = rn->rn_left;
1021
1022	while (!stopping) {
1023		/* printf("node %p (%d)\n", rn, rn->rn_bit); */
1024		base = rn;
1025		/* If at right child go back up, otherwise, go right */
1026		while (rn->rn_parent->rn_right == rn
1027		       && !(rn->rn_flags & RNF_ROOT)) {
1028			rn = rn->rn_parent;
1029
1030			/* if went up beyond last, stop */
1031			if (rn->rn_bit <= lastb) {
1032				stopping = 1;
1033				/* printf("up too far\n"); */
1034				/*
1035				 * XXX we should jump to the 'Process leaves'
1036				 * part, because the values of 'rn' and 'next'
1037				 * we compute will not be used. Not a big deal
1038				 * because this loop will terminate, but it is
1039				 * inefficient and hard to understand!
1040				 */
1041			}
1042		}
1043
1044		/*
1045		 * At the top of the tree, no need to traverse the right
1046		 * half, prevent the traversal of the entire tree in the
1047		 * case of default route.
1048		 */
1049		if (rn->rn_parent->rn_flags & RNF_ROOT)
1050			stopping = 1;
1051
1052		/* Find the next *leaf* since next node might vanish, too */
1053		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
1054			rn = rn->rn_left;
1055		next = rn;
1056		/* Process leaves */
1057		while ((rn = base) != 0) {
1058			base = rn->rn_dupedkey;
1059			/* printf("leaf %p\n", rn); */
1060			if (!(rn->rn_flags & RNF_ROOT)
1061			    && (error = (*f)(rn, w)))
1062				return (error);
1063		}
1064		rn = next;
1065
1066		if (rn->rn_flags & RNF_ROOT) {
1067			/* printf("root, stopping"); */
1068			stopping = 1;
1069		}
1070
1071	}
1072	return 0;
1073}
1074
1075static int
1076rn_walktree(h, f, w)
1077	struct radix_node_head *h;
1078	walktree_f_t *f;
1079	void *w;
1080{
1081	int error;
1082	struct radix_node *base, *next;
1083	register struct radix_node *rn = h->rnh_treetop;
1084	/*
1085	 * This gets complicated because we may delete the node
1086	 * while applying the function f to it, so we need to calculate
1087	 * the successor node in advance.
1088	 */
1089	/* First time through node, go left */
1090	while (rn->rn_bit >= 0)
1091		rn = rn->rn_left;
1092	for (;;) {
1093		base = rn;
1094		/* If at right child go back up, otherwise, go right */
1095		while (rn->rn_parent->rn_right == rn
1096		       && (rn->rn_flags & RNF_ROOT) == 0)
1097			rn = rn->rn_parent;
1098		/* Find the next *leaf* since next node might vanish, too */
1099		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
1100			rn = rn->rn_left;
1101		next = rn;
1102		/* Process leaves */
1103		while ((rn = base)) {
1104			base = rn->rn_dupedkey;
1105			if (!(rn->rn_flags & RNF_ROOT)
1106			    && (error = (*f)(rn, w)))
1107				return (error);
1108		}
1109		rn = next;
1110		if (rn->rn_flags & RNF_ROOT)
1111			return (0);
1112	}
1113	/* NOTREACHED */
1114}
1115
1116/*
1117 * Allocate and initialize an empty tree. This has 3 nodes, which are
1118 * part of the radix_node_head (in the order <left,root,right>) and are
1119 * marked RNF_ROOT so they cannot be freed.
1120 * The leaves have all-zero and all-one keys, with significant
1121 * bits starting at 'off'.
1122 * Return 1 on success, 0 on error.
1123 */
1124int
1125rn_inithead(head, off)
1126	void **head;
1127	int off;
1128{
1129	register struct radix_node_head *rnh;
1130	register struct radix_node *t, *tt, *ttt;
1131	if (*head)
1132		return (1);
1133	R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
1134	if (rnh == 0)
1135		return (0);
1136#ifdef _KERNEL
1137	RADIX_NODE_HEAD_LOCK_INIT(rnh);
1138#endif
1139	*head = rnh;
1140	t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
1141	ttt = rnh->rnh_nodes + 2;
1142	t->rn_right = ttt;
1143	t->rn_parent = t;
1144	tt = t->rn_left;	/* ... which in turn is rnh->rnh_nodes */
1145	tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
1146	tt->rn_bit = -1 - off;
1147	*ttt = *tt;
1148	ttt->rn_key = rn_ones;
1149	rnh->rnh_addaddr = rn_addroute;
1150	rnh->rnh_deladdr = rn_delete;
1151	rnh->rnh_matchaddr = rn_match;
1152	rnh->rnh_lookup = rn_lookup;
1153	rnh->rnh_walktree = rn_walktree;
1154	rnh->rnh_walktree_from = rn_walktree_from;
1155	rnh->rnh_treetop = t;
1156	return (1);
1157}
1158
1159void
1160rn_init()
1161{
1162	char *cp, *cplim;
1163#ifdef _KERNEL
1164	struct domain *dom;
1165
1166	for (dom = domains; dom; dom = dom->dom_next)
1167		if (dom->dom_maxrtkey > max_keylen)
1168			max_keylen = dom->dom_maxrtkey;
1169#endif
1170	if (max_keylen == 0) {
1171		log(LOG_ERR,
1172		    "rn_init: radix functions require max_keylen be set\n");
1173		return;
1174	}
1175	R_Malloc(rn_zeros, char *, 3 * max_keylen);
1176	if (rn_zeros == NULL)
1177		panic("rn_init");
1178	bzero(rn_zeros, 3 * max_keylen);
1179	rn_ones = cp = rn_zeros + max_keylen;
1180	addmask_key = cplim = rn_ones + max_keylen;
1181	while (cp < cplim)
1182		*cp++ = -1;
1183	if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
1184		panic("rn_init 2");
1185}
1186