1178168Sqingli/*	$KAME: radix_mpath.c,v 1.17 2004/11/08 10:29:39 itojun Exp $	*/
2178168Sqingli
3178168Sqingli/*
4178168Sqingli * Copyright (C) 2001 WIDE Project.
5178168Sqingli * All rights reserved.
6178168Sqingli *
7178168Sqingli * Redistribution and use in source and binary forms, with or without
8178168Sqingli * modification, are permitted provided that the following conditions
9178168Sqingli * are met:
10178168Sqingli * 1. Redistributions of source code must retain the above copyright
11178168Sqingli *    notice, this list of conditions and the following disclaimer.
12178168Sqingli * 2. Redistributions in binary form must reproduce the above copyright
13178168Sqingli *    notice, this list of conditions and the following disclaimer in the
14178168Sqingli *    documentation and/or other materials provided with the distribution.
15178168Sqingli * 3. Neither the name of the project nor the names of its contributors
16178168Sqingli *    may be used to endorse or promote products derived from this software
17178168Sqingli *    without specific prior written permission.
18178168Sqingli *
19178168Sqingli * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20178168Sqingli * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21178168Sqingli * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22178168Sqingli * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23178168Sqingli * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24178168Sqingli * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25178168Sqingli * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26178168Sqingli * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27178168Sqingli * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28178168Sqingli * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29178168Sqingli * SUCH DAMAGE.
30178168Sqingli * THE AUTHORS DO NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE
31178168Sqingli * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHORS
32178168Sqingli * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL
33178168Sqingli * PROPERTIES.
34178168Sqingli */
35178168Sqingli
36178168Sqingli#include <sys/cdefs.h>
37178168Sqingli__FBSDID("$FreeBSD$");
38178168Sqingli
39184681Sbz#include "opt_inet.h"
40178187Sqingli#include "opt_inet6.h"
41178187Sqingli
42178168Sqingli#include <sys/param.h>
43178168Sqingli#include <sys/systm.h>
44178168Sqingli#include <sys/malloc.h>
45178168Sqingli#include <sys/socket.h>
46178168Sqingli#include <sys/domain.h>
47178168Sqingli#include <sys/syslog.h>
48178168Sqingli#include <net/radix.h>
49178168Sqingli#include <net/radix_mpath.h>
50178168Sqingli#include <net/route.h>
51178168Sqingli#include <net/if.h>
52178168Sqingli#include <net/if_var.h>
53178168Sqingli
54178168Sqingli/*
55178168Sqingli * give some jitter to hash, to avoid synchronization between routers
56178168Sqingli */
57184837Skmacystatic uint32_t hashjitter;
58178168Sqingli
59178168Sqingliint
60178168Sqinglirn_mpath_capable(struct radix_node_head *rnh)
61178168Sqingli{
62178168Sqingli
63178168Sqingli	return rnh->rnh_multipath;
64178168Sqingli}
65178168Sqingli
66178168Sqinglistruct radix_node *
67178168Sqinglirn_mpath_next(struct radix_node *rn)
68178168Sqingli{
69178168Sqingli	struct radix_node *next;
70178168Sqingli
71178168Sqingli	if (!rn->rn_dupedkey)
72178168Sqingli		return NULL;
73178168Sqingli	next = rn->rn_dupedkey;
74178168Sqingli	if (rn->rn_mask == next->rn_mask)
75178168Sqingli		return next;
76178168Sqingli	else
77178168Sqingli		return NULL;
78178168Sqingli}
79178168Sqingli
80191080Skmacyuint32_t
81178168Sqinglirn_mpath_count(struct radix_node *rn)
82178168Sqingli{
83191080Skmacy	uint32_t i = 0;
84191080Skmacy	struct rtentry *rt;
85191080Skmacy
86191080Skmacy	while (rn != NULL) {
87191080Skmacy		rt = (struct rtentry *)rn;
88263478Sglebius		i += rt->rt_weight;
89191080Skmacy		rn = rn_mpath_next(rn);
90191080Skmacy	}
91191080Skmacy	return (i);
92178168Sqingli}
93178168Sqingli
94178168Sqinglistruct rtentry *
95178168Sqinglirt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate)
96178168Sqingli{
97178168Sqingli	struct radix_node *rn;
98178168Sqingli
99225163Sqingli	if (!gate || !rt->rt_gateway)
100178168Sqingli		return NULL;
101178168Sqingli
102178168Sqingli	/* beyond here, we use rn as the master copy */
103178168Sqingli	rn = (struct radix_node *)rt;
104178168Sqingli	do {
105178168Sqingli		rt = (struct rtentry *)rn;
106178168Sqingli		/*
107178168Sqingli		 * we are removing an address alias that has
108178168Sqingli		 * the same prefix as another address
109178168Sqingli		 * we need to compare the interface address because
110178168Sqingli		 * rt_gateway is a special sockadd_dl structure
111178168Sqingli		 */
112178168Sqingli		if (rt->rt_gateway->sa_family == AF_LINK) {
113178168Sqingli			if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len))
114178168Sqingli				break;
115178168Sqingli		}
116265711Smelifaro
117265711Smelifaro		/*
118265711Smelifaro		 * Check for other options:
119265711Smelifaro		 * 1) Routes with 'real' IPv4/IPv6 gateway
120265711Smelifaro		 * 2) Loopback host routes (another AF_LINK/sockadd_dl check)
121265711Smelifaro		 * */
122265711Smelifaro		if (rt->rt_gateway->sa_len == gate->sa_len &&
123265711Smelifaro		    !memcmp(rt->rt_gateway, gate, gate->sa_len))
124265711Smelifaro			break;
125178168Sqingli	} while ((rn = rn_mpath_next(rn)) != NULL);
126178168Sqingli
127178168Sqingli	return (struct rtentry *)rn;
128178168Sqingli}
129178168Sqingli
130178168Sqingli/*
131178168Sqingli * go through the chain and unlink "rt" from the list
132178168Sqingli * the caller will free "rt"
133178168Sqingli */
134178168Sqingliint
135178168Sqinglirt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt)
136178168Sqingli{
137178168Sqingli        struct radix_node *t, *tt;
138178168Sqingli
139178168Sqingli        if (!headrt || !rt)
140178168Sqingli            return (0);
141178168Sqingli        t = (struct radix_node *)headrt;
142178168Sqingli        tt = rn_mpath_next(t);
143178168Sqingli        while (tt) {
144178168Sqingli            if (tt == (struct radix_node *)rt) {
145178168Sqingli                t->rn_dupedkey = tt->rn_dupedkey;
146178168Sqingli                tt->rn_dupedkey = NULL;
147178168Sqingli    	        tt->rn_flags &= ~RNF_ACTIVE;
148178168Sqingli	        tt[1].rn_flags &= ~RNF_ACTIVE;
149178168Sqingli                return (1);
150178168Sqingli            }
151178168Sqingli            t = tt;
152178168Sqingli            tt = rn_mpath_next((struct radix_node *)t);
153178168Sqingli        }
154178168Sqingli        return (0);
155178168Sqingli}
156178168Sqingli
157178168Sqingli/*
158178168Sqingli * check if we have the same key/mask/gateway on the table already.
159265708Smelifaro * Assume @rt rt_key host bits are cleared according to @netmask
160178168Sqingli */
161178168Sqingliint
162178168Sqinglirt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt,
163178168Sqingli    struct sockaddr *netmask)
164178168Sqingli{
165178168Sqingli	struct radix_node *rn, *rn1;
166178168Sqingli	struct rtentry *rt1;
167178168Sqingli
168178168Sqingli	rn = (struct radix_node *)rt;
169178168Sqingli	rn1 = rnh->rnh_lookup(rt_key(rt), netmask, rnh);
170178168Sqingli	if (!rn1 || rn1->rn_flags & RNF_ROOT)
171265708Smelifaro		return (0);
172178168Sqingli
173265708Smelifaro	/* key/mask are the same. compare gateway for all multipaths */
174178168Sqingli	do {
175178168Sqingli		rt1 = (struct rtentry *)rn1;
176178168Sqingli
177178168Sqingli		/* sanity: no use in comparing the same thing */
178178168Sqingli		if (rn1 == rn)
179178168Sqingli			continue;
180178168Sqingli
181178168Sqingli		if (rt1->rt_gateway->sa_family == AF_LINK) {
182178168Sqingli			if (rt1->rt_ifa->ifa_addr->sa_len != rt->rt_ifa->ifa_addr->sa_len ||
183178168Sqingli			    bcmp(rt1->rt_ifa->ifa_addr, rt->rt_ifa->ifa_addr,
184178168Sqingli			    rt1->rt_ifa->ifa_addr->sa_len))
185178168Sqingli				continue;
186178168Sqingli		} else {
187178168Sqingli			if (rt1->rt_gateway->sa_len != rt->rt_gateway->sa_len ||
188178168Sqingli			    bcmp(rt1->rt_gateway, rt->rt_gateway,
189178168Sqingli			    rt1->rt_gateway->sa_len))
190178168Sqingli				continue;
191178168Sqingli		}
192178168Sqingli
193178168Sqingli		/* all key/mask/gateway are the same.  conflicting entry. */
194265708Smelifaro		return (EEXIST);
195178168Sqingli	} while ((rn1 = rn_mpath_next(rn1)) != NULL);
196178168Sqingli
197265708Smelifaro	return (0);
198178168Sqingli}
199178168Sqingli
200178168Sqinglivoid
201191080Skmacyrtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
202178168Sqingli{
203178168Sqingli	struct radix_node *rn0, *rn;
204179426Sqingli	u_int32_t n;
205191080Skmacy	struct rtentry *rt;
206191080Skmacy	int64_t weight;
207178168Sqingli
208178168Sqingli	/*
209178168Sqingli	 * XXX we don't attempt to lookup cached route again; what should
210178168Sqingli	 * be done for sendto(3) case?
211178168Sqingli	 */
212204902Sqingli	if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP)
213204902Sqingli	    && RT_LINK_IS_UP(ro->ro_rt->rt_ifp))
214186119Sqingli		return;
215186119Sqingli	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum);
216178168Sqingli
217178168Sqingli	/* if the route does not exist or it is not multipath, don't care */
218178454Sqingli	if (ro->ro_rt == NULL)
219178454Sqingli		return;
220178454Sqingli	if (rn_mpath_next((struct radix_node *)ro->ro_rt) == NULL) {
221178168Sqingli		RT_UNLOCK(ro->ro_rt);
222178168Sqingli		return;
223178168Sqingli	}
224178168Sqingli
225178168Sqingli	/* beyond here, we use rn as the master copy */
226178168Sqingli	rn0 = rn = (struct radix_node *)ro->ro_rt;
227178168Sqingli	n = rn_mpath_count(rn0);
228178168Sqingli
229178168Sqingli	/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
230178168Sqingli	hash += hashjitter;
231178168Sqingli	hash %= n;
232191080Skmacy	for (weight = abs((int32_t)hash), rt = ro->ro_rt;
233263478Sglebius	     weight >= rt->rt_weight && rn;
234263478Sglebius	     weight -= rt->rt_weight) {
235191080Skmacy
236178168Sqingli		/* stay within the multipath routes */
237178168Sqingli		if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
238178168Sqingli			break;
239178168Sqingli		rn = rn->rn_dupedkey;
240191080Skmacy		rt = (struct rtentry *)rn;
241178168Sqingli	}
242178168Sqingli	/* XXX try filling rt_gwroute and avoid unreachable gw  */
243178168Sqingli
244191080Skmacy	/* gw selection has failed - there must be only zero weight routes */
245178168Sqingli	if (!rn) {
246178168Sqingli		RT_UNLOCK(ro->ro_rt);
247191080Skmacy		ro->ro_rt = NULL;
248178168Sqingli		return;
249178168Sqingli	}
250191080Skmacy	if (ro->ro_rt != rt) {
251191080Skmacy		RTFREE_LOCKED(ro->ro_rt);
252191080Skmacy		ro->ro_rt = (struct rtentry *)rn;
253191080Skmacy		RT_LOCK(ro->ro_rt);
254191080Skmacy		RT_ADDREF(ro->ro_rt);
255191080Skmacy
256191080Skmacy	}
257178168Sqingli	RT_UNLOCK(ro->ro_rt);
258178168Sqingli}
259178168Sqingli
260178168Sqingliextern int	in6_inithead(void **head, int off);
261178168Sqingliextern int	in_inithead(void **head, int off);
262178168Sqingli
263184681Sbz#ifdef INET
264178168Sqingliint
265178168Sqinglirn4_mpath_inithead(void **head, int off)
266178168Sqingli{
267178168Sqingli	struct radix_node_head *rnh;
268178168Sqingli
269178168Sqingli	hashjitter = arc4random();
270178168Sqingli	if (in_inithead(head, off) == 1) {
271178168Sqingli		rnh = (struct radix_node_head *)*head;
272178168Sqingli		rnh->rnh_multipath = 1;
273178168Sqingli		return 1;
274178168Sqingli	} else
275178168Sqingli		return 0;
276178168Sqingli}
277184681Sbz#endif
278178168Sqingli
279178183Sphk#ifdef INET6
280178168Sqingliint
281178168Sqinglirn6_mpath_inithead(void **head, int off)
282178168Sqingli{
283178168Sqingli	struct radix_node_head *rnh;
284178168Sqingli
285178168Sqingli	hashjitter = arc4random();
286178168Sqingli	if (in6_inithead(head, off) == 1) {
287178168Sqingli		rnh = (struct radix_node_head *)*head;
288178168Sqingli		rnh->rnh_multipath = 1;
289178168Sqingli		return 1;
290178168Sqingli	} else
291178168Sqingli		return 0;
292178168Sqingli}
293178168Sqingli
294178183Sphk#endif
295