route.c revision 262743
1/*-
2 * Copyright (c) 1980, 1986, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
30 * $FreeBSD: stable/10/sys/net/route.c 262743 2014-03-04 15:14:47Z glebius $
31 */
32/************************************************************************
33 * Note: In this file a 'fib' is a "forwarding information base"	*
34 * Which is the new name for an in kernel routing (next hop) table.	*
35 ***********************************************************************/
36
37#include "opt_inet.h"
38#include "opt_inet6.h"
39#include "opt_route.h"
40#include "opt_mrouting.h"
41#include "opt_mpath.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/syslog.h>
46#include <sys/malloc.h>
47#include <sys/mbuf.h>
48#include <sys/socket.h>
49#include <sys/sysctl.h>
50#include <sys/syslog.h>
51#include <sys/sysproto.h>
52#include <sys/proc.h>
53#include <sys/domain.h>
54#include <sys/kernel.h>
55
56#include <net/if.h>
57#include <net/if_dl.h>
58#include <net/route.h>
59#include <net/vnet.h>
60#include <net/flowtable.h>
61
62#ifdef RADIX_MPATH
63#include <net/radix_mpath.h>
64#endif
65
66#include <netinet/in.h>
67#include <netinet/ip_mroute.h>
68
69#include <vm/uma.h>
70
/*
 * Hard upper bound on the number of FIBs.  ROUTETABLES is checked against
 * it at compile time and route_init() clamps rt_numfibs to it at boot.
 */
#define	RT_MAXFIBS	UINT16_MAX

/*
 * Kernel config default option.  ROUTETABLES, when defined, must lie in
 * 1..RT_MAXFIBS and becomes the compiled-in FIB count.
 */
#ifdef ROUTETABLES
#if ROUTETABLES <= 0
#error "ROUTETABLES defined too low"
#endif
#if ROUTETABLES > RT_MAXFIBS
#error "ROUTETABLES defined too big"
#endif
#define	RT_NUMFIBS	ROUTETABLES
#endif /* ROUTETABLES */
/* Initialize to default (a single FIB) if not otherwise set. */
#ifndef	RT_NUMFIBS
#define	RT_NUMFIBS	1
#endif
87
/*
 * Number of FIBs.  The net.fibs sysctl below is read-only (CTLFLAG_RD);
 * the value starts at the compiled-in RT_NUMFIBS.
 */
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
 * The net.fibs tunable can set this too big (or to zero), but
 * route_init() forces it into 1..RT_MAXFIBS before it is used.
 */
TUNABLE_INT("net.fibs", &rt_numfibs);
93
/*
 * By default add routes to all fibs for new interfaces.
 * Once this is set to 0 then only allocate routes on interface
 * changes for the FIB of the caller when adding a new set of addresses
 * to an interface.  XXX this is a shotgun approach to a problem that needs
 * a more fine grained solution.. that will come.
 * XXX also has the problems getting the FIB from curthread which will not
 * always work given the fib can be overridden and prefixes can be added
 * from the network stack context.
 *
 * Writable at run time through the net.add_addr_allfibs sysctl and
 * settable through the tunable of the same name.
 */
u_int rt_add_addr_allfibs = 1;
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
    &rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
108
/* Routing statistics counters (e.g. rts_unreach, rts_badredirect). */
VNET_DEFINE(struct rtstat, rtstat);
#define	V_rtstat	VNET(rtstat)

/*
 * Base of the radix head pointer array.  It is allocated in
 * vnet_route_init() with rt_numfibs * (AF_MAX+1) slots and indexed by
 * rt_tables_get_rnh_ptr() as [fib][address family].
 */
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define	V_rt_tables	VNET(rt_tables)

VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
#define	V_rttrash	VNET(rttrash)
117
118
/*
 * compare two sockaddr structures
 *
 * Exactly (a1)->sa_len bytes are compared, so the length recorded in a1
 * decides how much of a2 is examined.  a1 is evaluated twice; do not pass
 * an expression with side effects.
 */
#define	sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
121
/*
 * Convert a 'struct radix_node *' to a 'struct rtentry *'.
 * The operation can be done safely (in this code) because a
 * 'struct rtentry' starts with two 'struct radix_node''s, the first
 * one representing leaf nodes in the routing tree, which is
 * what the code in radix.c passes us as a 'struct radix_node'.
 *
 * But because there are a lot of assumptions in this conversion,
 * do not cast explicitly, but always use the macro below.
 *
 * The macro itself is a plain pointer cast and performs no check on p.
 */
#define RNTORT(p)	((struct rtentry *)(p))
133
/*
 * Zone created in vnet_route_init() with objects of
 * sizeof(struct rtentry); rtfree() returns entries to it.
 */
static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
#define	V_rtzone	VNET(rtzone)
136
137/*
138 * handler for net.my_fibnum
139 */
140static int
141sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
142{
143        int fibnum;
144        int error;
145
146        fibnum = curthread->td_proc->p_fibnum;
147        error = sysctl_handle_int(oidp, &fibnum, 0, req);
148        return (error);
149}
150
151SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
152            NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
153
154static __inline struct radix_node_head **
155rt_tables_get_rnh_ptr(int table, int fam)
156{
157	struct radix_node_head **rnh;
158
159	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
160	    __func__));
161	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
162	    __func__));
163
164	/* rnh is [fib=0][af=0]. */
165	rnh = (struct radix_node_head **)V_rt_tables;
166	/* Get the offset to the requested table and fam. */
167	rnh += table * (AF_MAX+1) + fam;
168
169	return (rnh);
170}
171
/*
 * Fetch the radix head stored for the given FIB and address family.
 * The stored pointer is returned unchanged; callers in this file test
 * the result against NULL.
 */
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
	struct radix_node_head **slot;

	slot = rt_tables_get_rnh_ptr(table, fam);
	return (*slot);
}
178
179/*
180 * route initialization must occur before ip6_init2(), which happenas at
181 * SI_ORDER_MIDDLE.
182 */
183static void
184route_init(void)
185{
186	struct domain *dom;
187	int max_keylen = 0;
188
189	/* whack the tunable ints into  line. */
190	if (rt_numfibs > RT_MAXFIBS)
191		rt_numfibs = RT_MAXFIBS;
192	if (rt_numfibs == 0)
193		rt_numfibs = 1;
194
195	for (dom = domains; dom; dom = dom->dom_next)
196		if (dom->dom_maxrtkey > max_keylen)
197			max_keylen = dom->dom_maxrtkey;
198
199	rn_init(max_keylen);	/* init all zeroes, all ones, mask table */
200}
201SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
202
203static void
204vnet_route_init(const void *unused __unused)
205{
206	struct domain *dom;
207	struct radix_node_head **rnh;
208	int table;
209	int fam;
210
211	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
212	    sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
213
214	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
215	    NULL, NULL, UMA_ALIGN_PTR, 0);
216	for (dom = domains; dom; dom = dom->dom_next) {
217		if (dom->dom_rtattach == NULL)
218			continue;
219
220		for  (table = 0; table < rt_numfibs; table++) {
221			fam = dom->dom_family;
222			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
223				break;
224
225			/*
226			 * XXX MRT rtattach will be also called from
227			 * vfs_export.c but the offset will be 0 (only for
228			 * AF_INET and AF_INET6 which don't need it anyhow).
229			 */
230			rnh = rt_tables_get_rnh_ptr(table, fam);
231			if (rnh == NULL)
232				panic("%s: rnh NULL", __func__);
233			dom->dom_rtattach((void **)rnh, dom->dom_rtoffset);
234		}
235	}
236}
237VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
238    vnet_route_init, 0);
239
#ifdef VIMAGE
/*
 * Counterpart of vnet_route_init(): detach every table through the
 * owning domain's dom_rtdetach, then release the pointer array and
 * destroy the rtentry zone.
 */
static void
vnet_route_uninit(const void *unused __unused)
{
	struct radix_node_head **slot;
	struct domain *dp;
	int fibidx;
	int af;

	for (dp = domains; dp != NULL; dp = dp->dom_next) {
		if (dp->dom_rtdetach == NULL)
			continue;

		af = dp->dom_family;
		for (fibidx = 0; fibidx < rt_numfibs; fibidx++) {
			/* Only AF_INET and AF_INET6 have tables beyond FIB 0. */
			if (fibidx != 0 && af != AF_INET && af != AF_INET6)
				break;

			slot = rt_tables_get_rnh_ptr(fibidx, af);
			if (slot == NULL)
				panic("%s: rnh NULL", __func__);
			dp->dom_rtdetach((void **)slot, dp->dom_rtoffset);
		}
	}

	free(V_rt_tables, M_RTABLE);
	uma_zdestroy(V_rtzone);
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_route_uninit, 0);
#endif
272
273#ifndef _SYS_SYSPROTO_H_
274struct setfib_args {
275	int     fibnum;
276};
277#endif
278int
279sys_setfib(struct thread *td, struct setfib_args *uap)
280{
281	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
282		return EINVAL;
283	td->td_proc->p_fibnum = uap->fibnum;
284	return (0);
285}
286
287/*
288 * Packet routing routines.
289 */
290void
291rtalloc(struct route *ro)
292{
293
294	rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB);
295}
296
297void
298rtalloc_fib(struct route *ro, u_int fibnum)
299{
300	rtalloc_ign_fib(ro, 0UL, fibnum);
301}
302
303void
304rtalloc_ign(struct route *ro, u_long ignore)
305{
306	struct rtentry *rt;
307
308	if ((rt = ro->ro_rt) != NULL) {
309		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
310			return;
311		RTFREE(rt);
312		ro->ro_rt = NULL;
313	}
314	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB);
315	if (ro->ro_rt)
316		RT_UNLOCK(ro->ro_rt);
317}
318
319void
320rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
321{
322	struct rtentry *rt;
323
324	if ((rt = ro->ro_rt) != NULL) {
325		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
326			return;
327		RTFREE(rt);
328		ro->ro_rt = NULL;
329	}
330	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
331	if (ro->ro_rt)
332		RT_UNLOCK(ro->ro_rt);
333}
334
335/*
336 * Look up the route that matches the address given
337 * Or, at least try.. Create a cloned route if needed.
338 *
339 * The returned route, if any, is locked.
340 */
341struct rtentry *
342rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
343{
344
345	return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
346}
347
348struct rtentry *
349rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
350		    u_int fibnum)
351{
352	struct radix_node_head *rnh;
353	struct radix_node *rn;
354	struct rtentry *newrt;
355	struct rt_addrinfo info;
356	int err = 0, msgtype = RTM_MISS;
357	int needlock;
358
359	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
360	switch (dst->sa_family) {
361	case AF_INET6:
362	case AF_INET:
363		/* We support multiple FIBs. */
364		break;
365	default:
366		fibnum = RT_DEFAULT_FIB;
367		break;
368	}
369	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
370	newrt = NULL;
371	if (rnh == NULL)
372		goto miss;
373
374	/*
375	 * Look up the address in the table for that Address Family
376	 */
377	needlock = !(ignflags & RTF_RNH_LOCKED);
378	if (needlock)
379		RADIX_NODE_HEAD_RLOCK(rnh);
380#ifdef INVARIANTS
381	else
382		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
383#endif
384	rn = rnh->rnh_matchaddr(dst, rnh);
385	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
386		newrt = RNTORT(rn);
387		RT_LOCK(newrt);
388		RT_ADDREF(newrt);
389		if (needlock)
390			RADIX_NODE_HEAD_RUNLOCK(rnh);
391		goto done;
392
393	} else if (needlock)
394		RADIX_NODE_HEAD_RUNLOCK(rnh);
395
396	/*
397	 * Either we hit the root or couldn't find any match,
398	 * Which basically means
399	 * "caint get there frm here"
400	 */
401miss:
402	V_rtstat.rts_unreach++;
403
404	if (report) {
405		/*
406		 * If required, report the failure to the supervising
407		 * Authorities.
408		 * For a delete, this is not an error. (report == 0)
409		 */
410		bzero(&info, sizeof(info));
411		info.rti_info[RTAX_DST] = dst;
412		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
413	}
414done:
415	if (newrt)
416		RT_LOCK_ASSERT(newrt);
417	return (newrt);
418}
419
420/*
421 * Remove a reference count from an rtentry.
422 * If the count gets low enough, take it out of the routing table
423 */
424void
425rtfree(struct rtentry *rt)
426{
427	struct radix_node_head *rnh;
428
429	KASSERT(rt != NULL,("%s: NULL rt", __func__));
430	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
431	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
432
433	RT_LOCK_ASSERT(rt);
434
435	/*
436	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
437	 * we should come here exactly with the last reference.
438	 */
439	RT_REMREF(rt);
440	if (rt->rt_refcnt > 0) {
441		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
442		goto done;
443	}
444
445	/*
446	 * On last reference give the "close method" a chance
447	 * to cleanup private state.  This also permits (for
448	 * IPv4 and IPv6) a chance to decide if the routing table
449	 * entry should be purged immediately or at a later time.
450	 * When an immediate purge is to happen the close routine
451	 * typically calls rtexpunge which clears the RTF_UP flag
452	 * on the entry so that the code below reclaims the storage.
453	 */
454	if (rt->rt_refcnt == 0 && rnh->rnh_close)
455		rnh->rnh_close((struct radix_node *)rt, rnh);
456
457	/*
458	 * If we are no longer "up" (and ref == 0)
459	 * then we can free the resources associated
460	 * with the route.
461	 */
462	if ((rt->rt_flags & RTF_UP) == 0) {
463		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
464			panic("rtfree 2");
465		/*
466		 * the rtentry must have been removed from the routing table
467		 * so it is represented in rttrash.. remove that now.
468		 */
469		V_rttrash--;
470#ifdef	DIAGNOSTIC
471		if (rt->rt_refcnt < 0) {
472			printf("rtfree: %p not freed (neg refs)\n", rt);
473			goto done;
474		}
475#endif
476		/*
477		 * release references on items we hold them on..
478		 * e.g other routes and ifaddrs.
479		 */
480		if (rt->rt_ifa)
481			ifa_free(rt->rt_ifa);
482		/*
483		 * The key is separatly alloc'd so free it (see rt_setgate()).
484		 * This also frees the gateway, as they are always malloc'd
485		 * together.
486		 */
487		Free(rt_key(rt));
488
489		/*
490		 * and the rtentry itself of course
491		 */
492		RT_LOCK_DESTROY(rt);
493		uma_zfree(V_rtzone, rt);
494		return;
495	}
496done:
497	RT_UNLOCK(rt);
498}
499
500
501/*
502 * Force a routing table entry to the specified
503 * destination to go through the given gateway.
504 * Normally called as a result of a routing redirect
505 * message from the network layer.
506 */
507void
508rtredirect(struct sockaddr *dst,
509	struct sockaddr *gateway,
510	struct sockaddr *netmask,
511	int flags,
512	struct sockaddr *src)
513{
514
515	rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB);
516}
517
518void
519rtredirect_fib(struct sockaddr *dst,
520	struct sockaddr *gateway,
521	struct sockaddr *netmask,
522	int flags,
523	struct sockaddr *src,
524	u_int fibnum)
525{
526	struct rtentry *rt, *rt0 = NULL;
527	int error = 0;
528	short *stat = NULL;
529	struct rt_addrinfo info;
530	struct ifaddr *ifa;
531	struct radix_node_head *rnh;
532
533	ifa = NULL;
534	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
535	if (rnh == NULL) {
536		error = EAFNOSUPPORT;
537		goto out;
538	}
539
540	/* verify the gateway is directly reachable */
541	if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
542		error = ENETUNREACH;
543		goto out;
544	}
545	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
546	/*
547	 * If the redirect isn't from our current router for this dst,
548	 * it's either old or wrong.  If it redirects us to ourselves,
549	 * we have a routing loop, perhaps as a result of an interface
550	 * going down recently.
551	 */
552	if (!(flags & RTF_DONE) && rt &&
553	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
554		error = EINVAL;
555	else if (ifa_ifwithaddr_check(gateway))
556		error = EHOSTUNREACH;
557	if (error)
558		goto done;
559	/*
560	 * Create a new entry if we just got back a wildcard entry
561	 * or the lookup failed.  This is necessary for hosts
562	 * which use routing redirects generated by smart gateways
563	 * to dynamically build the routing tables.
564	 */
565	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
566		goto create;
567	/*
568	 * Don't listen to the redirect if it's
569	 * for a route to an interface.
570	 */
571	if (rt->rt_flags & RTF_GATEWAY) {
572		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
573			/*
574			 * Changing from route to net => route to host.
575			 * Create new route, rather than smashing route to net.
576			 */
577		create:
578			rt0 = rt;
579			rt = NULL;
580
581			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
582			bzero((caddr_t)&info, sizeof(info));
583			info.rti_info[RTAX_DST] = dst;
584			info.rti_info[RTAX_GATEWAY] = gateway;
585			info.rti_info[RTAX_NETMASK] = netmask;
586			info.rti_ifa = ifa;
587			info.rti_flags = flags;
588			if (rt0 != NULL)
589				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with RNH */
590			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
591			if (rt != NULL) {
592				RT_LOCK(rt);
593				if (rt0 != NULL)
594					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
595				flags = rt->rt_flags;
596			}
597			if (rt0 != NULL)
598				RTFREE(rt0);
599
600			stat = &V_rtstat.rts_dynamic;
601		} else {
602			struct rtentry *gwrt;
603
604			/*
605			 * Smash the current notion of the gateway to
606			 * this destination.  Should check about netmask!!!
607			 */
608			rt->rt_flags |= RTF_MODIFIED;
609			flags |= RTF_MODIFIED;
610			stat = &V_rtstat.rts_newgateway;
611			/*
612			 * add the key and gateway (in one malloc'd chunk).
613			 */
614			RT_UNLOCK(rt);
615			RADIX_NODE_HEAD_LOCK(rnh);
616			RT_LOCK(rt);
617			rt_setgate(rt, rt_key(rt), gateway);
618			gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
619			RADIX_NODE_HEAD_UNLOCK(rnh);
620			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
621			RTFREE_LOCKED(gwrt);
622		}
623	} else
624		error = EHOSTUNREACH;
625done:
626	if (rt)
627		RTFREE_LOCKED(rt);
628out:
629	if (error)
630		V_rtstat.rts_badredirect++;
631	else if (stat != NULL)
632		(*stat)++;
633	bzero((caddr_t)&info, sizeof(info));
634	info.rti_info[RTAX_DST] = dst;
635	info.rti_info[RTAX_GATEWAY] = gateway;
636	info.rti_info[RTAX_NETMASK] = netmask;
637	info.rti_info[RTAX_AUTHOR] = src;
638	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
639	if (ifa != NULL)
640		ifa_free(ifa);
641}
642
643int
644rtioctl(u_long req, caddr_t data)
645{
646
647	return (rtioctl_fib(req, data, RT_DEFAULT_FIB));
648}
649
650/*
651 * Routing table ioctl interface.
652 */
653int
654rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
655{
656
657	/*
658	 * If more ioctl commands are added here, make sure the proper
659	 * super-user checks are being performed because it is possible for
660	 * prison-root to make it this far if raw sockets have been enabled
661	 * in jails.
662	 */
663#ifdef INET
664	/* Multicast goop, grrr... */
665	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
666#else /* INET */
667	return ENXIO;
668#endif /* INET */
669}
670
671/*
672 * For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
673 */
674struct ifaddr *
675ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
676{
677
678	return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB));
679}
680
681struct ifaddr *
682ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
683				u_int fibnum)
684{
685	register struct ifaddr *ifa;
686	int not_found = 0;
687
688	if ((flags & RTF_GATEWAY) == 0) {
689		/*
690		 * If we are adding a route to an interface,
691		 * and the interface is a pt to pt link
692		 * we should search for the destination
693		 * as our clue to the interface.  Otherwise
694		 * we can use the local address.
695		 */
696		ifa = NULL;
697		if (flags & RTF_HOST)
698			ifa = ifa_ifwithdstaddr(dst);
699		if (ifa == NULL)
700			ifa = ifa_ifwithaddr(gateway);
701	} else {
702		/*
703		 * If we are adding a route to a remote net
704		 * or host, the gateway may still be on the
705		 * other end of a pt to pt link.
706		 */
707		ifa = ifa_ifwithdstaddr(gateway);
708	}
709	if (ifa == NULL)
710		ifa = ifa_ifwithnet(gateway, 0);
711	if (ifa == NULL) {
712		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
713		if (rt == NULL)
714			return (NULL);
715		/*
716		 * dismiss a gateway that is reachable only
717		 * through the default router
718		 */
719		switch (gateway->sa_family) {
720		case AF_INET:
721			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
722				not_found = 1;
723			break;
724		case AF_INET6:
725			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
726				not_found = 1;
727			break;
728		default:
729			break;
730		}
731		if (!not_found && rt->rt_ifa != NULL) {
732			ifa = rt->rt_ifa;
733			ifa_ref(ifa);
734		}
735		RT_REMREF(rt);
736		RT_UNLOCK(rt);
737		if (not_found || ifa == NULL)
738			return (NULL);
739	}
740	if (ifa->ifa_addr->sa_family != dst->sa_family) {
741		struct ifaddr *oifa = ifa;
742		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
743		if (ifa == NULL)
744			ifa = oifa;
745		else
746			ifa_free(oifa);
747	}
748	return (ifa);
749}
750
751/*
752 * Do appropriate manipulations of a routing tree given
753 * all the bits of info needed
754 */
755int
756rtrequest(int req,
757	struct sockaddr *dst,
758	struct sockaddr *gateway,
759	struct sockaddr *netmask,
760	int flags,
761	struct rtentry **ret_nrt)
762{
763
764	return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
765	    RT_DEFAULT_FIB));
766}
767
768int
769rtrequest_fib(int req,
770	struct sockaddr *dst,
771	struct sockaddr *gateway,
772	struct sockaddr *netmask,
773	int flags,
774	struct rtentry **ret_nrt,
775	u_int fibnum)
776{
777	struct rt_addrinfo info;
778
779	if (dst->sa_len == 0)
780		return(EINVAL);
781
782	bzero((caddr_t)&info, sizeof(info));
783	info.rti_flags = flags;
784	info.rti_info[RTAX_DST] = dst;
785	info.rti_info[RTAX_GATEWAY] = gateway;
786	info.rti_info[RTAX_NETMASK] = netmask;
787	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
788}
789
/*
 * These (questionable) definitions of apparent local variables apply
 * to the functions that follow.  XXXXXX!!!
 *
 * Each name expands to a member of a 'struct rt_addrinfo' reached
 * through a pointer that must be called 'info', so any code using them
 * needs such a pointer in scope.  While they are defined, these
 * identifiers cannot be used as ordinary variable or parameter names.
 */
#define	dst	info->rti_info[RTAX_DST]
#define	gateway	info->rti_info[RTAX_GATEWAY]
#define	netmask	info->rti_info[RTAX_NETMASK]
#define	ifaaddr	info->rti_info[RTAX_IFA]
#define	ifpaddr	info->rti_info[RTAX_IFP]
#define	flags	info->rti_flags
800
801int
802rt_getifa(struct rt_addrinfo *info)
803{
804
805	return (rt_getifa_fib(info, RT_DEFAULT_FIB));
806}
807
808/*
809 * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
810 * it will be referenced so the caller must free it.
811 */
812int
813rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
814{
815	struct ifaddr *ifa;
816	int error = 0;
817
818	/*
819	 * ifp may be specified by sockaddr_dl
820	 * when protocol address is ambiguous.
821	 */
822	if (info->rti_ifp == NULL && ifpaddr != NULL &&
823	    ifpaddr->sa_family == AF_LINK &&
824	    (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
825		info->rti_ifp = ifa->ifa_ifp;
826		ifa_free(ifa);
827	}
828	if (info->rti_ifa == NULL && ifaaddr != NULL)
829		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
830	if (info->rti_ifa == NULL) {
831		struct sockaddr *sa;
832
833		sa = ifaaddr != NULL ? ifaaddr :
834		    (gateway != NULL ? gateway : dst);
835		if (sa != NULL && info->rti_ifp != NULL)
836			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
837		else if (dst != NULL && gateway != NULL)
838			info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
839							fibnum);
840		else if (sa != NULL)
841			info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
842							fibnum);
843	}
844	if ((ifa = info->rti_ifa) != NULL) {
845		if (info->rti_ifp == NULL)
846			info->rti_ifp = ifa->ifa_ifp;
847	} else
848		error = ENETUNREACH;
849	return (error);
850}
851
852/*
853 * Expunges references to a route that's about to be reclaimed.
854 * The route must be locked.
855 */
856int
857rtexpunge(struct rtentry *rt)
858{
859#if !defined(RADIX_MPATH)
860	struct radix_node *rn;
861#else
862	struct rt_addrinfo info;
863	int fib;
864	struct rtentry *rt0;
865#endif
866	struct radix_node_head *rnh;
867	struct ifaddr *ifa;
868	int error = 0;
869
870	/*
871	 * Find the correct routing tree to use for this Address Family
872	 */
873	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
874	RT_LOCK_ASSERT(rt);
875	if (rnh == NULL)
876		return (EAFNOSUPPORT);
877	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
878
879#ifdef RADIX_MPATH
880	fib = rt->rt_fibnum;
881	bzero(&info, sizeof(info));
882	info.rti_ifp = rt->rt_ifp;
883	info.rti_flags = RTF_RNH_LOCKED;
884	info.rti_info[RTAX_DST] = rt_key(rt);
885	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
886
887	RT_UNLOCK(rt);
888	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
889
890	if (error == 0 && rt0 != NULL) {
891		rt = rt0;
892		RT_LOCK(rt);
893	} else if (error != 0) {
894		RT_LOCK(rt);
895		return (error);
896	}
897#else
898	/*
899	 * Remove the item from the tree; it should be there,
900	 * but when callers invoke us blindly it may not (sigh).
901	 */
902	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
903	if (rn == NULL) {
904		error = ESRCH;
905		goto bad;
906	}
907	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
908		("unexpected flags 0x%x", rn->rn_flags));
909	KASSERT(rt == RNTORT(rn),
910		("lookup mismatch, rt %p rn %p", rt, rn));
911#endif /* RADIX_MPATH */
912
913	rt->rt_flags &= ~RTF_UP;
914
915	/*
916	 * Give the protocol a chance to keep things in sync.
917	 */
918	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
919		struct rt_addrinfo info;
920
921		bzero((caddr_t)&info, sizeof(info));
922		info.rti_flags = rt->rt_flags;
923		info.rti_info[RTAX_DST] = rt_key(rt);
924		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
925		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
926		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
927	}
928
929	/*
930	 * one more rtentry floating around that is not
931	 * linked to the routing table.
932	 */
933	V_rttrash++;
934#if !defined(RADIX_MPATH)
935bad:
936#endif
937	return (error);
938}
939
#ifdef RADIX_MPATH
/*
 * Multipath-aware part of route deletion.
 *
 * If we got multipath routes, we require users to specify
 * a matching RTAX_GATEWAY.
 *
 * Returns:
 *   0       the matching entry was taken out, marked not-up and either
 *           stored in *ret_nrt (unlocked, referenced) or released;
 *   ESRCH   no entry matches the destination/gateway pair;
 *   ENOENT  the match is the first entry of its chain and was not
 *           removed here; the caller falls back to the normal delete
 *           code.
 *
 * Any request other than RTM_DELETE that gets past the lookup panics.
 */
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
    struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
	struct rtentry *match, *first;
	struct radix_node *node;
	int error;

	error = 0;

	node = rnh->rnh_matchaddr(dst, rnh);
	if (node == NULL)
		return (ESRCH);
	first = RNTORT(node);
	match = rt_mpath_matchgate(first, gateway);
	if (match == NULL)
		return (ESRCH);

	/*
	 * this is the first entry in the chain
	 */
	if (match == first) {
		node = rn_mpath_next((struct radix_node *)match);
		if (node != NULL) {
			/*
			 * there is another entry, now it's active
			 */
			first = RNTORT(node);
			RT_LOCK(first);
			first->rt_flags |= RTF_UP;
			RT_UNLOCK(first);
		} else if ((match->rt_flags & RTF_GATEWAY) != 0) {
			/*
			 * For gateway routes, we need to
			 * make sure that we are deleting
			 * the correct gateway.
			 * rt_mpath_matchgate() does not
			 * check the case when there is only
			 * one route in the chain.
			 */
			if (gateway != NULL &&
			    (match->rt_gateway->sa_len != gateway->sa_len ||
			    memcmp(match->rt_gateway, gateway,
			    gateway->sa_len) != 0)) {
				/*
				 * NOTE(review): for RTM_DELETE this ESRCH is
				 * overwritten by ENOENT below, so a gateway
				 * mismatch is never reported - confirm intent.
				 */
				error = ESRCH;
			} else {
				/*
				 * remove from tree before returning it
				 * to the caller
				 */
				node = rnh->rnh_deladdr(dst, netmask, rnh);
				KASSERT(match == RNTORT(node), ("radix node disappeared"));
				goto gwdelete;
			}
		}
		/*
		 * use the normal delete code to remove
		 * the first entry
		 */
		if (req != RTM_DELETE)
			goto nondelete;

		error = ENOENT;
		goto done;
	}

	/*
	 * if the entry is 2nd and on up
	 */
	if (req == RTM_DELETE && !rt_mpath_deldup(first, match))
		panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
	RT_LOCK(match);
	RT_ADDREF(match);
	if (req == RTM_DELETE) {
		match->rt_flags &= ~RTF_UP;
		/*
		 * One more rtentry floating around that is not
		 * linked to the routing table. rttrash will be decremented
		 * when RTFREE(rt) is eventually called.
		 */
		V_rttrash++;
	}

nondelete:
	if (req != RTM_DELETE)
		panic("unrecognized request %d", req);

	/*
	 * If the caller wants it, then it can have it,
	 * but it's up to it to free the rtentry as we won't be
	 * doing it.
	 */
	if (ret_nrt != NULL) {
		*ret_nrt = match;
		RT_UNLOCK(match);
	} else
		RTFREE_LOCKED(match);
done:
	return (error);
}
#endif
1045
1046int
1047rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
1048				u_int fibnum)
1049{
1050	int error = 0, needlock = 0;
1051	register struct rtentry *rt;
1052#ifdef FLOWTABLE
1053	register struct rtentry *rt0;
1054#endif
1055	register struct radix_node *rn;
1056	register struct radix_node_head *rnh;
1057	struct ifaddr *ifa;
1058	struct sockaddr *ndst;
1059	struct sockaddr_storage mdst;
1060#define senderr(x) { error = x ; goto bad; }
1061
1062	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
1063	switch (dst->sa_family) {
1064	case AF_INET6:
1065	case AF_INET:
1066		/* We support multiple FIBs. */
1067		break;
1068	default:
1069		fibnum = RT_DEFAULT_FIB;
1070		break;
1071	}
1072
1073	/*
1074	 * Find the correct routing tree to use for this Address Family
1075	 */
1076	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1077	if (rnh == NULL)
1078		return (EAFNOSUPPORT);
1079	needlock = ((flags & RTF_RNH_LOCKED) == 0);
1080	flags &= ~RTF_RNH_LOCKED;
1081	if (needlock)
1082		RADIX_NODE_HEAD_LOCK(rnh);
1083	else
1084		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1085	/*
1086	 * If we are adding a host route then we don't want to put
1087	 * a netmask in the tree, nor do we want to clone it.
1088	 */
1089	if (flags & RTF_HOST)
1090		netmask = NULL;
1091
1092	switch (req) {
1093	case RTM_DELETE:
1094		if (netmask) {
1095			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
1096			dst = (struct sockaddr *)&mdst;
1097		}
1098#ifdef RADIX_MPATH
1099		if (rn_mpath_capable(rnh)) {
1100			error = rn_mpath_update(req, info, rnh, ret_nrt);
1101			/*
1102			 * "bad" holds true for the success case
1103			 * as well
1104			 */
1105			if (error != ENOENT)
1106				goto bad;
1107			error = 0;
1108		}
1109#endif
1110		if ((flags & RTF_PINNED) == 0) {
1111			/* Check if target route can be deleted */
1112			rt = (struct rtentry *)rnh->rnh_lookup(dst,
1113			    netmask, rnh);
1114			if ((rt != NULL) && (rt->rt_flags & RTF_PINNED))
1115				senderr(EADDRINUSE);
1116		}
1117
1118		/*
1119		 * Remove the item from the tree and return it.
1120		 * Complain if it is not there and do no more processing.
1121		 */
1122		rn = rnh->rnh_deladdr(dst, netmask, rnh);
1123		if (rn == NULL)
1124			senderr(ESRCH);
1125		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
1126			panic ("rtrequest delete");
1127		rt = RNTORT(rn);
1128		RT_LOCK(rt);
1129		RT_ADDREF(rt);
1130		rt->rt_flags &= ~RTF_UP;
1131
1132		/*
1133		 * give the protocol a chance to keep things in sync.
1134		 */
1135		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
1136			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
1137
1138		/*
1139		 * One more rtentry floating around that is not
1140		 * linked to the routing table. rttrash will be decremented
1141		 * when RTFREE(rt) is eventually called.
1142		 */
1143		V_rttrash++;
1144
1145		/*
1146		 * If the caller wants it, then it can have it,
1147		 * but it's up to it to free the rtentry as we won't be
1148		 * doing it.
1149		 */
1150		if (ret_nrt) {
1151			*ret_nrt = rt;
1152			RT_UNLOCK(rt);
1153		} else
1154			RTFREE_LOCKED(rt);
1155		break;
1156	case RTM_RESOLVE:
1157		/*
1158		 * resolve was only used for route cloning
1159		 * here for compat
1160		 */
1161		break;
1162	case RTM_ADD:
1163		if ((flags & RTF_GATEWAY) && !gateway)
1164			senderr(EINVAL);
1165		if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
1166		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
1167			senderr(EINVAL);
1168
1169		if (info->rti_ifa == NULL) {
1170			error = rt_getifa_fib(info, fibnum);
1171			if (error)
1172				senderr(error);
1173		} else
1174			ifa_ref(info->rti_ifa);
1175		ifa = info->rti_ifa;
1176		rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
1177		if (rt == NULL) {
1178			ifa_free(ifa);
1179			senderr(ENOBUFS);
1180		}
1181		RT_LOCK_INIT(rt);
1182		rt->rt_flags = RTF_UP | flags;
1183		rt->rt_fibnum = fibnum;
1184		/*
1185		 * Add the gateway. Possibly re-malloc-ing the storage for it.
1186		 */
1187		RT_LOCK(rt);
1188		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
1189			RT_LOCK_DESTROY(rt);
1190			ifa_free(ifa);
1191			uma_zfree(V_rtzone, rt);
1192			senderr(error);
1193		}
1194
1195		/*
1196		 * point to the (possibly newly malloc'd) dest address.
1197		 */
1198		ndst = (struct sockaddr *)rt_key(rt);
1199
1200		/*
1201		 * make sure it contains the value we want (masked if needed).
1202		 */
1203		if (netmask) {
1204			rt_maskedcopy(dst, ndst, netmask);
1205		} else
1206			bcopy(dst, ndst, dst->sa_len);
1207
1208		/*
1209		 * We use the ifa reference returned by rt_getifa_fib().
1210		 * This moved from below so that rnh->rnh_addaddr() can
1211		 * examine the ifa and  ifa->ifa_ifp if it so desires.
1212		 */
1213		rt->rt_ifa = ifa;
1214		rt->rt_ifp = ifa->ifa_ifp;
1215		rt->rt_rmx.rmx_weight = 1;
1216
1217#ifdef RADIX_MPATH
1218		/* do not permit exactly the same dst/mask/gw pair */
1219		if (rn_mpath_capable(rnh) &&
1220			rt_mpath_conflict(rnh, rt, netmask)) {
1221			ifa_free(rt->rt_ifa);
1222			Free(rt_key(rt));
1223			RT_LOCK_DESTROY(rt);
1224			uma_zfree(V_rtzone, rt);
1225			senderr(EEXIST);
1226		}
1227#endif
1228
1229#ifdef FLOWTABLE
1230		rt0 = NULL;
1231		/* "flow-table" only supports IPv6 and IPv4 at the moment. */
1232		switch (dst->sa_family) {
1233#ifdef INET6
1234		case AF_INET6:
1235#endif
1236#ifdef INET
1237		case AF_INET:
1238#endif
1239#if defined(INET6) || defined(INET)
1240			rn = rnh->rnh_matchaddr(dst, rnh);
1241			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
1242				struct sockaddr *mask;
1243				u_char *m, *n;
1244				int len;
1245
1246				/*
1247				 * compare mask to see if the new route is
1248				 * more specific than the existing one
1249				 */
1250				rt0 = RNTORT(rn);
1251				RT_LOCK(rt0);
1252				RT_ADDREF(rt0);
1253				RT_UNLOCK(rt0);
1254				/*
1255				 * A host route is already present, so
1256				 * leave the flow-table entries as is.
1257				 */
1258				if (rt0->rt_flags & RTF_HOST) {
1259					RTFREE(rt0);
1260					rt0 = NULL;
1261				} else if (!(flags & RTF_HOST) && netmask) {
1262					mask = rt_mask(rt0);
1263					len = mask->sa_len;
1264					m = (u_char *)mask;
1265					n = (u_char *)netmask;
1266					while (len-- > 0) {
1267						if (*n != *m)
1268							break;
1269						n++;
1270						m++;
1271					}
1272					if (len == 0 || (*n < *m)) {
1273						RTFREE(rt0);
1274						rt0 = NULL;
1275					}
1276				}
1277			}
1278#endif/* INET6 || INET */
1279		}
1280#endif /* FLOWTABLE */
1281
1282		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
1283		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
1284		/*
1285		 * If it still failed to go into the tree,
1286		 * then un-make it (this should be a function)
1287		 */
1288		if (rn == NULL) {
1289			ifa_free(rt->rt_ifa);
1290			Free(rt_key(rt));
1291			RT_LOCK_DESTROY(rt);
1292			uma_zfree(V_rtzone, rt);
1293#ifdef FLOWTABLE
1294			if (rt0 != NULL)
1295				RTFREE(rt0);
1296#endif
1297			senderr(EEXIST);
1298		}
1299#ifdef FLOWTABLE
1300		else if (rt0 != NULL) {
1301			flowtable_route_flush(dst->sa_family, rt0);
1302			RTFREE(rt0);
1303		}
1304#endif
1305
1306		/*
1307		 * If this protocol has something to add to this then
1308		 * allow it to do that as well.
1309		 */
1310		if (ifa->ifa_rtrequest)
1311			ifa->ifa_rtrequest(req, rt, info);
1312
1313		/*
1314		 * actually return a resultant rtentry and
1315		 * give the caller a single reference.
1316		 */
1317		if (ret_nrt) {
1318			*ret_nrt = rt;
1319			RT_ADDREF(rt);
1320		}
1321		RT_UNLOCK(rt);
1322		break;
1323	default:
1324		error = EOPNOTSUPP;
1325	}
1326bad:
1327	if (needlock)
1328		RADIX_NODE_HEAD_UNLOCK(rnh);
1329	return (error);
1330#undef senderr
1331}
1332
1333#undef dst
1334#undef gateway
1335#undef netmask
1336#undef ifaaddr
1337#undef ifpaddr
1338#undef flags
1339
1340int
1341rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
1342{
1343	/* XXX dst may be overwritten, can we move this to below */
1344	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
1345#ifdef INVARIANTS
1346	struct radix_node_head *rnh;
1347
1348	rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
1349#endif
1350
1351	RT_LOCK_ASSERT(rt);
1352	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
1353
1354	/*
1355	 * Prepare to store the gateway in rt->rt_gateway.
1356	 * Both dst and gateway are stored one after the other in the same
1357	 * malloc'd chunk. If we have room, we can reuse the old buffer,
1358	 * rt_gateway already points to the right place.
1359	 * Otherwise, malloc a new block and update the 'dst' address.
1360	 */
1361	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
1362		caddr_t new;
1363
1364		R_Malloc(new, caddr_t, dlen + glen);
1365		if (new == NULL)
1366			return ENOBUFS;
1367		/*
1368		 * XXX note, we copy from *dst and not *rt_key(rt) because
1369		 * rt_setgate() can be called to initialize a newly
1370		 * allocated route entry, in which case rt_key(rt) == NULL
1371		 * (and also rt->rt_gateway == NULL).
1372		 * Free()/free() handle a NULL argument just fine.
1373		 */
1374		bcopy(dst, new, dlen);
1375		Free(rt_key(rt));	/* free old block, if any */
1376		rt_key(rt) = (struct sockaddr *)new;
1377		rt->rt_gateway = (struct sockaddr *)(new + dlen);
1378	}
1379
1380	/*
1381	 * Copy the new gateway value into the memory chunk.
1382	 */
1383	bcopy(gate, rt->rt_gateway, glen);
1384
1385	return (0);
1386}
1387
/*
 * Copy sockaddr 'src' to 'dst', masking it with 'netmask' on the way.
 *
 * The first byte of 'src' and of 'netmask' is read as that sockaddr's
 * length.  The first two bytes (sa_len and sa_family) are copied
 * unmasked.  Bytes from offset 2 up to the shorter of the two lengths
 * are ANDed with the corresponding netmask byte, and whatever remains
 * of 'dst' up to the length of 'src' is zero-filled.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
	const unsigned char *in = (const unsigned char *)src;
	const unsigned char *mask = (const unsigned char *)netmask;
	unsigned char *out = (unsigned char *)dst;
	/* Both lengths are sampled before anything is written to 'dst'. */
	const unsigned int srclen = in[0];
	const unsigned int masklen = mask[0];
	const unsigned int andlen = (masklen > srclen) ? srclen : masklen;
	unsigned int off;

	/* sa_len and sa_family go across untouched. */
	out[0] = in[0];
	out[1] = in[1];

	/* Masked part: only as far as both src and netmask reach. */
	for (off = 2; off < andlen; off++)
		out[off] = in[off] & mask[off];

	/* The netmask ran out first: the rest of the address is zero. */
	if (off < srclen)
		bzero(out + off, srclen - off);
}
1406
1407/*
1408 * Set up a routing table entry, normally
1409 * for an interface.
1410 */
1411#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
1412static inline  int
1413rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
1414{
1415	struct sockaddr *dst;
1416	struct sockaddr *netmask;
1417	struct rtentry *rt = NULL;
1418	struct rt_addrinfo info;
1419	int error = 0;
1420	int startfib, endfib;
1421	char tempbuf[_SOCKADDR_TMPSIZE];
1422	int didwork = 0;
1423	int a_failure = 0;
1424	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
1425	struct radix_node_head *rnh;
1426
1427	if (flags & RTF_HOST) {
1428		dst = ifa->ifa_dstaddr;
1429		netmask = NULL;
1430	} else {
1431		dst = ifa->ifa_addr;
1432		netmask = ifa->ifa_netmask;
1433	}
1434	if (dst->sa_len == 0)
1435		return(EINVAL);
1436	switch (dst->sa_family) {
1437	case AF_INET6:
1438	case AF_INET:
1439		/* We support multiple FIBs. */
1440		break;
1441	default:
1442		fibnum = RT_DEFAULT_FIB;
1443		break;
1444	}
1445	if (fibnum == -1) {
1446		if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
1447			startfib = endfib = curthread->td_proc->p_fibnum;
1448		} else {
1449			startfib = 0;
1450			endfib = rt_numfibs - 1;
1451		}
1452	} else {
1453		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
1454		startfib = fibnum;
1455		endfib = fibnum;
1456	}
1457
1458	/*
1459	 * If it's a delete, check that if it exists,
1460	 * it's on the correct interface or we might scrub
1461	 * a route to another ifa which would
1462	 * be confusing at best and possibly worse.
1463	 */
1464	if (cmd == RTM_DELETE) {
1465		/*
1466		 * It's a delete, so it should already exist..
1467		 * If it's a net, mask off the host bits
1468		 * (Assuming we have a mask)
1469		 * XXX this is kinda inet specific..
1470		 */
1471		if (netmask != NULL) {
1472			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
1473			dst = (struct sockaddr *)tempbuf;
1474		}
1475	}
1476	/*
1477	 * Now go through all the requested tables (fibs) and do the
1478	 * requested action. Realistically, this will either be fib 0
1479	 * for protocols that don't do multiple tables or all the
1480	 * tables for those that do.
1481	 */
1482	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
1483		if (cmd == RTM_DELETE) {
1484			struct radix_node *rn;
1485			/*
1486			 * Look up an rtentry that is in the routing tree and
1487			 * contains the correct info.
1488			 */
1489			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1490			if (rnh == NULL)
1491				/* this table doesn't exist but others might */
1492				continue;
1493			RADIX_NODE_HEAD_RLOCK(rnh);
1494#ifdef RADIX_MPATH
1495			if (rn_mpath_capable(rnh)) {
1496
1497				rn = rnh->rnh_matchaddr(dst, rnh);
1498				if (rn == NULL)
1499					error = ESRCH;
1500				else {
1501					rt = RNTORT(rn);
1502					/*
1503					 * for interface route the
1504					 * rt->rt_gateway is sockaddr_intf
1505					 * for cloning ARP entries, so
1506					 * rt_mpath_matchgate must use the
1507					 * interface address
1508					 */
1509					rt = rt_mpath_matchgate(rt,
1510					    ifa->ifa_addr);
1511					if (!rt)
1512						error = ESRCH;
1513				}
1514			}
1515			else
1516#endif
1517			rn = rnh->rnh_lookup(dst, netmask, rnh);
1518			error = (rn == NULL ||
1519			    (rn->rn_flags & RNF_ROOT) ||
1520			    RNTORT(rn)->rt_ifa != ifa ||
1521			    !sa_equal((struct sockaddr *)rn->rn_key, dst));
1522			RADIX_NODE_HEAD_RUNLOCK(rnh);
1523			if (error) {
1524				/* this is only an error if bad on ALL tables */
1525				continue;
1526			}
1527		}
1528		/*
1529		 * Do the actual request
1530		 */
1531		bzero((caddr_t)&info, sizeof(info));
1532		info.rti_ifa = ifa;
1533		info.rti_flags = flags |
1534		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1535		info.rti_info[RTAX_DST] = dst;
1536		/*
1537		 * doing this for compatibility reasons
1538		 */
1539		if (cmd == RTM_ADD)
1540			info.rti_info[RTAX_GATEWAY] =
1541			    (struct sockaddr *)&null_sdl;
1542		else
1543			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1544		info.rti_info[RTAX_NETMASK] = netmask;
1545		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1546
1547		if ((error == EEXIST) && (cmd == RTM_ADD)) {
1548			/*
1549			 * Interface route addition failed.
1550			 * Atomically delete current prefix generating
1551			 * RTM_DELETE message, and retry adding
1552			 * interface prefix.
1553			 */
1554			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
1555			RADIX_NODE_HEAD_LOCK(rnh);
1556
1557			/* Delete old prefix */
1558			info.rti_ifa = NULL;
1559			info.rti_flags = RTF_RNH_LOCKED;
1560
1561			error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
1562			if (error == 0) {
1563				info.rti_ifa = ifa;
1564				info.rti_flags = flags | RTF_RNH_LOCKED |
1565				    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
1566				error = rtrequest1_fib(cmd, &info, &rt, fibnum);
1567			}
1568
1569			RADIX_NODE_HEAD_UNLOCK(rnh);
1570		}
1571
1572
1573		if (error == 0 && rt != NULL) {
1574			/*
1575			 * notify any listening routing agents of the change
1576			 */
1577			RT_LOCK(rt);
1578#ifdef RADIX_MPATH
1579			/*
1580			 * in case address alias finds the first address
1581			 * e.g. ifconfig bge0 192.0.2.246/24
1582			 * e.g. ifconfig bge0 192.0.2.247/24
1583			 * the address set in the route is 192.0.2.246
1584			 * so we need to replace it with 192.0.2.247
1585			 */
1586			if (memcmp(rt->rt_ifa->ifa_addr,
1587			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
1588				ifa_free(rt->rt_ifa);
1589				ifa_ref(ifa);
1590				rt->rt_ifp = ifa->ifa_ifp;
1591				rt->rt_ifa = ifa;
1592			}
1593#endif
1594			/*
1595			 * doing this for compatibility reasons
1596			 */
1597			if (cmd == RTM_ADD) {
1598			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
1599				rt->rt_ifp->if_type;
1600			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
1601				rt->rt_ifp->if_index;
1602			}
1603			RT_ADDREF(rt);
1604			RT_UNLOCK(rt);
1605			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
1606			RT_LOCK(rt);
1607			RT_REMREF(rt);
1608			if (cmd == RTM_DELETE) {
1609				/*
1610				 * If we are deleting, and we found an entry,
1611				 * then it's been removed from the tree..
1612				 * now throw it away.
1613				 */
1614				RTFREE_LOCKED(rt);
1615			} else {
1616				if (cmd == RTM_ADD) {
1617					/*
1618					 * We just wanted to add it..
1619					 * we don't actually need a reference.
1620					 */
1621					RT_REMREF(rt);
1622				}
1623				RT_UNLOCK(rt);
1624			}
1625			didwork = 1;
1626		}
1627		if (error)
1628			a_failure = error;
1629	}
1630	if (cmd == RTM_DELETE) {
1631		if (didwork) {
1632			error = 0;
1633		} else {
1634			/* we only give an error if it wasn't in any table */
1635			error = ((flags & RTF_HOST) ?
1636			    EHOSTUNREACH : ENETUNREACH);
1637		}
1638	} else {
1639		if (a_failure) {
1640			/* return an error if any of them failed */
1641			error = a_failure;
1642		}
1643	}
1644	return (error);
1645}
1646
1647#ifndef BURN_BRIDGES
/*
 * Special entry point for inet internal use; may not be used elsewhere.
 * Always hands -1 to rtinit1() as the FIB number, leaving the choice of
 * tables to rtinit1().
 */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
	const int unspecified_fib = -1;

	return (rtinit1(ifa, cmd, flags, unspecified_fib));
}
1654#endif
1655
1656/*
1657 * Set up a routing table entry, normally
1658 * for an interface.
1659 */
1660int
1661rtinit(struct ifaddr *ifa, int cmd, int flags)
1662{
1663	struct sockaddr *dst;
1664	int fib = RT_DEFAULT_FIB;
1665
1666	if (flags & RTF_HOST) {
1667		dst = ifa->ifa_dstaddr;
1668	} else {
1669		dst = ifa->ifa_addr;
1670	}
1671
1672	switch (dst->sa_family) {
1673	case AF_INET6:
1674	case AF_INET:
1675		/* We do support multiple FIBs. */
1676		fib = -1;
1677		break;
1678	}
1679	return (rtinit1(ifa, cmd, flags, fib));
1680}
1681