1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1988, 1991, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)rtsock.c	8.5 (Berkeley) 11/2/94
61 */
62
63
64#include <sys/param.h>
65#include <sys/systm.h>
66#include <sys/kernel.h>
67#include <sys/sysctl.h>
68#include <sys/proc.h>
69#include <sys/malloc.h>
70#include <sys/mbuf.h>
71#include <sys/socket.h>
72#include <sys/socketvar.h>
73#include <sys/domain.h>
74#include <sys/protosw.h>
75#include <sys/syslog.h>
76#include <sys/mcache.h>
77#include <kern/lock.h>
78
79#include <net/if.h>
80#include <net/route.h>
81#include <net/dlil.h>
82#include <net/raw_cb.h>
83#include <netinet/in.h>
84#include <netinet/in_var.h>
85#include <netinet/in_arp.h>
86#include <netinet6/nd6.h>
87
88#include <machine/spl.h>
89
90extern struct rtstat rtstat;
91extern int check_routeselfref;
92extern struct domain routedomain;
93
94MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
95
96static struct	sockaddr route_dst = { 2, PF_ROUTE, { 0, } };
97static struct	sockaddr route_src = { 2, PF_ROUTE, { 0, } };
98static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, { 0, } };
99
100struct walkarg {
101	int	w_tmemsize;
102	int	w_op, w_arg;
103	caddr_t	w_tmem;
104	struct sysctl_req *w_req;
105};
106
107static struct mbuf *rt_msg1(int, struct rt_addrinfo *);
108static int	rt_msg2(int, struct rt_addrinfo *, caddr_t, struct walkarg *);
109static int	rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *);
110static int	sysctl_dumpentry(struct radix_node *rn, void *vw);
111static int	sysctl_dumpentry_ext(struct radix_node *rn, void *vw);
112static int	sysctl_iflist(int af, struct walkarg *w);
113static int	sysctl_iflist2(int af, struct walkarg *w);
114static int	route_output(struct mbuf *, struct socket *);
115static void	rt_setmetrics(u_int32_t, struct rt_metrics *, struct rtentry *);
116static void	rt_getmetrics(struct rtentry *, struct rt_metrics *);
117static void	rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *,
118		    struct sockaddr *, unsigned int);
119static void rt_drainall(void);
120
121#ifndef SIN
122#define	SIN(sa)		((struct sockaddr_in *)(size_t)(sa))
123#endif
124
125SYSCTL_NODE(_net, OID_AUTO, idle, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
126    "idle network monitoring");
127
128static struct timeval last_ts;
129
130SYSCTL_NODE(_net_idle, OID_AUTO, route, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
131    "idle route monitoring");
132
133static int rt_if_idle_drain_interval = RT_IF_IDLE_DRAIN_INTERVAL;
134SYSCTL_INT(_net_idle_route, OID_AUTO, drain_interval, CTLFLAG_RW,
135    &rt_if_idle_drain_interval, 0, "Default interval for draining "
136    "routes when doing interface idle reference counting.");
137
/*
 * This macro calculates the wall-clock skew, in case the user changes the
 * system time.  The skew adjustment is required because the kernel now keeps
 * route expiration times in uptime terms, while userland still expects
 * expiration times in terms of calendar time.
 */
144#define CALCULATE_CLOCKSKEW(cc, ic, cu, iu)\
145    ((cc.tv_sec - ic) - (cu - iu))
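
/*
 * For illustration: if the wall clock has advanced 100 seconds since the
 * route's base_calendartime while only 10 seconds of uptime have elapsed
 * since base_uptime, the skew is (100 - 10) = 90 seconds, which callers
 * fold into base_calendartime before converting expiration values between
 * the two time bases.
 */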
146
147/*
148 * It really doesn't make any sense at all for this code to share much
149 * with raw_usrreq.c, since its functionality is so restricted.  XXX
150 */
151static int
152rts_abort(struct socket *so)
153{
154	int error;
155
156	error = raw_usrreqs.pru_abort(so);
157	return error;
158}
159
160/* pru_accept is EOPNOTSUPP */
161
162static int
163rts_attach(struct socket *so, int proto, __unused struct proc *p)
164{
165	struct rawcb *rp;
166	int error;
167
168	if (sotorawcb(so) != 0)
169		return EISCONN;	/* XXX panic? */
170	MALLOC(rp, struct rawcb *, sizeof *rp, M_PCB, M_WAITOK); /* XXX */
171	if (rp == 0)
172		return ENOBUFS;
173	bzero(rp, sizeof *rp);
174
175	/*
176	 * The splnet() is necessary to block protocols from sending
177	 * error notifications (like RTM_REDIRECT or RTM_LOSING) while
178	 * this PCB is extant but incompletely initialized.
179	 * Probably we should try to do more of this work beforehand and
180	 * eliminate the spl.
181	 */
182	so->so_pcb = (caddr_t)rp;
183	error = raw_attach(so, proto);	/* don't use raw_usrreqs.pru_attach, it checks for SS_PRIV */
184	rp = sotorawcb(so);
185	if (error) {
186		FREE(rp, M_PCB);
187		so->so_pcb = NULL;
188		so->so_flags |= SOF_PCBCLEARING;
189		return error;
190	}
191
	switch (rp->rcb_proto.sp_protocol) {
	/* XXX LD: route_cb accounting needs a closer look */
194	case AF_INET:
195		route_cb.ip_count++;
196		break;
197	case AF_INET6:
198		route_cb.ip6_count++;
199		break;
200	case AF_IPX:
201		route_cb.ipx_count++;
202		break;
203	case AF_NS:
204		route_cb.ns_count++;
205		break;
206	}
207	rp->rcb_faddr = &route_src;
208	route_cb.any_count++;
209	/* the socket is already locked when we enter rts_attach */
210	soisconnected(so);
211	so->so_options |= SO_USELOOPBACK;
212	return 0;
213}
214
215static int
216rts_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
217{
218	int error;
219	error = raw_usrreqs.pru_bind(so, nam, p); /* xxx just EINVAL */
220	return error;
221}
222
223static int
224rts_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
225{
226	int error;
227	error = raw_usrreqs.pru_connect(so, nam, p); /* XXX just EINVAL */
228	return error;
229}
230
231/* pru_connect2 is EOPNOTSUPP */
232/* pru_control is EOPNOTSUPP */
233
234static int
235rts_detach(struct socket *so)
236{
237	struct rawcb *rp = sotorawcb(so);
238	int error;
239
240	if (rp != 0) {
		switch (rp->rcb_proto.sp_protocol) {
242		case AF_INET:
243			route_cb.ip_count--;
244			break;
245		case AF_INET6:
246			route_cb.ip6_count--;
247			break;
248		case AF_IPX:
249			route_cb.ipx_count--;
250			break;
251		case AF_NS:
252			route_cb.ns_count--;
253			break;
254		}
255		route_cb.any_count--;
256	}
257	error = raw_usrreqs.pru_detach(so);
258	return error;
259}
260
261static int
262rts_disconnect(struct socket *so)
263{
264	int error;
265	error = raw_usrreqs.pru_disconnect(so);
266	return error;
267}
268
269/* pru_listen is EOPNOTSUPP */
270
271static int
272rts_peeraddr(struct socket *so, struct sockaddr **nam)
273{
274	int error;
275	error = raw_usrreqs.pru_peeraddr(so, nam);
276	return error;
277}
278
279/* pru_rcvd is EOPNOTSUPP */
280/* pru_rcvoob is EOPNOTSUPP */
281
282static int
283rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
284	 struct mbuf *control, struct proc *p)
285{
286	int error;
287	error = raw_usrreqs.pru_send(so, flags, m, nam, control, p);
288	return error;
289}
290
291/* pru_sense is null */
292
293static int
294rts_shutdown(struct socket *so)
295{
296	int  error;
297	error = raw_usrreqs.pru_shutdown(so);
298	return error;
299}
300
301static int
302rts_sockaddr(struct socket *so, struct sockaddr **nam)
303{
304	int error;
305	error = raw_usrreqs.pru_sockaddr(so, nam);
306	return error;
307}
308
309static struct pr_usrreqs route_usrreqs = {
310	rts_abort, pru_accept_notsupp, rts_attach, rts_bind,
311	rts_connect, pru_connect2_notsupp, pru_control_notsupp,
312	rts_detach, rts_disconnect, pru_listen_notsupp, rts_peeraddr,
313	pru_rcvd_notsupp, pru_rcvoob_notsupp, rts_send, pru_sense_null,
314	rts_shutdown, rts_sockaddr, sosend, soreceive, pru_sopoll_notsupp
315};
316
317/*ARGSUSED*/
318static int
319route_output(struct mbuf *m, struct socket *so)
320{
321	struct rt_msghdr *rtm = NULL;
322	struct rtentry *rt = NULL;
323	struct rtentry *saved_nrt = NULL;
324	struct radix_node_head *rnh;
325	struct rt_addrinfo info;
326	int len, error = 0;
327	sa_family_t dst_sa_family = 0;
328	struct ifnet *ifp = NULL;
329#ifndef __APPLE__
330	struct proc  *curproc = current_proc();
331#endif
332	struct sockaddr_in dst_in, gate_in;
333	int sendonlytoself = 0;
334	unsigned int ifscope = IFSCOPE_NONE;
335
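/* Record the error and jump to the common cleanup path at the flush label. */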
336#define senderr(e) { error = (e); goto flush;}
337	if (m == NULL ||
338	    ((m->m_len < sizeof(intptr_t)) && (m = m_pullup(m, sizeof(intptr_t))) == 0))
339		return (ENOBUFS);
340	if ((m->m_flags & M_PKTHDR) == 0)
341		panic("route_output");
342
	/*
	 * Unlock the socket (but keep a reference); it won't be accessed
	 * until raw_input() appends to it.
	 */
344	socket_unlock(so, 0);
345	lck_mtx_lock(rnh_lock);
346
347	len = m->m_pkthdr.len;
348	if (len < sizeof(*rtm) ||
349	    len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
350		info.rti_info[RTAX_DST] = NULL;
351		senderr(EINVAL);
352	}
353	R_Malloc(rtm, struct rt_msghdr *, len);
354	if (rtm == NULL) {
355		info.rti_info[RTAX_DST] = NULL;
356		senderr(ENOBUFS);
357	}
358	m_copydata(m, 0, len, (caddr_t)rtm);
359	if (rtm->rtm_version != RTM_VERSION) {
360		info.rti_info[RTAX_DST] = NULL;
361		senderr(EPROTONOSUPPORT);
362	}
363
	/*
	 * Silent version of RTM_GET for Reachability APIs.  We may change
	 * all RTM_GETs to be silent in the future, so this is private for now.
	 */
368	if (rtm->rtm_type == RTM_GET_SILENT) {
369		if ((so->so_options & SO_USELOOPBACK) == 0)
370			senderr(EINVAL);
371		sendonlytoself = 1;
372		rtm->rtm_type = RTM_GET;
373	}
374
	/*
	 * Perform permission checking; only privileged sockets
	 * may perform operations other than RTM_GET.
	 */
379	if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) {
380		info.rti_info[RTAX_DST] = NULL;
381		senderr(EPERM);
382	}
383
384	rtm->rtm_pid = proc_selfpid();
385	info.rti_addrs = rtm->rtm_addrs;
386	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) {
387		info.rti_info[RTAX_DST] = NULL;
388		senderr(EINVAL);
389	}
390	if (info.rti_info[RTAX_DST] == NULL || (info.rti_info[RTAX_DST]->sa_family >= AF_MAX) ||
391	    (info.rti_info[RTAX_GATEWAY] != NULL && (info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))) {
392		senderr(EINVAL);
393	}
394
395	if (info.rti_info[RTAX_DST]->sa_family == AF_INET && info.rti_info[RTAX_DST]->sa_len != sizeof (dst_in)) {
396		/* At minimum, we need up to sin_addr */
397		if (info.rti_info[RTAX_DST]->sa_len < offsetof(struct sockaddr_in, sin_zero))
398			senderr(EINVAL);
399		bzero(&dst_in, sizeof (dst_in));
400		dst_in.sin_len = sizeof (dst_in);
401		dst_in.sin_family = AF_INET;
402		dst_in.sin_port = SIN(info.rti_info[RTAX_DST])->sin_port;
403		dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr;
404		info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in;
405		dst_sa_family = info.rti_info[RTAX_DST]->sa_family;
406	}
407
408	if (info.rti_info[RTAX_GATEWAY] != NULL &&
409	    info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && info.rti_info[RTAX_GATEWAY]->sa_len != sizeof (gate_in)) {
410		/* At minimum, we need up to sin_addr */
411		if (info.rti_info[RTAX_GATEWAY]->sa_len < offsetof(struct sockaddr_in, sin_zero))
412			senderr(EINVAL);
413		bzero(&gate_in, sizeof (gate_in));
414		gate_in.sin_len = sizeof (gate_in);
415		gate_in.sin_family = AF_INET;
416		gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port;
417		gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr;
418		info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in;
419	}
420
421	if (info.rti_info[RTAX_GENMASK]) {
422		struct radix_node *t;
423		t = rn_addmask((caddr_t)info.rti_info[RTAX_GENMASK], 0, 1);
424		if (t && Bcmp(info.rti_info[RTAX_GENMASK], t->rn_key, *(u_char *)info.rti_info[RTAX_GENMASK]) == 0)
425			info.rti_info[RTAX_GENMASK] = (struct sockaddr *)(t->rn_key);
426		else
427			senderr(ENOBUFS);
428	}
429
430	/*
431	 * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope.
432	 */
433	if (rtm->rtm_flags & RTF_IFSCOPE) {
434		if (info.rti_info[RTAX_DST]->sa_family != AF_INET && info.rti_info[RTAX_DST]->sa_family != AF_INET6)
435			senderr(EINVAL);
436		ifscope = rtm->rtm_index;
437	}
438
439	/*
440	 * RTF_PROXY can only be set internally from within the kernel.
441	 */
442	if (rtm->rtm_flags & RTF_PROXY)
443		senderr(EINVAL);
444
445	/*
446	 * For AF_INET, always zero out the embedded scope ID.  If this is
447	 * a scoped request, it must be done explicitly by setting RTF_IFSCOPE
448	 * flag and the corresponding rtm_index value.  This is to prevent
449	 * false interpretation of the scope ID because it's using the sin_zero
450	 * field, which might not be properly cleared by the requestor.
451	 */
452	if (info.rti_info[RTAX_DST]->sa_family == AF_INET)
453		sin_set_ifscope(info.rti_info[RTAX_DST], IFSCOPE_NONE);
454	if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET)
455		sin_set_ifscope(info.rti_info[RTAX_GATEWAY], IFSCOPE_NONE);
456
457	switch (rtm->rtm_type) {
458
459		case RTM_ADD:
460			if (info.rti_info[RTAX_GATEWAY] == NULL)
461				senderr(EINVAL);
462
463#ifdef __APPLE__
/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954).
 * AOL adds a circular route ("10.0.1.1/32 10.0.1.1") when establishing its PPP tunnel
 * to the AirPort BaseStation, by removing the default gateway and replacing it with its tunnel entry point.
 * There is no apparent reason to add this route, as there is already a valid 10.0.1.1/24 route to the BS.
 * The circular route was ignored on previous versions of Mac OS X because of a routing bug
 * corrected with the merge to FreeBSD 4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED
 * flag set but did not have a reference to the parent route), and that entry was left in the routing table.
 * This workaround exists to provide binary compatibility with AOL: if we catch a process adding a
 * circular route with a /32 from the routing socket, we error it out instead of confusing the routing
 * table with a wrong route to the previous default gateway.
 */
475{
476#define satosinaddr(sa) (((struct sockaddr_in *)(void *)sa)->sin_addr.s_addr)
477
478			if (check_routeselfref && (info.rti_info[RTAX_DST] && info.rti_info[RTAX_DST]->sa_family == AF_INET) &&
479				(info.rti_info[RTAX_NETMASK] && satosinaddr(info.rti_info[RTAX_NETMASK]) == INADDR_BROADCAST) &&
480				(info.rti_info[RTAX_GATEWAY] && satosinaddr(info.rti_info[RTAX_DST]) == satosinaddr(info.rti_info[RTAX_GATEWAY]))) {
481					log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n",
482						(ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>24))&0xff,
483						(ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>16))&0xff,
484						(ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>8))&0xff,
485						(ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])))&0xff);
486
487					senderr(EINVAL);
488			}
489}
490#endif
491			error = rtrequest_scoped_locked(RTM_ADD, info.rti_info[RTAX_DST], info.rti_info[RTAX_GATEWAY],
492			    info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope);
493			if (error == 0 && saved_nrt) {
494				RT_LOCK(saved_nrt);
495#ifdef __APPLE__
496				/*
497				 * If the route request specified an interface with
498				 * IFA and/or IFP, we set the requested interface on
499				 * the route with rt_setif.  It would be much better
500				 * to do this inside rtrequest, but that would
501				 * require passing the desired interface, in some
502				 * form, to rtrequest.  Since rtrequest is called in
503				 * so many places (roughly 40 in our source), adding
				 * a parameter is too much for us to swallow; this is
505				 * something for the FreeBSD developers to tackle.
506				 * Instead, we let rtrequest compute whatever
507				 * interface it wants, then come in behind it and
508				 * stick in the interface that we really want.  This
509				 * works reasonably well except when rtrequest can't
510				 * figure out what interface to use (with
511				 * ifa_withroute) and returns ENETUNREACH.  Ideally
512				 * it shouldn't matter if rtrequest can't figure out
513				 * the interface if we're going to explicitly set it
514				 * ourselves anyway.  But practically we can't
515				 * recover here because rtrequest will not do any of
516				 * the work necessary to add the route if it can't
517				 * find an interface.  As long as there is a default
518				 * route that leads to some interface, rtrequest will
519				 * find an interface, so this problem should be
520				 * rarely encountered.
521				 * dwiggins@bbn.com
522				 */
523
524				rt_setif(saved_nrt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY],
525				    ifscope);
526#endif
527				rt_setmetrics(rtm->rtm_inits,
528				    &rtm->rtm_rmx, saved_nrt);
529				saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
530				saved_nrt->rt_rmx.rmx_locks |=
531				    (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
532				saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK];
533				RT_REMREF_LOCKED(saved_nrt);
534				RT_UNLOCK(saved_nrt);
535			}
536			break;
537
538		case RTM_DELETE:
539			error = rtrequest_scoped_locked(RTM_DELETE, info.rti_info[RTAX_DST],
540			    info.rti_info[RTAX_GATEWAY], info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope);
541			if (error == 0) {
542				rt = saved_nrt;
543				RT_LOCK(rt);
544				goto report;
545			}
546			break;
547
548		case RTM_GET:
549		case RTM_CHANGE:
550		case RTM_LOCK:
551			if ((rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]) == NULL)
552				senderr(EAFNOSUPPORT);
553
554			/*
			 * Look up the best match based on the key-mask pair;
556			 * callee adds a reference and checks for root node.
557			 */
558			rt = rt_lookup(TRUE, info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], rnh, ifscope);
559			if (rt == NULL)
560				senderr(ESRCH);
561			RT_LOCK(rt);
562
563			/*
564			 * Holding rnh_lock here prevents the possibility of
565			 * ifa from changing (e.g. in_ifinit), so it is safe
566			 * to access its ifa_addr (down below) without locking.
567			 */
			switch (rtm->rtm_type) {
569
570				case RTM_GET: {
571					struct ifaddr *ifa2;
572				report:
573					ifa2 = NULL;
574					RT_LOCK_ASSERT_HELD(rt);
575					info.rti_info[RTAX_DST] = rt_key(rt);
576					dst_sa_family = info.rti_info[RTAX_DST]->sa_family;
577					info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
578					info.rti_info[RTAX_NETMASK] = rt_mask(rt);
579					info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
580					if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
581						ifp = rt->rt_ifp;
582						if (ifp) {
583							ifnet_lock_shared(ifp);
584							ifa2 = ifp->if_lladdr;
585							info.rti_info[RTAX_IFP] = ifa2->ifa_addr;
586							IFA_ADDREF(ifa2);
587							ifnet_lock_done(ifp);
588							info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
589							rtm->rtm_index = ifp->if_index;
590						} else {
591							info.rti_info[RTAX_IFP] = NULL;
592							info.rti_info[RTAX_IFA] = NULL;
593						}
594					} else if ((ifp = rt->rt_ifp) != NULL) {
595						rtm->rtm_index = ifp->if_index;
596					}
597					if (ifa2 != NULL)
598						IFA_LOCK(ifa2);
599					len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0,
600						(struct walkarg *)0);
601					if (ifa2 != NULL)
602						IFA_UNLOCK(ifa2);
603					if (len > rtm->rtm_msglen) {
604						struct rt_msghdr *new_rtm;
605						R_Malloc(new_rtm, struct rt_msghdr *, len);
606						if (new_rtm == 0) {
607							RT_UNLOCK(rt);
608							if (ifa2 != NULL)
609								IFA_REMREF(ifa2);
610							senderr(ENOBUFS);
611						}
612						Bcopy(rtm, new_rtm, rtm->rtm_msglen);
613						R_Free(rtm); rtm = new_rtm;
614					}
615					if (ifa2 != NULL)
616						IFA_LOCK(ifa2);
617					(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm,
618						(struct walkarg *)0);
619					if (ifa2 != NULL)
620						IFA_UNLOCK(ifa2);
621					rtm->rtm_flags = rt->rt_flags;
622					rt_getmetrics(rt, &rtm->rtm_rmx);
623					rtm->rtm_addrs = info.rti_addrs;
624					if (ifa2 != NULL)
625						IFA_REMREF(ifa2);
626					}
627					break;
628
629				case RTM_CHANGE:
630					if (info.rti_info[RTAX_GATEWAY] && (error = rt_setgate(rt,
631					    rt_key(rt), info.rti_info[RTAX_GATEWAY]))) {
632						int tmp = error;
633						RT_UNLOCK(rt);
634						senderr(tmp);
635					}
636					/*
637					 * If they tried to change things but didn't specify
638					 * the required gateway, then just use the old one.
639					 * This can happen if the user tries to change the
640					 * flags on the default route without changing the
641					 * default gateway.  Changing flags still doesn't work.
642					 */
643					if ((rt->rt_flags & RTF_GATEWAY) && !info.rti_info[RTAX_GATEWAY])
644						info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
645
646#ifdef __APPLE__
647					/*
648					 * On Darwin, we call rt_setif which contains the
649					 * equivalent to the code found at this very spot
650					 * in BSD.
651					 */
652					rt_setif(rt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY],
653					    ifscope);
654#endif
655
656					rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
657							rt);
658#ifndef __APPLE__
659					/* rt_setif, called above does this for us on darwin */
660					if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
661						rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info.rti_info[RTAX_GATEWAY]);
662#endif
663					if (info.rti_info[RTAX_GENMASK])
664						rt->rt_genmask = info.rti_info[RTAX_GENMASK];
665					/*
					 * Fall through to RTM_LOCK.
667					 */
668				case RTM_LOCK:
669					rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
670					rt->rt_rmx.rmx_locks |=
671						(rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
672					break;
673				}
674			RT_UNLOCK(rt);
675			break;
676
677		default:
678			senderr(EOPNOTSUPP);
679	}
680flush:
681	if (rtm) {
682		if (error)
683			rtm->rtm_errno = error;
684		else
685			rtm->rtm_flags |= RTF_DONE;
686	}
687	if (rt != NULL) {
688		RT_LOCK_ASSERT_NOTHELD(rt);
689		rtfree_locked(rt);
690	}
691	lck_mtx_unlock(rnh_lock);
692	socket_lock(so, 0);	/* relock the socket now */
693    {
694	struct rawcb *rp = 0;
695	/*
696	 * Check to see if we don't want our own messages.
697	 */
698	if ((so->so_options & SO_USELOOPBACK) == 0) {
699		if (route_cb.any_count <= 1) {
700			if (rtm)
701				R_Free(rtm);
702			m_freem(m);
703			return (error);
704		}
705		/* There is another listener, so construct message */
706		rp = sotorawcb(so);
707	}
708	if (rtm) {
709		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
710		if (m->m_pkthdr.len < rtm->rtm_msglen) {
711			m_freem(m);
712			m = NULL;
713		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
714			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
715		R_Free(rtm);
716	}
717	if (sendonlytoself && m) {
718		error = 0;
719		if (sbappendaddr(&so->so_rcv, &route_src, m, (struct mbuf*)0, &error) != 0) {
720			sorwakeup(so);
721		}
722		if (error)
723			return error;
724	} else {
725		struct	sockproto	route_proto = {PF_ROUTE, 0};
726		if (rp)
727			rp->rcb_proto.sp_family = 0; /* Avoid us */
728		if (dst_sa_family != 0)
729			route_proto.sp_protocol = dst_sa_family;
730		if (m) {
731			socket_unlock(so, 0);
732			raw_input(m, &route_proto, &route_src, &route_dst);
733			socket_lock(so, 0);
734		}
735		if (rp)
736			rp->rcb_proto.sp_family = PF_ROUTE;
737		}
738	}
739	return (error);
740}
741
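/*
 * rt_expire is kept in uptime terms, while rmx_expire is exported to
 * userland as a calendar time; adding (base_calendartime - base_uptime)
 * converts the former into the latter.
 */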
742void
743rt_setexpire(struct rtentry *rt, uint64_t expiry)
744{
745	/* set both rt_expire and rmx_expire */
746	rt->rt_expire = expiry;
747	if (expiry) {
748		rt->rt_rmx.rmx_expire = expiry + rt->base_calendartime -
749		    rt->base_uptime;
750	} else
751		rt->rt_rmx.rmx_expire = 0;
752}
753
754static void
755rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rtentry *out)
756{
757	struct timeval curr_calendar_time;
758	uint64_t curr_uptime;
759
760	getmicrotime(&curr_calendar_time);
	curr_uptime = net_uptime();
762
763#define metric(f, e) if (which & (f)) out->rt_rmx.e = in->e;
764	metric(RTV_RPIPE, rmx_recvpipe);
765	metric(RTV_SPIPE, rmx_sendpipe);
766	metric(RTV_SSTHRESH, rmx_ssthresh);
767	metric(RTV_RTT, rmx_rtt);
768	metric(RTV_RTTVAR, rmx_rttvar);
769	metric(RTV_HOPCOUNT, rmx_hopcount);
770	metric(RTV_MTU, rmx_mtu);
771	metric(RTV_EXPIRE, rmx_expire);
772#undef metric
773
774	if (out->rt_rmx.rmx_expire > 0) {
775		/* account for system time change */
776		curr_uptime = net_uptime();
777		getmicrotime(&curr_calendar_time);
778		out->base_calendartime +=
779		    CALCULATE_CLOCKSKEW(curr_calendar_time,
780		    out->base_calendartime,
781		    curr_uptime, out->base_uptime);
782		rt_setexpire(out,
783		    out->rt_rmx.rmx_expire -
784		    out->base_calendartime +
785		    out->base_uptime);
786	} else {
787		rt_setexpire(out, 0);
788	}
789
790	VERIFY(out->rt_expire == 0 || out->rt_rmx.rmx_expire != 0);
791	VERIFY(out->rt_expire != 0 || out->rt_rmx.rmx_expire == 0);
792}
793
794static void
795rt_getmetrics(struct rtentry *in, struct rt_metrics *out)
796{
797	struct timeval curr_calendar_time;
798	uint64_t curr_uptime;
799
800	VERIFY(in->rt_expire == 0 || in->rt_rmx.rmx_expire != 0);
801	VERIFY(in->rt_expire != 0 || in->rt_rmx.rmx_expire == 0);
802
803	*out = in->rt_rmx;
804
805	if (in->rt_expire) {
806		/* account for system time change */
807		getmicrotime(&curr_calendar_time);
808		curr_uptime = net_uptime();
809
810		in->base_calendartime +=
811		    CALCULATE_CLOCKSKEW(curr_calendar_time,
812			in->base_calendartime,
813			curr_uptime, in->base_uptime);
814
815		out->rmx_expire = in->base_calendartime +
816		    in->rt_expire - in->base_uptime;
817	} else
818		out->rmx_expire = 0;
819}
820
821/*
822 * Set route's interface given info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], and gateway.
823 */
824static void
825rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr,
826    struct sockaddr *Gate, unsigned int ifscope)
827{
828	struct ifaddr *ifa = NULL;
829	struct ifnet *ifp = NULL;
830	void (*ifa_rtrequest)
831	    (int, struct rtentry *, struct sockaddr *);
832
833	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
834
835	RT_LOCK_ASSERT_HELD(rt);
836
837	/* trigger route cache reevaluation */
838	if (use_routegenid)
839		routegenid_update();
840
841	/* Don't update a defunct route */
842	if (rt->rt_flags & RTF_CONDEMNED)
843		return;
844
845	/* Add an extra ref for ourselves */
846	RT_ADDREF_LOCKED(rt);
847
848	/* Become a regular mutex, just in case */
849	RT_CONVERT_LOCK(rt);
850
851	/*
852	 * New gateway could require new ifaddr, ifp; flags may also
853	 * be different; ifp may be specified by ll sockaddr when
854	 * protocol address is ambiguous.
855	 */
856	if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) &&
857	    (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) {
858		IFA_REMREF(ifa);
859		ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, ifp);
860	} else {
861		if (ifa) {
862			IFA_REMREF(ifa);
863			ifa = 0;
864		}
865		if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) {
866			if (Gate) {
867				ifa = ifaof_ifpforaddr(Gate, ifp);
868			} else {
869				ifnet_lock_shared(ifp);
870				ifa = TAILQ_FIRST(&ifp->if_addrhead);
871				if (ifa != NULL)
872					IFA_ADDREF(ifa);
873				ifnet_lock_done(ifp);
874			}
875		} else if (Ifaaddr &&
876		    (ifa = ifa_ifwithaddr_scoped(Ifaaddr, ifscope))) {
877			ifp = ifa->ifa_ifp;
878		} else if (Gate != NULL) {
879			/*
880			 * Safe to drop rt_lock and use rt_key, since holding
881			 * rnh_lock here prevents another thread from calling
882			 * rt_setgate() on this route.  We cannot hold the
883			 * lock across ifa_ifwithroute since the lookup done
884			 * by that routine may point to the same route.
885			 */
886			RT_UNLOCK(rt);
887			if ((ifa = ifa_ifwithroute_scoped_locked(rt->rt_flags,
888			    rt_key(rt), Gate, ifscope)) != NULL)
889				ifp = ifa->ifa_ifp;
890			RT_LOCK(rt);
891			/* Don't update a defunct route */
892			if (rt->rt_flags & RTF_CONDEMNED) {
893				if (ifa != NULL)
894					IFA_REMREF(ifa);
895				/* Release extra ref */
896				RT_REMREF_LOCKED(rt);
897				return;
898			}
899		}
900	}
901	if (ifa) {
902		struct ifaddr *oifa = rt->rt_ifa;
903		if (oifa != ifa) {
904			if (oifa != NULL) {
905				IFA_LOCK_SPIN(oifa);
906				ifa_rtrequest = oifa->ifa_rtrequest;
907				IFA_UNLOCK(oifa);
908				if (ifa_rtrequest != NULL)
909					ifa_rtrequest(RTM_DELETE, rt, Gate);
910			}
911			rtsetifa(rt, ifa);
912
913			if (rt->rt_ifp != ifp) {
914				/*
915				 * Purge any link-layer info caching.
916				 */
917				if (rt->rt_llinfo_purge != NULL)
918					rt->rt_llinfo_purge(rt);
919
920				/*
921				 * Adjust route ref count for the interfaces.
922				 */
923				if (rt->rt_if_ref_fn != NULL) {
924					rt->rt_if_ref_fn(ifp, 1);
925					rt->rt_if_ref_fn(rt->rt_ifp, -1);
926				}
927			}
928			rt->rt_ifp = ifp;
929			/*
930			 * If this is the (non-scoped) default route, record
931			 * the interface index used for the primary ifscope.
932			 */
933			if (rt_primary_default(rt, rt_key(rt))) {
934				set_primary_ifscope(rt_key(rt)->sa_family,
935				    rt->rt_ifp->if_index);
936			}
937			rt->rt_rmx.rmx_mtu = ifp->if_mtu;
938			if (rt->rt_ifa != NULL) {
939				IFA_LOCK_SPIN(rt->rt_ifa);
940				ifa_rtrequest = rt->rt_ifa->ifa_rtrequest;
941				IFA_UNLOCK(rt->rt_ifa);
942				if (ifa_rtrequest != NULL)
943					ifa_rtrequest(RTM_ADD, rt, Gate);
944			}
945			IFA_REMREF(ifa);
946			/* Release extra ref */
947			RT_REMREF_LOCKED(rt);
948			return;
949		}
950		IFA_REMREF(ifa);
951	}
952
953	/* XXX: to reset gateway to correct value, at RTM_CHANGE */
954	if (rt->rt_ifa != NULL) {
955		IFA_LOCK_SPIN(rt->rt_ifa);
956		ifa_rtrequest = rt->rt_ifa->ifa_rtrequest;
957		IFA_UNLOCK(rt->rt_ifa);
958		if (ifa_rtrequest != NULL)
959			ifa_rtrequest(RTM_ADD, rt, Gate);
960	}
961
962	/* Release extra ref */
963	RT_REMREF_LOCKED(rt);
964}
965
966#define ROUNDUP32(a) \
967	((a) > 0 ? (1 + (((a) - 1) | (sizeof(uint32_t) - 1))) : sizeof(uint32_t))
968#define ADVANCE32(x, n) (x += ROUNDUP32((n)->sa_len))
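/*
 * For example, ROUNDUP32(1) through ROUNDUP32(4) all yield 4 and
 * ROUNDUP32(7) yields 8, so each sockaddr copied in or out below starts
 * on a 32-bit boundary; ROUNDUP32(0) yields sizeof (uint32_t) as well.
 */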
969
970
971/*
972 * Extract the addresses of the passed sockaddrs.
973 * Do a little sanity checking so as to avoid bad memory references.
974 * This data is derived straight from userland.
975 */
976static int
977rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
978{
979	struct sockaddr *sa;
980	int i;
981
982	bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info));
983	for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) {
984		if ((rtinfo->rti_addrs & (1 << i)) == 0)
985			continue;
986		sa = (struct sockaddr *)cp;
987		/*
988		 * It won't fit.
989		 */
990		if ( (cp + sa->sa_len) > cplim ) {
991			return (EINVAL);
992		}
993
		/*
		 * A zero-length sockaddr means there are no more; quit now.
		 * If more address bits remain set, they are in error.
		 * route(1) can evidently generate such requests, which would
		 * otherwise make the kernel crash.  For compatibility, point
		 * the entry at a safe address instead.
		 */
1001		if (sa->sa_len == 0) {
1002			rtinfo->rti_info[i] = &sa_zero;
1003			return (0); /* should be EINVAL but for compat */
1004		}
1005
1006		/* accept it */
1007		rtinfo->rti_info[i] = sa;
1008		ADVANCE32(cp, sa);
1009	}
1010	return (0);
1011}
1012
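/*
 * rt_msg1() builds a routing message in an mbuf for broadcast to routing
 * socket listeners via raw_input(); rt_msg2() below either computes the
 * required message length (when cp is NULL) or fills a caller-supplied
 * buffer, and serves the RTM_GET reply and sysctl dump paths.
 */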
1013static struct mbuf *
1014rt_msg1(int type, struct rt_addrinfo *rtinfo)
1015{
1016	struct rt_msghdr *rtm;
1017	struct mbuf *m;
1018	int i;
1019	int len, dlen;
1020
1021	switch (type) {
1022
1023	case RTM_DELADDR:
1024	case RTM_NEWADDR:
1025		len = sizeof(struct ifa_msghdr);
1026		break;
1027
1028	case RTM_DELMADDR:
1029	case RTM_NEWMADDR:
1030		len = sizeof(struct ifma_msghdr);
1031		break;
1032
1033	case RTM_IFINFO:
1034		len = sizeof(struct if_msghdr);
1035		break;
1036
1037	default:
1038		len = sizeof(struct rt_msghdr);
1039	}
1040	if (len > MCLBYTES)
1041		panic("rt_msg1");
1042	m = m_gethdr(M_DONTWAIT, MT_DATA);
1043	if (m && len > MHLEN) {
1044		MCLGET(m, M_DONTWAIT);
1045		if ((m->m_flags & M_EXT) == 0) {
1046			m_free(m);
1047			m = NULL;
1048		}
1049	}
1050	if (m == 0)
1051		return (m);
1052	m->m_pkthdr.len = m->m_len = len;
1053	m->m_pkthdr.rcvif = 0;
1054	rtm = mtod(m, struct rt_msghdr *);
1055	bzero((caddr_t)rtm, len);
1056	for (i = 0; i < RTAX_MAX; i++) {
1057		struct sockaddr *sa, *hint;
1058		struct sockaddr_storage ss;
1059
1060		if ((sa = rtinfo->rti_info[i]) == NULL)
1061			continue;
1062
1063		switch (i) {
1064		case RTAX_DST:
1065		case RTAX_NETMASK:
1066			if ((hint = rtinfo->rti_info[RTAX_DST]) == NULL)
1067				hint = rtinfo->rti_info[RTAX_IFA];
1068
1069			/* Scrub away any trace of embedded interface scope */
1070			sa = rtm_scrub_ifscope(type, i, hint, sa, &ss);
1071			break;
1072
1073		default:
1074			break;
1075		}
1076
1077		rtinfo->rti_addrs |= (1 << i);
1078		dlen = ROUNDUP32(sa->sa_len);
1079		m_copyback(m, len, dlen, (caddr_t)sa);
1080		len += dlen;
1081	}
1082	if (m->m_pkthdr.len != len) {
1083		m_freem(m);
1084		return (NULL);
1085	}
1086	rtm->rtm_msglen = len;
1087	rtm->rtm_version = RTM_VERSION;
1088	rtm->rtm_type = type;
1089	return (m);
1090}
1091
1092static int
1093rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w)
1094{
1095	int i;
1096	int len, dlen, second_time = 0;
1097	caddr_t cp0;
1098
1099	rtinfo->rti_addrs = 0;
1100again:
1101	switch (type) {
1102
1103	case RTM_DELADDR:
1104	case RTM_NEWADDR:
1105		len = sizeof(struct ifa_msghdr);
1106		break;
1107
1108	case RTM_DELMADDR:
1109	case RTM_NEWMADDR:
1110		len = sizeof(struct ifma_msghdr);
1111		break;
1112
1113	case RTM_IFINFO:
1114		len = sizeof(struct if_msghdr);
1115		break;
1116
1117	case RTM_IFINFO2:
1118		len = sizeof(struct if_msghdr2);
1119		break;
1120
1121	case RTM_NEWMADDR2:
1122		len = sizeof(struct ifma_msghdr2);
1123		break;
1124
1125	case RTM_GET_EXT:
1126		len = sizeof (struct rt_msghdr_ext);
1127		break;
1128
1129	case RTM_GET2:
1130		len = sizeof(struct rt_msghdr2);
1131		break;
1132
1133	default:
1134		len = sizeof(struct rt_msghdr);
1135	}
1136	cp0 = cp;
1137	if (cp0)
1138		cp += len;
1139	for (i = 0; i < RTAX_MAX; i++) {
1140		struct sockaddr *sa, *hint;
1141		struct sockaddr_storage ss;
1142
1143		if ((sa = rtinfo->rti_info[i]) == 0)
1144			continue;
1145
1146		switch (i) {
1147		case RTAX_DST:
1148		case RTAX_NETMASK:
1149			if ((hint = rtinfo->rti_info[RTAX_DST]) == NULL)
1150				hint = rtinfo->rti_info[RTAX_IFA];
1151
1152			/* Scrub away any trace of embedded interface scope */
1153			sa = rtm_scrub_ifscope(type, i, hint, sa, &ss);
1154			break;
1155
1156		default:
1157			break;
1158		}
1159
1160		rtinfo->rti_addrs |= (1 << i);
1161		dlen = ROUNDUP32(sa->sa_len);
1162		if (cp) {
1163			bcopy((caddr_t)sa, cp, (unsigned)dlen);
1164			cp += dlen;
1165		}
1166		len += dlen;
1167	}
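	/*
	 * The sizing pass produced no output buffer: for a sysctl walk,
	 * (re)allocate the scratch buffer if needed and loop back once
	 * through "again" to actually fill it.
	 */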
1168	if (cp == 0 && w != NULL && !second_time) {
1169		struct walkarg *rw = w;
1170
1171		if (rw->w_req) {
1172			if (rw->w_tmemsize < len) {
1173				if (rw->w_tmem)
1174					FREE(rw->w_tmem, M_RTABLE);
1175				rw->w_tmem = _MALLOC(len, M_RTABLE, M_WAITOK);
1176				if (rw->w_tmem)
1177					rw->w_tmemsize = len;
1178			}
1179			if (rw->w_tmem) {
1180				cp = rw->w_tmem;
1181				second_time = 1;
1182				goto again;
1183			}
1184		}
1185	}
1186	if (cp) {
1187		struct rt_msghdr *rtm = (struct rt_msghdr *)(void *)cp0;
1188
1189		rtm->rtm_version = RTM_VERSION;
1190		rtm->rtm_type = type;
1191		rtm->rtm_msglen = len;
1192	}
1193	return (len);
1194}
1195
1196/*
1197 * This routine is called to generate a message from the routing
1198 * socket indicating that a redirect has occurred, a routing lookup
1199 * has failed, or that a protocol has detected timeouts to a particular
1200 * destination.
1201 */
1202void
1203rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
1204{
1205	struct rt_msghdr *rtm;
1206	struct mbuf *m;
1207	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
1208	struct	sockproto	route_proto = {PF_ROUTE, 0};
1209
1210	if (route_cb.any_count == 0)
1211		return;
1212	m = rt_msg1(type, rtinfo);
1213	if (m == 0)
1214		return;
1215	rtm = mtod(m, struct rt_msghdr *);
1216	rtm->rtm_flags = RTF_DONE | flags;
1217	rtm->rtm_errno = error;
1218	rtm->rtm_addrs = rtinfo->rti_addrs;
1219	route_proto.sp_family = sa ? sa->sa_family : 0;
1220	raw_input(m, &route_proto, &route_src, &route_dst);
1221}
1222
1223/*
1224 * This routine is called to generate a message from the routing
1225 * socket indicating that the status of a network interface has changed.
1226 */
1227void
1228rt_ifmsg(
1229	struct ifnet *ifp)
1230{
1231	struct if_msghdr *ifm;
1232	struct mbuf *m;
1233	struct rt_addrinfo info;
1234	struct	sockproto	route_proto = {PF_ROUTE, 0};
1235
1236	if (route_cb.any_count == 0)
1237		return;
1238	bzero((caddr_t)&info, sizeof(info));
1239	m = rt_msg1(RTM_IFINFO, &info);
1240	if (m == 0)
1241		return;
1242	ifm = mtod(m, struct if_msghdr *);
1243	ifm->ifm_index = ifp->if_index;
1244	ifm->ifm_flags = (u_short)ifp->if_flags;
1245	if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data);
1246	ifm->ifm_addrs = 0;
1247	raw_input(m, &route_proto, &route_src, &route_dst);
1248}
1249
/*
 * This is called to generate messages from the routing socket
 * indicating that a network interface has had addresses associated
 * with it.  If we ever reverse the logic and make messages TO the
 * routing socket indicate a request to configure interfaces, then
 * this will be unnecessary, as the routing socket will automatically
 * generate copies of it.
 *
 * Since this is coming from the interface, it is expected that the
 * interface will be locked.  Caller must hold rnh_lock and rt_lock.
 */
1261void
1262rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
1263{
1264	struct rt_addrinfo info;
1265	struct sockaddr *sa = 0;
1266	int pass;
1267	struct mbuf *m = 0;
1268	struct ifnet *ifp = ifa->ifa_ifp;
1269	struct	sockproto	route_proto = {PF_ROUTE, 0};
1270
1271	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
1272	RT_LOCK_ASSERT_HELD(rt);
1273
1274	if (route_cb.any_count == 0)
1275		return;
1276
1277	/* Become a regular mutex, just in case */
1278	RT_CONVERT_LOCK(rt);
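	/*
	 * Two passes: for RTM_ADD the address message (RTM_NEWADDR) goes out
	 * on the first pass and the route message on the second; RTM_DELETE
	 * reverses that order.
	 */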
1279	for (pass = 1; pass < 3; pass++) {
1280		bzero((caddr_t)&info, sizeof(info));
1281		if ((cmd == RTM_ADD && pass == 1) ||
1282		    (cmd == RTM_DELETE && pass == 2)) {
1283			struct ifa_msghdr *ifam;
1284			int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
1285
1286			/* Lock ifp for if_lladdr */
1287			ifnet_lock_shared(ifp);
1288			IFA_LOCK(ifa);
1289			info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
1290			/*
1291			 * Holding ifnet lock here prevents the link address
1292			 * from changing contents, so no need to hold its
1293			 * lock.  The link address is always present; it's
1294			 * never freed.
1295			 */
1296			info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr;
1297			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1298			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1299			if ((m = rt_msg1(ncmd, &info)) == NULL) {
1300				IFA_UNLOCK(ifa);
1301				ifnet_lock_done(ifp);
1302				continue;
1303			}
1304			IFA_UNLOCK(ifa);
1305			ifnet_lock_done(ifp);
1306			ifam = mtod(m, struct ifa_msghdr *);
1307			ifam->ifam_index = ifp->if_index;
1308			IFA_LOCK_SPIN(ifa);
1309			ifam->ifam_metric = ifa->ifa_metric;
1310			ifam->ifam_flags = ifa->ifa_flags;
1311			IFA_UNLOCK(ifa);
1312			ifam->ifam_addrs = info.rti_addrs;
1313		}
1314		if ((cmd == RTM_ADD && pass == 2) ||
1315		    (cmd == RTM_DELETE && pass == 1)) {
1316			struct rt_msghdr *rtm;
1317
1318			if (rt == 0)
1319				continue;
1320			info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1321			info.rti_info[RTAX_DST] = sa = rt_key(rt);
1322			info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1323			if ((m = rt_msg1(cmd, &info)) == NULL)
1324				continue;
1325			rtm = mtod(m, struct rt_msghdr *);
1326			rtm->rtm_index = ifp->if_index;
1327			rtm->rtm_flags |= rt->rt_flags;
1328			rtm->rtm_errno = error;
1329			rtm->rtm_addrs = info.rti_addrs;
1330		}
1331		route_proto.sp_protocol = sa ? sa->sa_family : 0;
1332		raw_input(m, &route_proto, &route_src, &route_dst);
1333	}
1334}
1335
/*
 * This is the analogue of rt_newaddrmsg, performing the same function
 * for multicast group memberships.  It is simpler, since there is no
 * route state to worry about.
 */
1341void
1342rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
1343{
1344	struct rt_addrinfo info;
1345	struct mbuf *m = 0;
1346	struct ifnet *ifp = ifma->ifma_ifp;
1347	struct ifma_msghdr *ifmam;
1348	struct	sockproto	route_proto = {PF_ROUTE, 0};
1349
1350	if (route_cb.any_count == 0)
1351		return;
1352
1353	/* Lock ifp for if_lladdr */
1354	ifnet_lock_shared(ifp);
1355	bzero((caddr_t)&info, sizeof(info));
1356	IFMA_LOCK(ifma);
1357	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
1358	info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr;	/* lladdr doesn't need lock */
1359
1360	/*
1361	 * If a link-layer address is present, present it as a ``gateway''
1362	 * (similarly to how ARP entries, e.g., are presented).
1363	 */
1364	info.rti_info[RTAX_GATEWAY] = (ifma->ifma_ll != NULL) ? ifma->ifma_ll->ifma_addr : NULL;
1365	if ((m = rt_msg1(cmd, &info)) == NULL) {
1366		IFMA_UNLOCK(ifma);
1367		ifnet_lock_done(ifp);
1368		return;
1369	}
1370	ifmam = mtod(m, struct ifma_msghdr *);
1371	ifmam->ifmam_index = ifp->if_index;
1372	ifmam->ifmam_addrs = info.rti_addrs;
1373	route_proto.sp_protocol = ifma->ifma_addr->sa_family;
1374	IFMA_UNLOCK(ifma);
1375	ifnet_lock_done(ifp);
1376	raw_input(m, &route_proto, &route_src, &route_dst);
1377}
1378
1379/*
1380 * This is used in dumping the kernel table via sysctl().
1381 */
1382int
1383sysctl_dumpentry(struct radix_node *rn, void *vw)
1384{
1385	struct walkarg *w = vw;
1386	struct rtentry *rt = (struct rtentry *)rn;
1387	int error = 0, size;
1388	struct rt_addrinfo info;
1389
1390	RT_LOCK(rt);
1391	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) {
1392		RT_UNLOCK(rt);
1393		return 0;
1394	}
1395	bzero((caddr_t)&info, sizeof(info));
1396	info.rti_info[RTAX_DST] = rt_key(rt);
1397	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1398	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1399	info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
1400
1401	if (w->w_op != NET_RT_DUMP2) {
1402		size = rt_msg2(RTM_GET, &info, 0, w);
1403		if (w->w_req && w->w_tmem) {
1404			struct rt_msghdr *rtm =
1405			    (struct rt_msghdr *)(void *)w->w_tmem;
1406
1407			rtm->rtm_flags = rt->rt_flags;
1408			rtm->rtm_use = rt->rt_use;
1409			rt_getmetrics(rt, &rtm->rtm_rmx);
1410			rtm->rtm_index = rt->rt_ifp->if_index;
1411			rtm->rtm_pid = 0;
1412			rtm->rtm_seq = 0;
1413			rtm->rtm_errno = 0;
1414			rtm->rtm_addrs = info.rti_addrs;
1415			error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
1416			RT_UNLOCK(rt);
1417			return (error);
1418		}
1419	} else {
1420		size = rt_msg2(RTM_GET2, &info, 0, w);
1421		if (w->w_req && w->w_tmem) {
1422			struct rt_msghdr2 *rtm =
1423			    (struct rt_msghdr2 *)(void *)w->w_tmem;
1424
1425			rtm->rtm_flags = rt->rt_flags;
1426			rtm->rtm_use = rt->rt_use;
1427			rt_getmetrics(rt, &rtm->rtm_rmx);
1428			rtm->rtm_index = rt->rt_ifp->if_index;
1429			rtm->rtm_refcnt = rt->rt_refcnt;
1430			if (rt->rt_parent)
1431				rtm->rtm_parentflags = rt->rt_parent->rt_flags;
1432			else
1433				rtm->rtm_parentflags = 0;
1434			rtm->rtm_reserved = 0;
1435			rtm->rtm_addrs = info.rti_addrs;
1436			error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
1437			RT_UNLOCK(rt);
1438			return (error);
1439		}
1440	}
1441	RT_UNLOCK(rt);
1442	return (error);
1443}
1444
1445/*
1446 * This is used for dumping extended information from route entries.
1447 */
1448int
1449sysctl_dumpentry_ext(struct radix_node *rn, void *vw)
1450{
1451	struct walkarg *w = vw;
1452	struct rtentry *rt = (struct rtentry *)rn;
1453	int error = 0, size;
1454	struct rt_addrinfo info;
1455
1456	RT_LOCK(rt);
1457	if (w->w_op == NET_RT_DUMPX_FLAGS && !(rt->rt_flags & w->w_arg)) {
1458		RT_UNLOCK(rt);
1459		return (0);
1460	}
1461	bzero(&info, sizeof (info));
1462	info.rti_info[RTAX_DST] = rt_key(rt);
1463	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1464	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1465	info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
1466
1467	size = rt_msg2(RTM_GET_EXT, &info, 0, w);
1468	if (w->w_req && w->w_tmem) {
1469		struct rt_msghdr_ext *ertm =
1470		    (struct rt_msghdr_ext *)(void *)w->w_tmem;
1471
1472		ertm->rtm_flags = rt->rt_flags;
1473		ertm->rtm_use = rt->rt_use;
1474		rt_getmetrics(rt, &ertm->rtm_rmx);
1475		ertm->rtm_index = rt->rt_ifp->if_index;
1476		ertm->rtm_pid = 0;
1477		ertm->rtm_seq = 0;
1478		ertm->rtm_errno = 0;
1479		ertm->rtm_addrs = info.rti_addrs;
1480		if (rt->rt_llinfo_get_ri == NULL) {
1481			bzero(&ertm->rtm_ri, sizeof (ertm->rtm_ri));
1482			ertm->rtm_ri.ri_rssi = IFNET_RSSI_UNKNOWN;
1483			ertm->rtm_ri.ri_lqm = IFNET_LQM_THRESH_OFF;
1484			ertm->rtm_ri.ri_npm = IFNET_NPM_THRESH_UNKNOWN;
1485		}
1486		else
1487			rt->rt_llinfo_get_ri(rt, &ertm->rtm_ri);
1488
1489		error = SYSCTL_OUT(w->w_req, (caddr_t)ertm, size);
1490		RT_UNLOCK(rt);
1491		return (error);
1492	}
1493	RT_UNLOCK(rt);
1494	return (error);
1495}
1496
/*
 * rdar://9307819
 * To avoid calling copyout() while holding locks, and to avoid causing
 * problems in the paging path, sysctl_iflist() and sysctl_iflist2()
 * construct the list in two passes.  In the first pass we compute the
 * total length of the data we are going to copy out, then we release
 * all locks and allocate a temporary buffer that gets filled in during
 * the second pass.
 *
 * Note that we are verifying the assumption that _MALLOC() returns a
 * buffer that is at least 32-bit aligned and that the messages and
 * addresses are 32-bit aligned.
 */
1510
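/*
 * For reference, a userland caller typically reaches sysctl_iflist() with
 * a MIB such as { CTL_NET, PF_ROUTE, 0, AF_INET, NET_RT_IFLIST, 0 } passed
 * to sysctl(3), calling it once with a NULL buffer to learn the required
 * length and again to fetch the stream of RTM_IFINFO and RTM_NEWADDR
 * records assembled here.
 */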
1511int
1512sysctl_iflist(int af, struct walkarg *w)
1513{
1514	struct ifnet *ifp;
1515	struct ifaddr *ifa;
1516	struct	rt_addrinfo info;
1517	int	len, error = 0;
1518	int	pass = 0;
1519	int	total_len = 0, current_len = 0;
1520	char	*total_buffer = NULL, *cp = NULL;
1521
1522	bzero((caddr_t)&info, sizeof(info));
1523
1524	for (pass = 0; pass < 2; pass++) {
1525		ifnet_head_lock_shared();
1526
1527		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
1528			if (error)
1529				break;
1530			if (w->w_arg && w->w_arg != ifp->if_index)
1531				continue;
1532			ifnet_lock_shared(ifp);
1533			/*
1534			 * Holding ifnet lock here prevents the link address from
1535			 * changing contents, so no need to hold the ifa lock.
1536			 * The link address is always present; it's never freed.
1537			 */
1538			ifa = ifp->if_lladdr;
1539			info.rti_info[RTAX_IFP] = ifa->ifa_addr;
1540			len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, NULL);
1541			if (pass == 0) {
1542				total_len += len;
1543			} else {
1544				struct if_msghdr *ifm;
1545
1546				if (current_len + len > total_len) {
1547					ifnet_lock_done(ifp);
1548					printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n",
1549						current_len, len, total_len);
1550					error = ENOBUFS;
1551					break;
1552				}
1553				info.rti_info[RTAX_IFP] = ifa->ifa_addr;
1554				len = rt_msg2(RTM_IFINFO, &info, (caddr_t)cp, NULL);
1555				info.rti_info[RTAX_IFP] = NULL;
1556
1557				ifm = (struct if_msghdr *)(void *)cp;
1558				ifm->ifm_index = ifp->if_index;
1559				ifm->ifm_flags = (u_short)ifp->if_flags;
1560				if_data_internal_to_if_data(ifp, &ifp->if_data,
1561					&ifm->ifm_data);
1562				ifm->ifm_addrs = info.rti_addrs;
1563
1564				cp += len;
1565				VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1566				current_len += len;
1567			}
1568			while ((ifa = ifa->ifa_link.tqe_next) != 0) {
1569				IFA_LOCK(ifa);
1570				if (af && af != ifa->ifa_addr->sa_family) {
1571					IFA_UNLOCK(ifa);
1572					continue;
1573				}
1574				info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1575				info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1576				info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1577				len = rt_msg2(RTM_NEWADDR, &info, 0, 0);
1578				if (pass == 0) {
1579					total_len += len;
1580				} else {
1581					struct ifa_msghdr *ifam;
1582
1583					if (current_len + len > total_len) {
1584						IFA_UNLOCK(ifa);
1585						printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n",
1586							current_len, len, total_len);
1587						error = ENOBUFS;
1588						break;
1589					}
1590					len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, NULL);
1591
1592					ifam = (struct ifa_msghdr *)(void *)cp;
1593					ifam->ifam_index = ifa->ifa_ifp->if_index;
1594					ifam->ifam_flags = ifa->ifa_flags;
1595					ifam->ifam_metric = ifa->ifa_metric;
1596					ifam->ifam_addrs = info.rti_addrs;
1597
1598					cp += len;
1599					VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1600					current_len += len;
1601				}
1602				IFA_UNLOCK(ifa);
1603			}
1604			ifnet_lock_done(ifp);
1605			info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
1606				info.rti_info[RTAX_BRD] = NULL;
1607		}
1608
1609		ifnet_head_done();
1610
1611		if (error)
1612			break;
1613
1614		if (pass == 0) {
1615			/* Better to return zero length buffer than ENOBUFS */
1616			if (total_len == 0)
1617				total_len = 1;
1618			total_len += total_len >> 3;
1619			total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK);
1620			if (total_buffer == NULL) {
1621				printf("sysctl_iflist: _MALLOC(%d) failed\n", total_len);
1622				error = ENOBUFS;
1623				break;
1624			}
1625			cp = total_buffer;
1626			VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1627		} else {
1628			error = SYSCTL_OUT(w->w_req, total_buffer, current_len);
1629			if (error)
1630				break;
1631		}
1632	}
1633
1634	if (total_buffer != NULL)
1635		_FREE(total_buffer, M_RTABLE);
1636
1637	return error;
1638}
1639
1640int
1641sysctl_iflist2(int af, struct walkarg *w)
1642{
1643	struct ifnet *ifp;
1644	struct ifaddr *ifa;
1645	struct	rt_addrinfo info;
1646	int	len, error = 0;
1647	int	pass = 0;
1648	int	total_len = 0, current_len = 0;
1649	char	*total_buffer = NULL, *cp = NULL;
1650
1651	bzero((caddr_t)&info, sizeof(info));
1652
1653	for (pass = 0; pass < 2; pass++) {
1654		ifnet_head_lock_shared();
1655
1656		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
1657			if (error)
1658				break;
1659			if (w->w_arg && w->w_arg != ifp->if_index)
1660				continue;
1661			ifnet_lock_shared(ifp);
1662			/*
1663			 * Holding ifnet lock here prevents the link address from
1664			 * changing contents, so no need to hold the ifa lock.
1665			 * The link address is always present; it's never freed.
1666			 */
1667			ifa = ifp->if_lladdr;
1668			info.rti_info[RTAX_IFP] = ifa->ifa_addr;
1669			len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, NULL);
1670			if (pass == 0) {
1671				total_len += len;
1672			} else {
1673				struct if_msghdr2 *ifm;
1674
1675				if (current_len + len > total_len) {
1676					ifnet_lock_done(ifp);
1677					printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n",
1678						current_len, len, total_len);
1679					error = ENOBUFS;
1680					break;
1681				}
1682				info.rti_info[RTAX_IFP] = ifa->ifa_addr;
1683				len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)cp, NULL);
1684				info.rti_info[RTAX_IFP] = NULL;
1685
1686				ifm = (struct if_msghdr2 *)(void *)cp;
1687				ifm->ifm_addrs = info.rti_addrs;
1688				ifm->ifm_flags = (u_short)ifp->if_flags;
1689				ifm->ifm_index = ifp->if_index;
1690				ifm->ifm_snd_len = IFCQ_LEN(&ifp->if_snd);
1691				ifm->ifm_snd_maxlen = IFCQ_MAXLEN(&ifp->if_snd);
1692				ifm->ifm_snd_drops =
1693				    ifp->if_snd.ifcq_dropcnt.packets;
1694				ifm->ifm_timer = ifp->if_timer;
1695				if_data_internal_to_if_data64(ifp, &ifp->if_data,
1696					&ifm->ifm_data);
1697
1698				cp += len;
1699				VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1700				current_len += len;
1701			}
1702			while ((ifa = ifa->ifa_link.tqe_next) != 0) {
1703				IFA_LOCK(ifa);
1704				if (af && af != ifa->ifa_addr->sa_family) {
1705					IFA_UNLOCK(ifa);
1706					continue;
1707				}
1708				info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1709				info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1710				info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1711				len = rt_msg2(RTM_NEWADDR, &info, 0, 0);
1712				if (pass == 0) {
1713					total_len += len;
1714				} else {
1715					struct ifa_msghdr *ifam;
1716
1717					if (current_len + len > total_len) {
1718						IFA_UNLOCK(ifa);
1719						printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n",
1720							current_len, len, total_len);
1721						error = ENOBUFS;
1722						break;
1723					}
1724					len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, 0);
1725
1726					ifam = (struct ifa_msghdr *)(void *)cp;
1727					ifam->ifam_index = ifa->ifa_ifp->if_index;
1728					ifam->ifam_flags = ifa->ifa_flags;
1729					ifam->ifam_metric = ifa->ifa_metric;
1730					ifam->ifam_addrs = info.rti_addrs;
1731
1732					cp += len;
1733					VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1734					current_len += len;
1735				}
1736				IFA_UNLOCK(ifa);
1737			}
1738			if (error) {
1739				ifnet_lock_done(ifp);
1740				break;
1741			}
1742			{
1743				struct ifmultiaddr *ifma;
1744
1745				for (ifma = LIST_FIRST(&ifp->if_multiaddrs);
1746					ifma != NULL; ifma = LIST_NEXT(ifma, ifma_link)) {
1747					struct ifaddr *ifa0;
1748
1749					IFMA_LOCK(ifma);
1750					if (af && af != ifma->ifma_addr->sa_family) {
1751						IFMA_UNLOCK(ifma);
1752						continue;
1753					}
1754					bzero((caddr_t)&info, sizeof(info));
1755					info.rti_info[RTAX_IFA] = ifma->ifma_addr;
1756					/*
1757					 * Holding ifnet lock here prevents the link
1758					 * address from changing contents, so no need
1759					 * to hold the ifa0 lock.  The link address is
1760					 * always present; it's never freed.
1761					 */
1762					ifa0 = ifp->if_lladdr;
1763					info.rti_info[RTAX_IFP] = ifa0->ifa_addr;
1764					if (ifma->ifma_ll != NULL)
1765						info.rti_info[RTAX_GATEWAY] = ifma->ifma_ll->ifma_addr;
1766					len = rt_msg2(RTM_NEWMADDR2, &info, 0, 0);
1767					if (pass == 0) {
1768						total_len += len;
1769					} else {
1770						struct ifma_msghdr2 *ifmam;
1771
1772						if (current_len + len > total_len) {
1773							IFMA_UNLOCK(ifma);
1774							printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n",
1775								current_len, len, total_len);
1776							error = ENOBUFS;
1777							break;
1778						}
1779						len = rt_msg2(RTM_NEWMADDR2, &info, (caddr_t)cp, 0);
1780
1781						ifmam = (struct ifma_msghdr2 *)(void *)cp;
1782						ifmam->ifmam_addrs = info.rti_addrs;
1783						ifmam->ifmam_flags = 0;
1784						ifmam->ifmam_index =
1785							ifma->ifma_ifp->if_index;
1786						ifmam->ifmam_refcount =
1787							ifma->ifma_reqcnt;
1788
1789						cp += len;
1790						VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1791						current_len += len;
1792					}
1793					IFMA_UNLOCK(ifma);
1794				}
1795			}
1796			ifnet_lock_done(ifp);
1797			info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
1798				info.rti_info[RTAX_BRD] = NULL;
1799		}
1800		ifnet_head_done();
1801
1802		if (error)
1803			break;
1804
1805		if (pass == 0) {
1806			/* Better to return zero length buffer than ENOBUFS */
1807			if (total_len == 0)
1808				total_len = 1;
1809			total_len += total_len >> 3;
1810			total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK);
1811			if (total_buffer == NULL) {
1812				printf("sysctl_iflist2: _MALLOC(%d) failed\n", total_len);
1813				error = ENOBUFS;
1814				break;
1815			}
1816			cp = total_buffer;
1817			VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
1818		} else {
1819			error = SYSCTL_OUT(w->w_req, total_buffer, current_len);
1820			if (error)
1821				break;
1822		}
1823	}
1824
1825	if (total_buffer != NULL)
1826		_FREE(total_buffer, M_RTABLE);
1827
1828	return error;
1829}
1830
1831
1832static int
1833sysctl_rtstat(struct sysctl_req *req)
1834{
1835	int error;
1836
1837	error = SYSCTL_OUT(req, &rtstat, sizeof(struct rtstat));
1838	if (error)
1839		return (error);
1840
1841	return 0;
1842}
1843
1844static int
1845sysctl_rttrash(struct sysctl_req *req)
1846{
1847	int error;
1848
1849	error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash));
1850	if (error)
1851		return (error);
1852
1853	return 0;
1854}
1855
1856/*
1857 * Called from pfslowtimo(), protected by domain_proto_mtx
1858 */
1859static void
1860rt_drainall(void)
1861{
1862	struct timeval delta_ts, current_ts;
1863
	/*
	 * This test is done without holding rnh_lock; in the event that
	 * we read a stale value, it will only cause an extra (or missed)
	 * drain and is therefore harmless.
	 */
1869	if (ifnet_aggressive_drainers == 0) {
1870		if (timerisset(&last_ts))
1871			timerclear(&last_ts);
1872		return;
1873	}
1874
1875	microuptime(&current_ts);
1876	timersub(&current_ts, &last_ts, &delta_ts);
1877
1878	if (delta_ts.tv_sec >= rt_if_idle_drain_interval) {
1879		timerclear(&last_ts);
1880
1881		in_rtqdrain();		/* protocol cloned routes: INET */
1882		in_arpdrain(NULL);	/* cloned routes: ARP */
1883#if INET6
1884		in6_rtqdrain();		/* protocol cloned routes: INET6 */
1885		nd6_drain(NULL);	/* cloned routes: ND6 */
1886#endif /* INET6 */
1887
1888		last_ts.tv_sec = current_ts.tv_sec;
1889		last_ts.tv_usec = current_ts.tv_usec;
1890	}
1891}
1892
1893void
1894rt_aggdrain(int on)
1895{
1896	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
1897
1898	if (on)
1899		routedomain.dom_protosw->pr_flags |= PR_AGGDRAIN;
1900	else
1901		routedomain.dom_protosw->pr_flags &= ~PR_AGGDRAIN;
1902}
1903
1904static int
1905sysctl_rtsock SYSCTL_HANDLER_ARGS
1906{
1907#pragma unused(oidp)
1908	int	*name = (int *)arg1;
1909	u_int	namelen = arg2;
1910	struct radix_node_head *rnh;
1911	int	i, error = EINVAL;
1912	u_char  af;
1913	struct	walkarg w;
1914
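	/* Skip the protocol element of the MIB; what remains is { af, op, arg }. */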
	name++;
1916	namelen--;
1917	if (req->newptr)
1918		return (EPERM);
1919	if (namelen != 3)
1920		return (EINVAL);
1921	af = name[0];
1922	Bzero(&w, sizeof(w));
1923	w.w_op = name[1];
1924	w.w_arg = name[2];
1925	w.w_req = req;
1926
1927	switch (w.w_op) {
1928
1929	case NET_RT_DUMP:
1930	case NET_RT_DUMP2:
1931	case NET_RT_FLAGS:
1932		lck_mtx_lock(rnh_lock);
1933		for (i = 1; i <= AF_MAX; i++)
1934			if ((rnh = rt_tables[i]) && (af == 0 || af == i) &&
1935			    (error = rnh->rnh_walktree(rnh,
1936			    sysctl_dumpentry, &w)))
1937				break;
1938		lck_mtx_unlock(rnh_lock);
1939		break;
1940	case NET_RT_DUMPX:
1941	case NET_RT_DUMPX_FLAGS:
1942		lck_mtx_lock(rnh_lock);
1943		for (i = 1; i <= AF_MAX; i++)
1944			if ((rnh = rt_tables[i]) && (af == 0 || af == i) &&
1945			    (error = rnh->rnh_walktree(rnh,
1946			    sysctl_dumpentry_ext, &w)))
1947				break;
1948		lck_mtx_unlock(rnh_lock);
1949		break;
1950	case NET_RT_IFLIST:
1951		error = sysctl_iflist(af, &w);
1952		break;
1953	case NET_RT_IFLIST2:
1954		error = sysctl_iflist2(af, &w);
1955		break;
1956	case NET_RT_STAT:
1957		error = sysctl_rtstat(req);
1958		break;
1959	case NET_RT_TRASH:
1960		error = sysctl_rttrash(req);
1961		break;
1962	}
1963	if (w.w_tmem)
1964		FREE(w.w_tmem, M_RTABLE);
1965	return (error);
1966}
1967
1968SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_rtsock, "");
1969
1970/*
1971 * Definitions of protocols supported in the ROUTE domain.
1972 */
1973static struct protosw routesw[] = {
1974{ SOCK_RAW,	&routedomain,	0,		PR_ATOMIC|PR_ADDR,
1975  0,		route_output,	raw_ctlinput,	0,
1976  0,
1977  raw_init,	0,		0,		rt_drainall,
1978  0,
1979  &route_usrreqs,
1980  0,			0,		0,
1981  { 0, 0 }, 	0,	{ 0 }
1982}
1983};
1984
1985struct domain routedomain =
1986    { PF_ROUTE, "route", route_init, 0, 0,
1987      routesw,
1988      NULL, NULL, 0, 0, 0, 0, NULL, 0,
1989      { 0, 0 } };
1990
1991DOMAIN_SET(route);
1992
1993