1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1980, 1986, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	@(#)if.c	8.5 (Berkeley) 1/9/95
32 * $FreeBSD$
33 */
34
35#include "opt_bpf.h"
36#include "opt_inet6.h"
37#include "opt_inet.h"
38
39#include <sys/param.h>
40#include <sys/conf.h>
41#include <sys/eventhandler.h>
42#include <sys/malloc.h>
43#include <sys/domainset.h>
44#include <sys/sbuf.h>
45#include <sys/bus.h>
46#include <sys/epoch.h>
47#include <sys/mbuf.h>
48#include <sys/systm.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/protosw.h>
54#include <sys/kernel.h>
55#include <sys/lock.h>
56#include <sys/refcount.h>
57#include <sys/module.h>
58#include <sys/rwlock.h>
59#include <sys/sockio.h>
60#include <sys/syslog.h>
61#include <sys/sysctl.h>
62#include <sys/sysent.h>
63#include <sys/taskqueue.h>
64#include <sys/domain.h>
65#include <sys/jail.h>
66#include <sys/priv.h>
67
68#include <machine/stdarg.h>
69#include <vm/uma.h>
70
71#include <net/bpf.h>
72#include <net/ethernet.h>
73#include <net/if.h>
74#include <net/if_arp.h>
75#include <net/if_clone.h>
76#include <net/if_dl.h>
77#include <net/if_types.h>
78#include <net/if_var.h>
79#include <net/if_media.h>
80#include <net/if_vlan_var.h>
81#include <net/radix.h>
82#include <net/route.h>
83#include <net/route/route_ctl.h>
84#include <net/vnet.h>
85
86#if defined(INET) || defined(INET6)
87#include <net/ethernet.h>
88#include <netinet/in.h>
89#include <netinet/in_var.h>
90#include <netinet/ip.h>
91#include <netinet/ip_carp.h>
92#ifdef INET
93#include <net/debugnet.h>
94#include <netinet/if_ether.h>
95#endif /* INET */
96#ifdef INET6
97#include <netinet6/in6_var.h>
98#include <netinet6/in6_ifattach.h>
99#endif /* INET6 */
100#endif /* INET || INET6 */
101
102#include <security/mac/mac_framework.h>
103
/*
 * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name
 * and ifr_ifru when it is used in SIOCGIFCONF.  Enforce that at compile
 * time: ifr_ifru must start exactly sizeof(ifr_name) bytes into the
 * structure.
 */
_Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
    offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");

/*
 * Network epoch handle.  It is allocated by if_epochalloc() and passed to
 * epoch_wait_preempt() before old ifindex tables are freed.
 */
__read_mostly epoch_t net_epoch_preempt;
112#ifdef COMPAT_FREEBSD32
113#include <sys/mount.h>
114#include <compat/freebsd32/freebsd32.h>
115
/*
 * Fixed-width form of the ifru_buffer member of struct ifreq32: the length
 * and the buffer address are both carried as 32-bit integers.
 */
struct ifreq_buffer32 {
	uint32_t	length;		/* (size_t) */
	uint32_t	buffer;		/* (void *) */
};
120
/*
 * Interface request structure used for socket
 * ioctl's.  All interface ioctl's must have parameter
 * definitions which begin with ifr_name.  The
 * remainder may be interface specific.
 *
 * This is the COMPAT_FREEBSD32 variant: ifru_buffer and ifru_data use
 * 32-bit fields.  The CTASSERTs that follow require the structure to have
 * the same total size and the same ifr_ifru offset as struct ifreq.
 */
struct ifreq32 {
	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
	union {
		struct sockaddr	ifru_addr;
		struct sockaddr	ifru_dstaddr;
		struct sockaddr	ifru_broadaddr;
		struct ifreq_buffer32 ifru_buffer;
		short		ifru_flags[2];
		short		ifru_index;
		int		ifru_jid;
		int		ifru_metric;
		int		ifru_mtu;
		int		ifru_phys;
		int		ifru_media;
		uint32_t	ifru_data;
		int		ifru_cap[2];
		u_int		ifru_fib;
		u_char		ifru_vlan_pcp;
	} ifr_ifru;
};
CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32));
CTASSERT(__offsetof(struct ifreq, ifr_ifru) ==
    __offsetof(struct ifreq32, ifr_ifru));
150
/*
 * COMPAT_FREEBSD32 layout for interface group requests.  ifgru_groups is a
 * 32-bit integer here (presumably a user pointer in the native struct
 * ifgroupreq -- confirm against <net/if.h>).
 */
struct ifgroupreq32 {
	char	ifgr_name[IFNAMSIZ];
	u_int	ifgr_len;
	union {
		char		ifgru_group[IFNAMSIZ];
		uint32_t	ifgru_groups;
	} ifgr_ifgru;
};
159
/*
 * COMPAT_FREEBSD32 layout for media requests.  ifm_ulist carries what
 * would be an int pointer as a 32-bit integer.
 */
struct ifmediareq32 {
	char		ifm_name[IFNAMSIZ];
	int		ifm_current;
	int		ifm_mask;
	int		ifm_status;
	int		ifm_active;
	int		ifm_count;
	uint32_t	ifm_ulist;	/* (int *) */
};
/* Media ioctl command values re-typed for the 32-bit request layout. */
#define	SIOCGIFMEDIA32	_IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32)
#define	SIOCGIFXMEDIA32	_IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32)

/*
 * With COMPAT_FREEBSD32 this expands to the 32-bit command value followed
 * by ": case", so that it can be spliced in front of the native command
 * inside a case label.  Without COMPAT_FREEBSD32 it expands to nothing.
 */
#define	_CASE_IOC_IFGROUPREQ_32(cmd)				\
    _IOC_NEWTYPE((cmd), struct ifgroupreq32): case
#else /* !COMPAT_FREEBSD32 */
#define _CASE_IOC_IFGROUPREQ_32(cmd)
#endif /* !COMPAT_FREEBSD32 */

/*
 * Intended for use as "case CASE_IOC_IFGROUPREQ(cmd):".  It matches the
 * native command and, when compat support is compiled in, its 32-bit twin.
 */
#define CASE_IOC_IFGROUPREQ(cmd)	\
    _CASE_IOC_IFGROUPREQ_32(cmd)	\
    (cmd)
181
/*
 * Overlay of the native and (when COMPAT_FREEBSD32 is defined) 32-bit
 * interface request layouts.
 */
union ifreq_union {
	struct ifreq	ifr;
#ifdef COMPAT_FREEBSD32
	struct ifreq32	ifr32;
#endif
};
188
/*
 * Overlay of the native and (when COMPAT_FREEBSD32 is defined) 32-bit
 * interface group request layouts.
 */
union ifgroupreq_union {
	struct ifgroupreq ifgr;
#ifdef COMPAT_FREEBSD32
	struct ifgroupreq32 ifgr32;
#endif
};
195
/* sysctl tree nodes: net.link and net.link.generic. */
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Generic link-management");

/*
 * net.link.ifqmaxlen: the queue length ifq_init() applies when a queue's
 * ifq_maxlen is zero.
 */
SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
    &ifqmaxlen, 0, "max send queue size");

/* Log link state change events */
static int log_link_state_change = 1;

SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
	&log_link_state_change, 0,
	"log interface link state change events");

/* Log promiscuous mode change events */
static int log_promisc_mode_change = 1;

SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN,
	&log_promisc_mode_change, 1,
	"log promiscuous mode change events");

/* Interface description */
static unsigned int ifdescr_maxlen = 1024;
SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
	&ifdescr_maxlen, 0,
	"administrative maximum length for interface description");

/* malloc type for if_description; released in if_free_internal(). */
static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");

/* global sx for non-critical path ifdescr */
static struct sx ifdescr_sx;
SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
229
/*
 * Function-pointer hooks for optional subsystems.  None of them is
 * assigned in this part of the file; presumably each is filled in by the
 * module that owns it (netgraph ether, lagg, CARP, ALTQ) -- confirm
 * against those modules.
 */
void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
/* These are external hooks for CARP. */
void	(*carp_linkstate_p)(struct ifnet *ifp);
void	(*carp_demote_adj_p)(int, char *);
int	(*carp_master_p)(struct ifaddr *);
#if defined(INET) || defined(INET6)
int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
    const struct sockaddr *sa);
int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);
int	(*carp_attach_p)(struct ifaddr *, int);
void	(*carp_detach_p)(struct ifaddr *, bool);
#endif
#ifdef INET
int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
#endif
#ifdef INET6
struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
    const struct in6_addr *taddr);
#endif

/* Token-bucket dequeue hook; explicitly NULL until something installs it. */
struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
254
/*
 * Forward declarations for functions private to this file.
 *
 * XXX: Style; these should be sorted alphabetically, and unprototyped
 * static functions should be prototyped. Currently they are sorted by
 * declaration order.
 */
static void	if_attachdomain(void *);
static void	if_attachdomain1(struct ifnet *);
static int	ifconf(u_long, caddr_t);
static void	*if_grow(void);
static void	if_input_default(struct ifnet *, struct mbuf *);
static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
static void	if_route(struct ifnet *, int flag, int fam);
static int	if_setflag(struct ifnet *, int, int, int *, int);
static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
static void	if_unroute(struct ifnet *, int flag, int fam);
static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void	do_link_state_change(void *, int);
static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
static int	if_getgroupmembers(struct ifgroupreq *);
static void	if_delgroups(struct ifnet *);
static void	if_attach_internal(struct ifnet *, int, struct if_clone *);
static int	if_detach_internal(struct ifnet *, int, struct if_clone **);
static void	if_siocaddmulti(void *, int);
static void	if_link_ifnet(struct ifnet *);
static bool	if_unlink_ifnet(struct ifnet *, bool);
#ifdef VIMAGE
/* Moving an interface between vnets only exists with VIMAGE. */
static int	if_vmove(struct ifnet *, struct vnet *);
#endif

#ifdef INET6
/*
 * XXX: declared here to avoid including many inet6-related files;
 * should this be generalized?
 */
extern void	nd6_setmtu(struct ifnet *);
#endif
291
/* ipsec helper hooks */
VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);

/*
 * Highest interface index currently in use in this vnet; raised by
 * ifindex_alloc() and lowered by ifindex_free_locked().
 */
VNET_DEFINE(int, if_index);
/* Global default send queue length; see ifq_init(). */
int	ifqmaxlen = IFQ_MAXLEN;
VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);

/*
 * Index limit for ifindex_table; if_grow() doubles it each time it
 * installs a new, larger table.
 */
VNET_DEFINE_STATIC(int, if_indexlim) = 8;

/* Table of ifnet by index. */
VNET_DEFINE(struct ifnet **, ifindex_table);

#define	V_if_indexlim		VNET(if_indexlim)
#define	V_ifindex_table		VNET(ifindex_table)
308
/*
 * The global network interface list (V_ifnet) and related state (such as
 * if_index, if_indexlim, and ifindex_table) are protected by an sxlock.
 * This may be acquired to stabilise the list, or we may rely on NET_EPOCH.
 * The lock is initialized with SX_RECURSE.
 */
struct sx ifnet_sxlock;
SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);

/*
 * Held exclusively by if_detach() around if_detach_internal().  Also
 * initialized with SX_RECURSE.
 */
struct sx ifnet_detach_sxlock;
SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx",
    SX_RECURSE);
320
/*
 * The allocation of network interfaces is a rather non-atomic affair; we
 * need to select an index before we are ready to expose the interface for
 * use, so will use this pointer value to indicate reservation.
 * ifnet_byindex() reports a slot holding this value as NULL.
 */
#define	IFNET_HOLD	(void *)(uintptr_t)(-1)

#ifdef VIMAGE
/*
 * True while the vnet has vnet_shutdown set and its vnet_state is still
 * below SI_SUB_VNET_DONE.
 */
#define	VNET_IS_SHUTTING_DOWN(_vnet)					\
    ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE)
#endif

/*
 * Per-interface-type layer 2 common structure allocator and destructor,
 * indexed by the u_char interface type.  Consulted by if_alloc_domain()
 * and if_free_internal().
 */
static	if_com_alloc_t *if_com_alloc[256];
static	if_com_free_t *if_com_free[256];

/* malloc types used for ifnets, interface addresses and multicast addresses. */
static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
339
340struct ifnet *
341ifnet_byindex(u_short idx)
342{
343	struct ifnet *ifp;
344
345	if (__predict_false(idx > V_if_index))
346		return (NULL);
347
348	ifp = *(struct ifnet * const volatile *)(V_ifindex_table + idx);
349	return (__predict_false(ifp == IFNET_HOLD) ? NULL : ifp);
350}
351
352struct ifnet *
353ifnet_byindex_ref(u_short idx)
354{
355	struct ifnet *ifp;
356
357	NET_EPOCH_ASSERT();
358
359	ifp = ifnet_byindex(idx);
360	if (ifp == NULL || (ifp->if_flags & IFF_DYING))
361		return (NULL);
362	if_ref(ifp);
363	return (ifp);
364}
365
366/*
367 * Allocate an ifindex array entry; return 0 on success or an error on
368 * failure.
369 */
370static u_short
371ifindex_alloc(void **old)
372{
373	u_short idx;
374
375	IFNET_WLOCK_ASSERT();
376	/*
377	 * Try to find an empty slot below V_if_index.  If we fail, take the
378	 * next slot.
379	 */
380	for (idx = 1; idx <= V_if_index; idx++) {
381		if (V_ifindex_table[idx] == NULL)
382			break;
383	}
384
385	/* Catch if_index overflow. */
386	if (idx >= V_if_indexlim) {
387		*old = if_grow();
388		return (USHRT_MAX);
389	}
390	if (idx > V_if_index)
391		V_if_index = idx;
392	return (idx);
393}
394
395static void
396ifindex_free_locked(u_short idx)
397{
398
399	IFNET_WLOCK_ASSERT();
400
401	V_ifindex_table[idx] = NULL;
402	while (V_if_index > 0 &&
403	    V_ifindex_table[V_if_index] == NULL)
404		V_if_index--;
405}
406
/*
 * Release slot idx in the ifindex table, taking and dropping the ifnet
 * write lock around ifindex_free_locked().
 */
static void
ifindex_free(u_short idx)
{

	IFNET_WLOCK();
	ifindex_free_locked(idx);
	IFNET_WUNLOCK();
}
415
/*
 * Store ifp (which may be the IFNET_HOLD reservation marker) in slot idx of
 * the ifindex table.  No locking or bounds checking is done here.
 */
static void
ifnet_setbyindex(u_short idx, struct ifnet *ifp)
{

	V_ifindex_table[idx] = ifp;
}
422
423struct ifaddr *
424ifaddr_byindex(u_short idx)
425{
426	struct ifnet *ifp;
427	struct ifaddr *ifa = NULL;
428
429	NET_EPOCH_ASSERT();
430
431	ifp = ifnet_byindex(idx);
432	if (ifp != NULL && (ifa = ifp->if_addr) != NULL)
433		ifa_ref(ifa);
434	return (ifa);
435}
436
437/*
438 * Network interface utility routines.
439 *
440 * Routines with ifa_ifwith* names take sockaddr *'s as
441 * parameters.
442 */
443
/*
 * Per-vnet initialization: set up the empty interface and interface-group
 * lists, create the first ifindex table and initialize interface cloning.
 */
static void
vnet_if_init(const void *unused __unused)
{
	void *old;

	CK_STAILQ_INIT(&V_ifnet);
	CK_STAILQ_INIT(&V_ifg_head);
	IFNET_WLOCK();
	old = if_grow();				/* create initial table */
	IFNET_WUNLOCK();
	/*
	 * Same disposal sequence as in if_alloc_domain(): wait for the
	 * epoch before freeing whatever if_grow() handed back.
	 */
	epoch_wait_preempt(net_epoch_preempt);
	free(old, M_IFNET);
	vnet_if_clone_init();
}
VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
    NULL);
460
461#ifdef VIMAGE
/*
 * Per-vnet teardown: assert that the interface and interface-group lists
 * are empty, then free the ifindex table.
 */
static void
vnet_if_uninit(const void *unused __unused)
{

	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
	    "not empty", __func__, __LINE__, &V_ifnet));
	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
	    "not empty", __func__, __LINE__, &V_ifg_head));

	free((caddr_t)V_ifindex_table, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
    vnet_if_uninit, NULL);
475#endif
476
/*
 * Append ifp to the current vnet's interface list under the ifnet write
 * lock and, with VIMAGE, bump the vnet's interface count.
 */
static void
if_link_ifnet(struct ifnet *ifp)
{

	IFNET_WLOCK();
	CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
	curvnet->vnet_ifcnt++;
#endif
	IFNET_WUNLOCK();
}
488
489static bool
490if_unlink_ifnet(struct ifnet *ifp, bool vmove)
491{
492	struct ifnet *iter;
493	int found = 0;
494
495	IFNET_WLOCK();
496	CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
497		if (iter == ifp) {
498			CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link);
499			if (!vmove)
500				ifp->if_flags |= IFF_DYING;
501			found = 1;
502			break;
503		}
504#ifdef VIMAGE
505	curvnet->vnet_ifcnt--;
506#endif
507	IFNET_WUNLOCK();
508
509	return (found);
510}
511
512#ifdef VIMAGE
/*
 * vnet teardown hook: hand every interface whose home vnet differs from
 * the vnet it currently lives in back to its home vnet.
 *
 * Matching interfaces are unlinked from V_ifnet and collected while the
 * ifnet write lock is held.  They are moved with if_vmove() only after
 * the lock has been dropped (see the locking note in the body).
 */
static void
vnet_if_return(const void *unused __unused)
{
	struct ifnet *ifp, *nifp;
	struct ifnet **pending;
	int found, i;

	i = 0;

	/*
	 * We need to protect our access to the V_ifnet tailq. Ordinarily we'd
	 * enter NET_EPOCH, but that's not possible, because if_vmove() calls
	 * if_detach_internal(), which waits for NET_EPOCH callbacks to
	 * complete. We can't do that from within NET_EPOCH.
	 *
	 * However, we can also use the IFNET_xLOCK, which is the V_ifnet
	 * read/write lock. We cannot hold the lock as we call if_vmove()
	 * though, as that presents LOR w.r.t ifnet_sx, in_multi_sx and iflib
	 * ctx lock.
	 */
	IFNET_WLOCK();

	/* One slot per interface currently counted in this vnet. */
	pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt,
	    M_IFNET, M_WAITOK | M_ZERO);

	/* Return all inherited interfaces to their parent vnets. */
	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
		if (ifp->if_home_vnet != ifp->if_vnet) {
			/* vmove == true: do not mark the interface dying. */
			found = if_unlink_ifnet(ifp, true);
			MPASS(found);

			pending[i++] = ifp;
		}
	}
	IFNET_WUNLOCK();

	/* Now that the lock is dropped, perform the actual moves. */
	for (int j = 0; j < i; j++) {
		if_vmove(pending[j], pending[j]->if_home_vnet);
	}

	free(pending, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
    vnet_if_return, NULL);
557#endif
558
/*
 * Double the size of the per-vnet ifindex table.
 *
 * Called with the ifnet write lock held.  The lock is dropped around the
 * M_WAITOK allocation and is held again on return.  If V_if_indexlim
 * changed while the lock was dropped, the new allocation is discarded and
 * NULL is returned.  Otherwise the existing entries are copied into the
 * new table, the new table is installed, and the previous table pointer
 * (NULL if there was none) is returned.  Callers in this file free the
 * returned pointer only after epoch_wait_preempt().
 */
static void *
if_grow(void)
{
	int oldlim;
	u_int n;
	struct ifnet **e;
	void *old;

	old = NULL;
	IFNET_WLOCK_ASSERT();
	oldlim = V_if_indexlim;
	IFNET_WUNLOCK();
	/* n is the size in bytes of a table with twice the old limit. */
	n = (oldlim << 1) * sizeof(*e);
	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
	IFNET_WLOCK();
	/* The limit moved while we were unlocked: drop our copy. */
	if (V_if_indexlim != oldlim) {
		free(e, M_IFNET);
		return (NULL);
	}
	if (V_ifindex_table != NULL) {
		/* n/2 is the byte size of the old table. */
		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
		old = V_ifindex_table;
	}
	V_if_indexlim <<= 1;
	V_ifindex_table = e;
	return (old);
}
586
/*
 * Allocate a struct ifnet and an index for an interface.  A layer 2
 * common structure will also be allocated if an allocation routine is
 * registered for the passed type.
 *
 * numa_domain selects the preferred memory domain for the ifnet itself;
 * IF_NODOM requests a plain allocation.  Returns the new ifnet holding
 * one reference, or NULL if the registered layer 2 allocator fails.
 */
struct ifnet *
if_alloc_domain(u_char type, int numa_domain)
{
	struct ifnet *ifp;
	u_short idx;
	void *old;

	KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
	if (numa_domain == IF_NODOM)
		ifp = malloc(sizeof(struct ifnet), M_IFNET,
		    M_WAITOK | M_ZERO);
	else
		ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET,
		    DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO);
 restart:
	IFNET_WLOCK();
	idx = ifindex_alloc(&old);
	if (__predict_false(idx == USHRT_MAX)) {
		/*
		 * The index table was grown.  Wait for the epoch before
		 * freeing what ifindex_alloc() handed back, then retry.
		 */
		IFNET_WUNLOCK();
		epoch_wait_preempt(net_epoch_preempt);
		free(old, M_IFNET);
		goto restart;
	}
	/* Reserve the slot; lookups report IFNET_HOLD as "not present". */
	ifnet_setbyindex(idx, IFNET_HOLD);
	IFNET_WUNLOCK();
	ifp->if_index = idx;
	ifp->if_type = type;
	ifp->if_alloctype = type;
	ifp->if_numa_domain = numa_domain;
#ifdef VIMAGE
	ifp->if_vnet = curvnet;
#endif
	if (if_com_alloc[type] != NULL) {
		ifp->if_l2com = if_com_alloc[type](type, ifp);
		if (ifp->if_l2com == NULL) {
			/* Undo the ifnet allocation and the reservation. */
			free(ifp, M_IFNET);
			ifindex_free(idx);
			return (NULL);
		}
	}

	IF_ADDR_LOCK_INIT(ifp);
	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
	TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp);
	ifp->if_afdata_initialized = 0;
	IF_AFDATA_LOCK_INIT(ifp);
	CK_STAILQ_INIT(&ifp->if_addrhead);
	CK_STAILQ_INIT(&ifp->if_multiaddrs);
	CK_STAILQ_INIT(&ifp->if_groups);
#ifdef MAC
	mac_ifnet_init(ifp);
#endif
	ifq_init(&ifp->if_snd, ifp);

	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
	for (int i = 0; i < IFCOUNTERS; i++)
		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
	ifp->if_get_counter = if_get_counter_default;
	ifp->if_pcp = IFNET_PCP_NONE;
	/* Replace the reservation marker with the real ifnet pointer. */
	ifnet_setbyindex(ifp->if_index, ifp);
	return (ifp);
}
654
655struct ifnet *
656if_alloc_dev(u_char type, device_t dev)
657{
658	int numa_domain;
659
660	if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0)
661		return (if_alloc_domain(type, IF_NODOM));
662	return (if_alloc_domain(type, numa_domain));
663}
664
/*
 * Allocate an ifnet of the given type with no NUMA domain preference.
 */
struct ifnet *
if_alloc(u_char type)
{

	return (if_alloc_domain(type, IF_NODOM));
}
671/*
672 * Do the actual work of freeing a struct ifnet, and layer 2 common
673 * structure.  This call is made when the last reference to an
674 * interface is released.
675 */
676static void
677if_free_internal(struct ifnet *ifp)
678{
679
680	KASSERT((ifp->if_flags & IFF_DYING),
681	    ("if_free_internal: interface not dying"));
682
683	if (if_com_free[ifp->if_alloctype] != NULL)
684		if_com_free[ifp->if_alloctype](ifp->if_l2com,
685		    ifp->if_alloctype);
686
687#ifdef MAC
688	mac_ifnet_destroy(ifp);
689#endif /* MAC */
690	IF_AFDATA_DESTROY(ifp);
691	IF_ADDR_LOCK_DESTROY(ifp);
692	ifq_delete(&ifp->if_snd);
693
694	for (int i = 0; i < IFCOUNTERS; i++)
695		counter_u64_free(ifp->if_counters[i]);
696
697	free(ifp->if_description, M_IFDESCR);
698	free(ifp->if_hw_addr, M_IFADDR);
699	free(ifp, M_IFNET);
700}
701
702static void
703if_destroy(epoch_context_t ctx)
704{
705	struct ifnet *ifp;
706
707	ifp = __containerof(ctx, struct ifnet, if_epoch_ctx);
708	if_free_internal(ifp);
709}
710
/*
 * Deregister an interface and free the associated storage.
 *
 * The interface is marked IFF_DYING and its ifindex slot is released.  The
 * reference taken at allocation is then dropped.  If that was the last
 * reference, destruction is scheduled through NET_EPOCH_CALL(); otherwise
 * it is left to the final if_rele().
 */
void
if_free(struct ifnet *ifp)
{

	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */

	/* The index table is per-vnet; operate in the interface's vnet. */
	CURVNET_SET_QUIET(ifp->if_vnet);
	IFNET_WLOCK();
	KASSERT(ifp == ifnet_byindex(ifp->if_index),
	    ("%s: freeing unallocated ifnet", ifp->if_xname));

	ifindex_free_locked(ifp->if_index);
	IFNET_WUNLOCK();

	if (refcount_release(&ifp->if_refcount))
		NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx);
	CURVNET_RESTORE();
}
732
733/*
734 * Interfaces to keep an ifnet type-stable despite the possibility of the
735 * driver calling if_free().  If there are additional references, we defer
736 * freeing the underlying data structure.
737 */
/* Take an additional reference on ifp. */
void
if_ref(struct ifnet *ifp)
{

	/* We don't assert the ifnet list lock here, but arguably should. */
	refcount_acquire(&ifp->if_refcount);
}
745
746void
747if_rele(struct ifnet *ifp)
748{
749
750	if (!refcount_release(&ifp->if_refcount))
751		return;
752	NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx);
753}
754
755void
756ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
757{
758
759	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
760
761	if (ifq->ifq_maxlen == 0)
762		ifq->ifq_maxlen = ifqmaxlen;
763
764	ifq->altq_type = 0;
765	ifq->altq_disc = NULL;
766	ifq->altq_flags &= ALTQF_CANTCHANGE;
767	ifq->altq_tbr  = NULL;
768	ifq->altq_ifp  = ifp;
769}
770
/* Destroy the mutex set up by ifq_init(). */
void
ifq_delete(struct ifaltq *ifq)
{
	mtx_destroy(&ifq->ifq_mtx);
}
776
777/*
778 * Perform generic interface initialization tasks and attach the interface
779 * to the list of "active" interfaces.  If vmove flag is set on entry
780 * to if_attach_internal(), perform only a limited subset of initialization
781 * tasks, given that we are moving from one vnet to another an ifnet which
782 * has already been fully initialized.
783 *
784 * Note that if_detach_internal() removes group membership unconditionally
785 * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
786 * Thus, when if_vmove() is applied to a cloned interface, group membership
787 * is lost while a cloned one always joins a group whose name is
788 * ifc->ifc_name.  To recover this after if_detach_internal() and
789 * if_attach_internal(), the cloner should be specified to
790 * if_attach_internal() via ifc.  If it is non-NULL, if_attach_internal()
791 * attempts to join a group whose name is ifc->ifc_name.
792 *
793 * XXX:
794 *  - The decision to return void and thus require this function to
795 *    succeed is questionable.
 *  - We should probably do more sanity checking.  For instance we don't
 *    do anything to ensure if_xname is unique or non-empty.
798 */
/* Full (non-vmove) attach with no cloner: if_attach_internal(ifp, 0, NULL). */
void
if_attach(struct ifnet *ifp)
{

	if_attach_internal(ifp, 0, NULL);
}
805
806/*
807 * Compute the least common TSO limit.
808 */
809void
810if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
811{
812	/*
813	 * 1) If there is no limit currently, take the limit from
814	 * the network adapter.
815	 *
816	 * 2) If the network adapter has a limit below the current
817	 * limit, apply it.
818	 */
819	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
820	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
821		pmax->tsomaxbytes = ifp->if_hw_tsomax;
822	}
823	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
824	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
825		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
826	}
827	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
828	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
829		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
830	}
831}
832
833/*
834 * Update TSO limit of a network adapter.
835 *
836 * Returns zero if no change. Else non-zero.
837 */
838int
839if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
840{
841	int retval = 0;
842	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
843		ifp->if_hw_tsomax = pmax->tsomaxbytes;
844		retval++;
845	}
846	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
847		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
848		retval++;
849	}
850	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
851		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
852		retval++;
853	}
854	return (retval);
855}
856
/*
 * Worker for if_attach() and the vnet-move path.
 *
 * Panics if ifp does not own its slot in the ifindex table.  It always
 * joins the IFG_ALL group, fills in default transmit, input and
 * requestencap handlers, links the interface onto V_ifnet and announces
 * it.  When vmove is zero it additionally builds the link-level address
 * and netmask and applies default TSO limits.  When vmove is non-zero it
 * only refreshes sdl_index in the existing AF_LINK addresses and, if ifc
 * is given, re-adds the cloner's group.
 */
static void
if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc)
{
	unsigned socksize, ifasize;
	int namelen, masklen;
	struct sockaddr_dl *sdl;
	struct ifaddr *ifa;

	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
		    ifp->if_xname);

#ifdef VIMAGE
	ifp->if_vnet = curvnet;
	if (ifp->if_home_vnet == NULL)
		ifp->if_home_vnet = curvnet;
#endif

	if_addgroup(ifp, IFG_ALL);

	/* Restore group membership for cloned interfaces. */
	if (vmove && ifc != NULL)
		if_clone_addgroup(ifp, ifc);

	getmicrotime(&ifp->if_lastchange);
	ifp->if_epoch = time_uptime;

	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
	    ("transmit and qflush must both either be set or both be NULL"));
	if (ifp->if_transmit == NULL) {
		ifp->if_transmit = if_transmit;
		ifp->if_qflush = if_qflush;
	}
	if (ifp->if_input == NULL)
		ifp->if_input = if_input_default;

	if (ifp->if_requestencap == NULL)
		ifp->if_requestencap = if_requestencap_default;

	if (!vmove) {
#ifdef MAC
		mac_ifnet_create(ifp);
#endif

		/*
		 * Create a Link Level name for this device.
		 */
		namelen = strlen(ifp->if_xname);
		/*
		 * Always save enough space for any possible name so we
		 * can do a rename in place later.
		 */
		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
		socksize = masklen + ifp->if_addrlen;
		if (socksize < sizeof(*sdl))
			socksize = sizeof(*sdl);
		socksize = roundup2(socksize, sizeof(long));
		/*
		 * One allocation holds the ifaddr followed by two
		 * sockaddr_dl areas of socksize bytes each: the address,
		 * then the netmask.
		 */
		ifasize = sizeof(*ifa) + 2 * socksize;
		ifa = ifa_alloc(ifasize, M_WAITOK);
		sdl = (struct sockaddr_dl *)(ifa + 1);
		sdl->sdl_len = socksize;
		sdl->sdl_family = AF_LINK;
		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		sdl->sdl_index = ifp->if_index;
		sdl->sdl_type = ifp->if_type;
		ifp->if_addr = ifa;
		ifa->ifa_ifp = ifp;
		ifa->ifa_addr = (struct sockaddr *)sdl;
		/* Step to the second area and build the netmask there. */
		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
		ifa->ifa_netmask = (struct sockaddr *)sdl;
		sdl->sdl_len = masklen;
		/* The mask covers exactly the bytes of the interface name. */
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
		/* Reliably crash if used uninitialized. */
		ifp->if_broadcastaddr = NULL;

		if (ifp->if_type == IFT_ETHER) {
			ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
			    M_WAITOK | M_ZERO);
		}

#if defined(INET) || defined(INET6)
		/* Use defaults for TSO, if nothing is set */
		if (ifp->if_hw_tsomax == 0 &&
		    ifp->if_hw_tsomaxsegcount == 0 &&
		    ifp->if_hw_tsomaxsegsize == 0) {
			/*
			 * The TSO defaults needs to be such that an
			 * NFS mbuf list of 35 mbufs totalling just
			 * below 64K works and that a chain of mbufs
			 * can be defragged into at most 32 segments:
			 */
			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
			ifp->if_hw_tsomaxsegcount = 35;
			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */

			/* XXX some drivers set IFCAP_TSO after ethernet attach */
			if (ifp->if_capabilities & IFCAP_TSO) {
				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
				    ifp->if_hw_tsomax,
				    ifp->if_hw_tsomaxsegcount,
				    ifp->if_hw_tsomaxsegsize);
			}
		}
#endif
	}
#ifdef VIMAGE
	else {
		/*
		 * Update the interface index in the link layer address
		 * of the interface.
		 */
		for (ifa = ifp->if_addr; ifa != NULL;
		    ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
			if (ifa->ifa_addr->sa_family == AF_LINK) {
				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
				sdl->sdl_index = ifp->if_index;
			}
		}
	}
#endif

	if_link_ifnet(ifp);

	/*
	 * if_attachdomain1() is skipped while domain_init_status is below
	 * 2; in that case the if_attachdomain() SYSINIT visits the
	 * interface later through V_ifnet.
	 */
	if (domain_init_status >= 2)
		if_attachdomain1(ifp);

	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
	if (IS_DEFAULT_VNET(curvnet))
		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);

	/* Announce the interface. */
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
}
995
/*
 * SYSINIT hook: allocate the preemptible network epoch and store it in
 * net_epoch_preempt.
 */
static void
if_epochalloc(void *dummy __unused)
{

	net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT);
}
SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL);
1003
/*
 * SYSINIT hook: run if_attachdomain1() on every interface already on
 * V_ifnet.  The argument is unused.
 */
static void
if_attachdomain(void *dummy)
{
	struct ifnet *ifp;

	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
		if_attachdomain1(ifp);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
    if_attachdomain, NULL);
1014
1015static void
1016if_attachdomain1(struct ifnet *ifp)
1017{
1018	struct domain *dp;
1019
1020	/*
1021	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
1022	 * cannot lock ifp->if_afdata initialization, entirely.
1023	 */
1024	IF_AFDATA_LOCK(ifp);
1025	if (ifp->if_afdata_initialized >= domain_init_status) {
1026		IF_AFDATA_UNLOCK(ifp);
1027		log(LOG_WARNING, "%s called more than once on %s\n",
1028		    __func__, ifp->if_xname);
1029		return;
1030	}
1031	ifp->if_afdata_initialized = domain_init_status;
1032	IF_AFDATA_UNLOCK(ifp);
1033
1034	/* address family dependent data region */
1035	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
1036	for (dp = domains; dp; dp = dp->dom_next) {
1037		if (dp->dom_ifattach)
1038			ifp->if_afdata[dp->dom_family] =
1039			    (*dp->dom_ifattach)(ifp);
1040	}
1041}
1042
/*
 * Remove any unicast or broadcast network addresses from an interface.
 *
 * Each pass finds the first address on if_addrhead that is not AF_LINK
 * and removes it, until only AF_LINK addresses remain.  AF_INET addresses
 * are removed through in_control(SIOCDIFADDR) and AF_INET6 addresses
 * through in6_purgeifaddr().  Anything else, or an AF_INET address whose
 * in_control() call failed, is unlinked and released directly.
 */
void
if_purgeaddrs(struct ifnet *ifp)
{
	struct ifaddr *ifa;

	while (1) {
		struct epoch_tracker et;

		/* Only the list walk itself runs inside the epoch. */
		NET_EPOCH_ENTER(et);
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != AF_LINK)
				break;
		}
		NET_EPOCH_EXIT(et);

		/* The loop ran to completion: nothing left to purge. */
		if (ifa == NULL)
			break;
#ifdef INET
		/* XXX: Ugly!! ad hoc just for INET */
		if (ifa->ifa_addr->sa_family == AF_INET) {
			struct ifaliasreq ifr;

			bzero(&ifr, sizeof(ifr));
			ifr.ifra_addr = *ifa->ifa_addr;
			if (ifa->ifa_dstaddr)
				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
			    NULL) == 0)
				continue;
			/* On failure, fall through to the generic removal. */
		}
#endif /* INET */
#ifdef INET6
		if (ifa->ifa_addr->sa_family == AF_INET6) {
			in6_purgeifaddr((struct in6_ifaddr *)ifa);
			/* ifp_addrhead is already updated */
			continue;
		}
#endif /* INET6 */
		IF_ADDR_WLOCK(ifp);
		CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
		IF_ADDR_WUNLOCK(ifp);
		ifa_free(ifa);
	}
}
1090
1091/*
1092 * Remove any multicast network addresses from an interface when an ifnet
1093 * is going away.
1094 */
1095static void
1096if_purgemaddrs(struct ifnet *ifp)
1097{
1098	struct ifmultiaddr *ifma;
1099
1100	IF_ADDR_WLOCK(ifp);
1101	while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
1102		ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
1103		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
1104		if_delmulti_locked(ifp, ifma, 1);
1105	}
1106	IF_ADDR_WUNLOCK(ifp);
1107}
1108
1109/*
1110 * Detach an interface, removing it from the list of "active" interfaces.
1111 * If vmove flag is set on entry to if_detach_internal(), perform only a
1112 * limited subset of cleanup tasks, given that we are moving an ifnet from
1113 * one vnet to another, where it must be fully operational.
1114 *
1115 * XXXRW: There are some significant questions about event ordering, and
1116 * how to prevent things from starting to use the interface during detach.
1117 */
1118void
1119if_detach(struct ifnet *ifp)
1120{
1121	bool found;
1122
1123	CURVNET_SET_QUIET(ifp->if_vnet);
1124	found = if_unlink_ifnet(ifp, false);
1125	if (found) {
1126		sx_xlock(&ifnet_detach_sxlock);
1127		if_detach_internal(ifp, 0, NULL);
1128		sx_xunlock(&ifnet_detach_sxlock);
1129	}
1130	CURVNET_RESTORE();
1131}
1132
1133/*
1134 * The vmove flag, if set, indicates that we are called from a callpath
1135 * that is moving an interface to a different vnet instance.
1136 *
1137 * The shutdown flag, if set, indicates that we are called in the
1138 * process of shutting down a vnet instance.  Currently only the
1139 * vnet_if_return SYSUNINIT function sets it.  Note: we can be called
1140 * on a vnet instance shutdown without this flag being set, e.g., when
 * the cloned interfaces are destroyed as first thing of teardown.
1142 */
static int
if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp)
{
	struct ifaddr *ifa;
	int i;
	struct domain *dp;
#ifdef VIMAGE
	bool shutdown;

	/* Sampled once up front; selects the early-out path further down. */
	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
#endif

	/*
	 * At this point we know the interface still was on the ifnet list
	 * and we removed it so we are in a stable state.
	 */
	epoch_wait_preempt(net_epoch_preempt);

	/*
	 * Ensure all pending EPOCH(9) callbacks have been executed. This
	 * fixes issues about late destruction of multicast options
	 * which lead to leave group calls, which in turn access the
	 * belonging ifnet structure:
	 */
	epoch_drain_callbacks(net_epoch_preempt);

	/*
	 * In any case (destroy or vmove) detach us from the groups
	 * and remove/wait for pending events on the taskq.
	 * XXX-BZ in theory an interface could still enqueue a taskq change?
	 */
	if_delgroups(ifp);

	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
	taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask);

	/*
	 * Check if this is a cloned interface or not. Must do even if
	 * shutting down as a if_vmove_reclaim() would move the ifp and
	 * the if_clone_addgroup() will have a corrupted string otherwise
	 * from a gibberish pointer.
	 */
	if (vmove && ifcp != NULL)
		*ifcp = if_clone_findifc(ifp);

	if_down(ifp);

#ifdef VIMAGE
	/*
	 * On VNET shutdown abort here as the stack teardown will do all
	 * the work top-down for us.
	 */
	if (shutdown) {
		/* Give interface users the chance to clean up. */
		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);

		/*
		 * In case of a vmove we are done here without error.
		 * If we would signal an error it would lead to the same
		 * abort as if we did not find the ifnet anymore.
		 * if_detach() calls us in void context and does not care
		 * about an early abort notification, so life is splendid :)
		 */
		goto finish_vnet_shutdown;
	}
#endif

	/*
	 * At this point we are not tearing down a VNET and are either
	 * going to destroy or vmove the interface and have to cleanup
	 * accordingly.
	 */

	/*
	 * Remove routes and flush queues.
	 */
#ifdef ALTQ
	if (ALTQ_IS_ENABLED(&ifp->if_snd))
		altq_disable(&ifp->if_snd);
	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
		altq_detach(&ifp->if_snd);
#endif

	if_purgeaddrs(ifp);

#ifdef INET
	in_ifdetach(ifp);
#endif

#ifdef INET6
	/*
	 * Remove all IPv6 kernel structs related to ifp.  This should be done
	 * before removing routing entries below, since IPv6 interface direct
	 * routes are expected to be removed by the IPv6-specific kernel API.
	 * Otherwise, the kernel will detect some inconsistency and bark it.
	 */
	in6_ifdetach(ifp);
#endif
	if_purgemaddrs(ifp);

	/* Announce that the interface is gone. */
	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
	if (IS_DEFAULT_VNET(curvnet))
		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);

	if (!vmove) {
		/*
		 * Prevent further calls into the device driver via ifnet.
		 */
		if_dead(ifp);

		/*
		 * Clean up all addresses.
		 *
		 * Note that only the first entry still on the list is
		 * unlinked and released here; the lock is dropped before
		 * the reference is released.
		 */
		IF_ADDR_WLOCK(ifp);
		if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) {
			ifa = CK_STAILQ_FIRST(&ifp->if_addrhead);
			CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
			IF_ADDR_WUNLOCK(ifp);
			ifa_free(ifa);
		} else
			IF_ADDR_WUNLOCK(ifp);
	}

	rt_flushifroutes(ifp);

#ifdef VIMAGE
finish_vnet_shutdown:
#endif
	/*
	 * We cannot hold the lock over dom_ifdetach calls as they might
	 * sleep, for example trying to drain a callout, thus open up the
	 * theoretical race with re-attaching.
	 */
	IF_AFDATA_LOCK(ifp);
	i = ifp->if_afdata_initialized;
	ifp->if_afdata_initialized = 0;
	IF_AFDATA_UNLOCK(ifp);
	/*
	 * Walk the domains only if the afdata had been initialized (i > 0),
	 * detaching and clearing each per-family slot that is populated.
	 */
	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
			(*dp->dom_ifdetach)(ifp,
			    ifp->if_afdata[dp->dom_family]);
			ifp->if_afdata[dp->dom_family] = NULL;
		}
	}

	/* Every path through this function reports success. */
	return (0);
}
1292
1293#ifdef VIMAGE
1294/*
1295 * if_vmove() performs a limited version of if_detach() in current
1296 * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
1297 * An attempt is made to shrink if_index in current vnet, find an
1298 * unused if_index in target vnet and calls if_grow() if necessary,
1299 * and finally find an unused if_xname for the target vnet.
1300 */
static int
if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
{
	struct if_clone *ifc;
#ifdef DEV_BPF
	u_int bif_dlt, bif_hdrlen;
#endif
	void *old;
	int rc;

#ifdef DEV_BPF
 	/*
	 * if_detach_internal() will call the eventhandler to notify
	 * interface departure.  That will detach if_bpf.  We need to
	 * save the dlt and hdrlen so we can re-attach it later.
	 */
	bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
#endif

	/*
	 * Detach from current vnet, but preserve LLADDR info, do not
	 * mark as dead etc. so that the ifnet can be reattached later.
	 * If we cannot find it, we lost the race to someone else.
	 */
	rc = if_detach_internal(ifp, 1, &ifc);
	if (rc != 0)
		return (rc);

	/*
	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
	 * the if_index for that vnet if possible.
	 *
	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
	 * or we'd lock on one vnet and unlock on another.
	 */
	IFNET_WLOCK();
	ifindex_free_locked(ifp->if_index);
	IFNET_WUNLOCK();

	/*
	 * Perform interface-specific reassignment tasks, if provided by
	 * the driver.
	 */
	if (ifp->if_reassign != NULL)
		ifp->if_reassign(ifp, new_vnet, NULL);

	/*
	 * Switch to the context of the target vnet.
	 */
	CURVNET_SET_QUIET(new_vnet);
 restart:
	/*
	 * Allocate an index in the target vnet.  When ifindex_alloc()
	 * returns USHRT_MAX the attempt is retried after dropping the lock,
	 * waiting out the epoch and freeing whatever it handed back in
	 * 'old' -- presumably a superseded index table; confirm against
	 * ifindex_alloc().
	 */
	IFNET_WLOCK();
	ifp->if_index = ifindex_alloc(&old);
	if (__predict_false(ifp->if_index == USHRT_MAX)) {
		IFNET_WUNLOCK();
		epoch_wait_preempt(net_epoch_preempt);
		free(old, M_IFNET);
		goto restart;
	}
	ifnet_setbyindex(ifp->if_index, ifp);
	IFNET_WUNLOCK();

	if_attach_internal(ifp, 1, ifc);

#ifdef DEV_BPF
	/* Re-attach bpf with the saved parameters if nothing else did. */
	if (ifp->if_bpf == NULL)
		bpfattach(ifp, bif_dlt, bif_hdrlen);
#endif

	CURVNET_RESTORE();
	return (0);
}
1373
1374/*
1375 * Move an ifnet to or from another child prison/vnet, specified by the jail id.
1376 */
static int
if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
{
	struct prison *pr;
	struct ifnet *difp;
	int error;
	bool found;
	bool shutdown;

	/* Try to find the prison within our visibility. */
	sx_slock(&allprison_lock);
	pr = prison_find_child(td->td_ucred->cr_prison, jid);
	sx_sunlock(&allprison_lock);
	if (pr == NULL)
		return (ENXIO);
	/*
	 * Take a hold on the prison and release its mutex; every return
	 * path below pairs this with prison_free().
	 */
	prison_hold_locked(pr);
	mtx_unlock(&pr->pr_mtx);

	/* Do not try to move the iface from and to the same prison. */
	if (pr->pr_vnet == ifp->if_vnet) {
		prison_free(pr);
		return (EEXIST);
	}

	/* Make sure the named iface does not exist in the dst. prison/vnet. */
	/* XXX Lock interfaces to avoid races. */
	CURVNET_SET_QUIET(pr->pr_vnet);
	difp = ifunit(ifname);
	if (difp != NULL) {
		CURVNET_RESTORE();
		prison_free(pr);
		return (EEXIST);
	}

	/* Make sure the VNET is stable. */
	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
	if (shutdown) {
		CURVNET_RESTORE();
		prison_free(pr);
		return (EBUSY);
	}
	CURVNET_RESTORE();

	/* The unlink is asserted (MPASS) to succeed. */
	found = if_unlink_ifnet(ifp, true);
	MPASS(found);

	/* Move the interface into the child jail/vnet. */
	error = if_vmove(ifp, pr->pr_vnet);

	/* Report the new if_xname back to the userland on success. */
	if (error == 0)
		sprintf(ifname, "%s", ifp->if_xname);

	prison_free(pr);
	return (error);
}
1433
/*
 * Move the interface named ifname from the child prison/vnet identified by
 * jid into the vnet of the calling thread.  Returns 0 on success, ENXIO if
 * the prison or the interface cannot be found, EEXIST if source and target
 * vnet are the same, EBUSY if the source vnet is shutting down, or the
 * error from if_vmove().  On success ifname is overwritten with the
 * interface's if_xname.
 */
static int
if_vmove_reclaim(struct thread *td, char *ifname, int jid)
{
	struct prison *pr;
	struct vnet *vnet_dst;
	struct ifnet *ifp;
	int error, found;
 	bool shutdown;

	/* Try to find the prison within our visibility. */
	sx_slock(&allprison_lock);
	pr = prison_find_child(td->td_ucred->cr_prison, jid);
	sx_sunlock(&allprison_lock);
	if (pr == NULL)
		return (ENXIO);
	/*
	 * Take a hold on the prison and release its mutex; every return
	 * path below pairs this with prison_free().
	 */
	prison_hold_locked(pr);
	mtx_unlock(&pr->pr_mtx);

	/* Make sure the named iface exists in the source prison/vnet. */
	CURVNET_SET(pr->pr_vnet);
	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
	if (ifp == NULL) {
		CURVNET_RESTORE();
		prison_free(pr);
		return (ENXIO);
	}

	/* Do not try to move the iface from and to the same prison. */
	vnet_dst = TD_TO_VNET(td);
	if (vnet_dst == ifp->if_vnet) {
		CURVNET_RESTORE();
		prison_free(pr);
		return (EEXIST);
	}

	/* Make sure the VNET is stable. */
	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
	if (shutdown) {
		CURVNET_RESTORE();
		prison_free(pr);
		return (EBUSY);
	}

	/* Get interface back from child jail/vnet. */
	found = if_unlink_ifnet(ifp, true);
	MPASS(found);
	error = if_vmove(ifp, vnet_dst);
	CURVNET_RESTORE();

	/* Report the new if_xname back to the userland on success. */
	if (error == 0)
		sprintf(ifname, "%s", ifp->if_xname);

	prison_free(pr);
	return (error);
}
1490#endif /* VIMAGE */
1491
1492/*
1493 * Add a group to an interface
1494 */
1495int
1496if_addgroup(struct ifnet *ifp, const char *groupname)
1497{
1498	struct ifg_list		*ifgl;
1499	struct ifg_group	*ifg = NULL;
1500	struct ifg_member	*ifgm;
1501	int 			 new = 0;
1502
1503	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
1504	    groupname[strlen(groupname) - 1] <= '9')
1505		return (EINVAL);
1506
1507	IFNET_WLOCK();
1508	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1509		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
1510			IFNET_WUNLOCK();
1511			return (EEXIST);
1512		}
1513
1514	if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) {
1515	    	IFNET_WUNLOCK();
1516		return (ENOMEM);
1517	}
1518
1519	if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
1520		free(ifgl, M_TEMP);
1521		IFNET_WUNLOCK();
1522		return (ENOMEM);
1523	}
1524
1525	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
1526		if (!strcmp(ifg->ifg_group, groupname))
1527			break;
1528
1529	if (ifg == NULL) {
1530		if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) {
1531			free(ifgl, M_TEMP);
1532			free(ifgm, M_TEMP);
1533			IFNET_WUNLOCK();
1534			return (ENOMEM);
1535		}
1536		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
1537		ifg->ifg_refcnt = 0;
1538		CK_STAILQ_INIT(&ifg->ifg_members);
1539		CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
1540		new = 1;
1541	}
1542
1543	ifg->ifg_refcnt++;
1544	ifgl->ifgl_group = ifg;
1545	ifgm->ifgm_ifp = ifp;
1546
1547	IF_ADDR_WLOCK(ifp);
1548	CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
1549	CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
1550	IF_ADDR_WUNLOCK(ifp);
1551
1552	IFNET_WUNLOCK();
1553
1554	if (new)
1555		EVENTHANDLER_INVOKE(group_attach_event, ifg);
1556	EVENTHANDLER_INVOKE(group_change_event, groupname);
1557
1558	return (0);
1559}
1560
1561/*
1562 * Helper function to remove a group out of an interface.  Expects the global
1563 * ifnet lock to be write-locked, and drops it before returning.
1564 */
1565static void
1566_if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl,
1567    const char *groupname)
1568{
1569	struct ifg_member *ifgm;
1570	bool freeifgl;
1571
1572	IFNET_WLOCK_ASSERT();
1573
1574	IF_ADDR_WLOCK(ifp);
1575	CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
1576	IF_ADDR_WUNLOCK(ifp);
1577
1578	CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) {
1579		if (ifgm->ifgm_ifp == ifp) {
1580			CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
1581			    ifg_member, ifgm_next);
1582			break;
1583		}
1584	}
1585
1586	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1587		CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group,
1588		    ifg_next);
1589		freeifgl = true;
1590	} else {
1591		freeifgl = false;
1592	}
1593	IFNET_WUNLOCK();
1594
1595	epoch_wait_preempt(net_epoch_preempt);
1596	if (freeifgl) {
1597		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
1598		free(ifgl->ifgl_group, M_TEMP);
1599	}
1600	free(ifgm, M_TEMP);
1601	free(ifgl, M_TEMP);
1602
1603	EVENTHANDLER_INVOKE(group_change_event, groupname);
1604}
1605
1606/*
1607 * Remove a group from an interface
1608 */
1609int
1610if_delgroup(struct ifnet *ifp, const char *groupname)
1611{
1612	struct ifg_list *ifgl;
1613
1614	IFNET_WLOCK();
1615	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1616		if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0)
1617			break;
1618	if (ifgl == NULL) {
1619		IFNET_WUNLOCK();
1620		return (ENOENT);
1621	}
1622
1623	_if_delgroup_locked(ifp, ifgl, groupname);
1624
1625	return (0);
1626}
1627
1628/*
1629 * Remove an interface from all groups
1630 */
1631static void
1632if_delgroups(struct ifnet *ifp)
1633{
1634	struct ifg_list *ifgl;
1635	char groupname[IFNAMSIZ];
1636
1637	IFNET_WLOCK();
1638	while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) {
1639		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
1640		_if_delgroup_locked(ifp, ifgl, groupname);
1641		IFNET_WLOCK();
1642	}
1643	IFNET_WUNLOCK();
1644}
1645
1646static char *
1647ifgr_group_get(void *ifgrp)
1648{
1649	union ifgroupreq_union *ifgrup;
1650
1651	ifgrup = ifgrp;
1652#ifdef COMPAT_FREEBSD32
1653	if (SV_CURPROC_FLAG(SV_ILP32))
1654		return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]);
1655#endif
1656	return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]);
1657}
1658
1659static struct ifg_req *
1660ifgr_groups_get(void *ifgrp)
1661{
1662	union ifgroupreq_union *ifgrup;
1663
1664	ifgrup = ifgrp;
1665#ifdef COMPAT_FREEBSD32
1666	if (SV_CURPROC_FLAG(SV_ILP32))
1667		return ((struct ifg_req *)(uintptr_t)
1668		    ifgrup->ifgr32.ifgr_ifgru.ifgru_groups);
1669#endif
1670	return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups);
1671}
1672
1673/*
1674 * Stores all groups from an interface in memory pointed to by ifgr.
1675 */
1676static int
1677if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
1678{
1679	int			 len, error;
1680	struct ifg_list		*ifgl;
1681	struct ifg_req		 ifgrq, *ifgp;
1682
1683	NET_EPOCH_ASSERT();
1684
1685	if (ifgr->ifgr_len == 0) {
1686		CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1687			ifgr->ifgr_len += sizeof(struct ifg_req);
1688		return (0);
1689	}
1690
1691	len = ifgr->ifgr_len;
1692	ifgp = ifgr_groups_get(ifgr);
1693	/* XXX: wire */
1694	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1695		if (len < sizeof(ifgrq))
1696			return (EINVAL);
1697		bzero(&ifgrq, sizeof ifgrq);
1698		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1699		    sizeof(ifgrq.ifgrq_group));
1700		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req))))
1701			return (error);
1702		len -= sizeof(ifgrq);
1703		ifgp++;
1704	}
1705
1706	return (0);
1707}
1708
1709/*
 * Stores all members of a group in memory pointed to by ifgr
1711 */
1712static int
1713if_getgroupmembers(struct ifgroupreq *ifgr)
1714{
1715	struct ifg_group	*ifg;
1716	struct ifg_member	*ifgm;
1717	struct ifg_req		 ifgrq, *ifgp;
1718	int			 len, error;
1719
1720	IFNET_RLOCK();
1721	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
1722		if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0)
1723			break;
1724	if (ifg == NULL) {
1725		IFNET_RUNLOCK();
1726		return (ENOENT);
1727	}
1728
1729	if (ifgr->ifgr_len == 0) {
1730		CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1731			ifgr->ifgr_len += sizeof(ifgrq);
1732		IFNET_RUNLOCK();
1733		return (0);
1734	}
1735
1736	len = ifgr->ifgr_len;
1737	ifgp = ifgr_groups_get(ifgr);
1738	CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1739		if (len < sizeof(ifgrq)) {
1740			IFNET_RUNLOCK();
1741			return (EINVAL);
1742		}
1743		bzero(&ifgrq, sizeof ifgrq);
1744		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1745		    sizeof(ifgrq.ifgrq_member));
1746		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
1747			IFNET_RUNLOCK();
1748			return (error);
1749		}
1750		len -= sizeof(ifgrq);
1751		ifgp++;
1752	}
1753	IFNET_RUNLOCK();
1754
1755	return (0);
1756}
1757
1758/*
1759 * Return counter values from counter(9)s stored in ifnet.
1760 */
1761uint64_t
1762if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
1763{
1764
1765	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
1766
1767	return (counter_u64_fetch(ifp->if_counters[cnt]));
1768}
1769
1770/*
1771 * Increase an ifnet counter. Usually used for counters shared
1772 * between the stack and a driver, but function supports them all.
1773 */
void
if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
{

	/* cnt indexes ifp->if_counters[] and must be below IFCOUNTERS. */
	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));

	/* Add inc to the selected counter(9) slot. */
	counter_u64_add(ifp->if_counters[cnt], inc);
}
1782
1783/*
1784 * Copy data from ifnet to userland API structure if_data.
1785 */
1786void
1787if_data_copy(struct ifnet *ifp, struct if_data *ifd)
1788{
1789
1790	ifd->ifi_type = ifp->if_type;
1791	ifd->ifi_physical = 0;
1792	ifd->ifi_addrlen = ifp->if_addrlen;
1793	ifd->ifi_hdrlen = ifp->if_hdrlen;
1794	ifd->ifi_link_state = ifp->if_link_state;
1795	ifd->ifi_vhid = 0;
1796	ifd->ifi_datalen = sizeof(struct if_data);
1797	ifd->ifi_mtu = ifp->if_mtu;
1798	ifd->ifi_metric = ifp->if_metric;
1799	ifd->ifi_baudrate = ifp->if_baudrate;
1800	ifd->ifi_hwassist = ifp->if_hwassist;
1801	ifd->ifi_epoch = ifp->if_epoch;
1802	ifd->ifi_lastchange = ifp->if_lastchange;
1803
1804	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
1805	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
1806	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
1807	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
1808	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
1809	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
1810	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
1811	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
1812	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
1813	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
1814	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
1815	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
1816}
1817
1818/*
1819 * Initialization, destruction and refcounting functions for ifaddrs.
1820 */
1821struct ifaddr *
1822ifa_alloc(size_t size, int flags)
1823{
1824	struct ifaddr *ifa;
1825
1826	KASSERT(size >= sizeof(struct ifaddr),
1827	    ("%s: invalid size %zu", __func__, size));
1828
1829	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
1830	if (ifa == NULL)
1831		return (NULL);
1832
1833	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
1834		goto fail;
1835	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
1836		goto fail;
1837	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
1838		goto fail;
1839	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
1840		goto fail;
1841
1842	refcount_init(&ifa->ifa_refcnt, 1);
1843
1844	return (ifa);
1845
1846fail:
1847	/* free(NULL) is okay */
1848	counter_u64_free(ifa->ifa_opackets);
1849	counter_u64_free(ifa->ifa_ipackets);
1850	counter_u64_free(ifa->ifa_obytes);
1851	counter_u64_free(ifa->ifa_ibytes);
1852	free(ifa, M_IFADDR);
1853
1854	return (NULL);
1855}
1856
1857void
1858ifa_ref(struct ifaddr *ifa)
1859{
1860	u_int old;
1861
1862	old = refcount_acquire(&ifa->ifa_refcnt);
1863	KASSERT(old > 0, ("%s: ifa %p has 0 refs", __func__, ifa));
1864}
1865
1866int
1867ifa_try_ref(struct ifaddr *ifa)
1868{
1869
1870	NET_EPOCH_ASSERT();
1871	return (refcount_acquire_if_not_zero(&ifa->ifa_refcnt));
1872}
1873
1874static void
1875ifa_destroy(epoch_context_t ctx)
1876{
1877	struct ifaddr *ifa;
1878
1879	ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
1880	counter_u64_free(ifa->ifa_opackets);
1881	counter_u64_free(ifa->ifa_ipackets);
1882	counter_u64_free(ifa->ifa_obytes);
1883	counter_u64_free(ifa->ifa_ibytes);
1884	free(ifa, M_IFADDR);
1885}
1886
1887void
1888ifa_free(struct ifaddr *ifa)
1889{
1890
1891	if (refcount_release(&ifa->ifa_refcnt))
1892		NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx);
1893}
1894
1895/*
1896 * XXX: Because sockaddr_dl has deeper structure than the sockaddr
1897 * structs used to represent other address families, it is necessary
1898 * to perform a different comparison.
1899 */
1900
/*
 * Evaluates to true when both sockaddr_dl arguments have the same
 * sdl_len and the first sdl_alen bytes (length taken from a1) at their
 * CLLADDR() positions compare equal.  Both arguments are evaluated more
 * than once, so they must be free of side effects.
 */
#define	sa_dl_equal(a1, a2)	\
	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
1907
1908/*
1909 * Locate an interface based on a complete address.
1910 */
1911/*ARGSUSED*/
1912struct ifaddr *
1913ifa_ifwithaddr(const struct sockaddr *addr)
1914{
1915	struct ifnet *ifp;
1916	struct ifaddr *ifa;
1917
1918	NET_EPOCH_ASSERT();
1919
1920	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
1921		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1922			if (ifa->ifa_addr->sa_family != addr->sa_family)
1923				continue;
1924			if (sa_equal(addr, ifa->ifa_addr)) {
1925				goto done;
1926			}
1927			/* IP6 doesn't have broadcast */
1928			if ((ifp->if_flags & IFF_BROADCAST) &&
1929			    ifa->ifa_broadaddr &&
1930			    ifa->ifa_broadaddr->sa_len != 0 &&
1931			    sa_equal(ifa->ifa_broadaddr, addr)) {
1932				goto done;
1933			}
1934		}
1935	}
1936	ifa = NULL;
1937done:
1938	return (ifa);
1939}
1940
1941int
1942ifa_ifwithaddr_check(const struct sockaddr *addr)
1943{
1944	struct epoch_tracker et;
1945	int rc;
1946
1947	NET_EPOCH_ENTER(et);
1948	rc = (ifa_ifwithaddr(addr) != NULL);
1949	NET_EPOCH_EXIT(et);
1950	return (rc);
1951}
1952
1953/*
1954 * Locate an interface based on the broadcast address.
1955 */
1956/* ARGSUSED */
1957struct ifaddr *
1958ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
1959{
1960	struct ifnet *ifp;
1961	struct ifaddr *ifa;
1962
1963	NET_EPOCH_ASSERT();
1964	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
1965		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
1966			continue;
1967		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1968			if (ifa->ifa_addr->sa_family != addr->sa_family)
1969				continue;
1970			if ((ifp->if_flags & IFF_BROADCAST) &&
1971			    ifa->ifa_broadaddr &&
1972			    ifa->ifa_broadaddr->sa_len != 0 &&
1973			    sa_equal(ifa->ifa_broadaddr, addr)) {
1974				goto done;
1975			}
1976		}
1977	}
1978	ifa = NULL;
1979done:
1980	return (ifa);
1981}
1982
1983/*
1984 * Locate the point to point interface with a given destination address.
1985 */
1986/*ARGSUSED*/
1987struct ifaddr *
1988ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
1989{
1990	struct ifnet *ifp;
1991	struct ifaddr *ifa;
1992
1993	NET_EPOCH_ASSERT();
1994	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
1995		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
1996			continue;
1997		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
1998			continue;
1999		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
2000			if (ifa->ifa_addr->sa_family != addr->sa_family)
2001				continue;
2002			if (ifa->ifa_dstaddr != NULL &&
2003			    sa_equal(addr, ifa->ifa_dstaddr)) {
2004				goto done;
2005			}
2006		}
2007	}
2008	ifa = NULL;
2009done:
2010	return (ifa);
2011}
2012
2013/*
2014 * Find an interface on a specific network.  If many, choice
2015 * is most specific found.
2016 */
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
{
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct ifaddr *ifa_maybe = NULL;	/* best candidate so far */
	u_int af = addr->sa_family;
	const char *addr_data = addr->sa_data, *cplim;

	NET_EPOCH_ASSERT();
	/*
	 * AF_LINK addresses can be looked up directly by their index number,
	 * so do that if we can.
	 */
	if (af == AF_LINK) {
	    const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr;
	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
		return (ifaddr_byindex(sdl->sdl_index));
	}

	/*
	 * Scan through each interface, looking for ones that have addresses
	 * in this address family and the requested fib.
	 */
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
			continue;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			const char *cp, *cp2, *cp3;

			/*
			 * The "next" label is attached to this continue so
			 * that the mask comparison below can skip to the
			 * following address with a goto.
			 */
			if (ifa->ifa_addr->sa_family != af)
next:				continue;
			if (af == AF_INET &&
			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
				/*
				 * This is a bit broken as it doesn't
				 * take into account that the remote end may
				 * be a single node in the network we are
				 * looking for.
				 * The trouble is that we don't know the
				 * netmask for the remote end.
				 */
				if (ifa->ifa_dstaddr != NULL &&
				    sa_equal(addr, ifa->ifa_dstaddr)) {
					goto done;
				}
			} else {
				/*
				 * Scan all the bits in the ifa's address.
				 * If a bit disagrees with what we are
				 * looking for, mask it with the netmask
				 * to see if it really matters.
				 * (A byte at a time)
				 */
				if (ifa->ifa_netmask == 0)
					continue;
				cp = addr_data;
				cp2 = ifa->ifa_addr->sa_data;
				cp3 = ifa->ifa_netmask->sa_data;
				/* cplim: one past the netmask's last byte. */
				cplim = ifa->ifa_netmask->sa_len
					+ (char *)ifa->ifa_netmask;
				while (cp3 < cplim)
					if ((*cp++ ^ *cp2++) & *cp3++)
						goto next; /* next address! */
				/*
				 * If the netmask of what we just found
				 * is more specific than what we had before
				 * (if we had one), or if the virtual status
				 * of new prefix is better than of the old one,
				 * then remember the new one before continuing
				 * to search for an even better one.
				 */
				if (ifa_maybe == NULL ||
				    ifa_preferred(ifa_maybe, ifa) ||
				    rn_refines((caddr_t)ifa->ifa_netmask,
				    (caddr_t)ifa_maybe->ifa_netmask)) {
					ifa_maybe = ifa;
				}
			}
		}
	}
	/* No exact hit: return the best network match, which may be NULL. */
	ifa = ifa_maybe;
	ifa_maybe = NULL;
done:
	return (ifa);
}
2103
2104/*
2105 * Find an interface address specific to an interface best matching
2106 * a given address.
2107 */
2108struct ifaddr *
2109ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
2110{
2111	struct ifaddr *ifa;
2112	const char *cp, *cp2, *cp3;
2113	char *cplim;
2114	struct ifaddr *ifa_maybe = NULL;
2115	u_int af = addr->sa_family;
2116
2117	if (af >= AF_MAX)
2118		return (NULL);
2119
2120	NET_EPOCH_ASSERT();
2121	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
2122		if (ifa->ifa_addr->sa_family != af)
2123			continue;
2124		if (ifa_maybe == NULL)
2125			ifa_maybe = ifa;
2126		if (ifa->ifa_netmask == 0) {
2127			if (sa_equal(addr, ifa->ifa_addr) ||
2128			    (ifa->ifa_dstaddr &&
2129			    sa_equal(addr, ifa->ifa_dstaddr)))
2130				goto done;
2131			continue;
2132		}
2133		if (ifp->if_flags & IFF_POINTOPOINT) {
2134			if (sa_equal(addr, ifa->ifa_dstaddr))
2135				goto done;
2136		} else {
2137			cp = addr->sa_data;
2138			cp2 = ifa->ifa_addr->sa_data;
2139			cp3 = ifa->ifa_netmask->sa_data;
2140			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
2141			for (; cp3 < cplim; cp3++)
2142				if ((*cp++ ^ *cp2++) & *cp3)
2143					break;
2144			if (cp3 == cplim)
2145				goto done;
2146		}
2147	}
2148	ifa = ifa_maybe;
2149done:
2150	return (ifa);
2151}
2152
2153/*
2154 * See whether new ifa is better than current one:
2155 * 1) A non-virtual one is preferred over virtual.
2156 * 2) A virtual in master state preferred over any other state.
2157 *
2158 * Used in several address selecting functions.
2159 */
2160int
2161ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
2162{
2163
2164	return (cur->ifa_carp && (!next->ifa_carp ||
2165	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
2166}
2167
2168struct sockaddr_dl *
2169link_alloc_sdl(size_t size, int flags)
2170{
2171
2172	return (malloc(size, M_TEMP, flags));
2173}
2174
/*
 * Release a sockaddr back to the M_TEMP malloc type, the same type
 * link_alloc_sdl() allocates from.
 */
void
link_free_sdl(struct sockaddr *sa)
{
	free(sa, M_TEMP);
}
2180
2181/*
2182 * Fills in given sdl with interface basic info.
2183 * Returns pointer to filled sdl.
2184 */
2185struct sockaddr_dl *
2186link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
2187{
2188	struct sockaddr_dl *sdl;
2189
2190	sdl = (struct sockaddr_dl *)paddr;
2191	memset(sdl, 0, sizeof(struct sockaddr_dl));
2192	sdl->sdl_len = sizeof(struct sockaddr_dl);
2193	sdl->sdl_family = AF_LINK;
2194	sdl->sdl_index = ifp->if_index;
2195	sdl->sdl_type = iftype;
2196
2197	return (sdl);
2198}
2199
2200/*
2201 * Mark an interface down and notify protocols of
2202 * the transition.
2203 */
2204static void
2205if_unroute(struct ifnet *ifp, int flag, int fam)
2206{
2207	struct ifaddr *ifa;
2208
2209	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
2210
2211	ifp->if_flags &= ~flag;
2212	getmicrotime(&ifp->if_lastchange);
2213	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
2214		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
2215			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
2216	ifp->if_qflush(ifp);
2217
2218	if (ifp->if_carp)
2219		(*carp_linkstate_p)(ifp);
2220	rt_ifmsg(ifp);
2221}
2222
2223/*
2224 * Mark an interface up and notify protocols of
2225 * the transition.
2226 */
2227static void
2228if_route(struct ifnet *ifp, int flag, int fam)
2229{
2230	struct ifaddr *ifa;
2231
2232	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
2233
2234	ifp->if_flags |= flag;
2235	getmicrotime(&ifp->if_lastchange);
2236	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
2237		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
2238			pfctlinput(PRC_IFUP, ifa->ifa_addr);
2239	if (ifp->if_carp)
2240		(*carp_linkstate_p)(ifp);
2241	rt_ifmsg(ifp);
2242#ifdef INET6
2243	in6_if_up(ifp);
2244#endif
2245}
2246
/*
 * VLAN hook function pointers.  They are only defined here and are not
 * assigned anywhere in this part of the file (presumably if_vlan installs
 * them when it is present -- confirm against if_vlan.c).  Callers must
 * therefore only invoke a hook when they know VLAN state exists, as
 * do_link_state_change() does by testing if_vlantrunk first.
 */
void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
int	(*vlan_pcp_p)(struct ifnet *, uint16_t *);
int	(*vlan_setcookie_p)(struct ifnet *, void *);
void	*(*vlan_cookie_p)(struct ifnet *);
2255
2256/*
2257 * Handle a change in the interface link state. To avoid LORs
2258 * between driver lock and upper layer locks, as well as possible
2259 * recursions, we post event to taskqueue, and all job
2260 * is done in static do_link_state_change().
2261 */
2262void
2263if_link_state_change(struct ifnet *ifp, int link_state)
2264{
2265	/* Return if state hasn't changed. */
2266	if (ifp->if_link_state == link_state)
2267		return;
2268
2269	ifp->if_link_state = link_state;
2270
2271	/* XXXGL: reference ifp? */
2272	taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
2273}
2274
2275static void
2276do_link_state_change(void *arg, int pending)
2277{
2278	struct ifnet *ifp;
2279	int link_state;
2280
2281	ifp = arg;
2282	link_state = ifp->if_link_state;
2283
2284	CURVNET_SET(ifp->if_vnet);
2285	rt_ifmsg(ifp);
2286	if (ifp->if_vlantrunk != NULL)
2287		(*vlan_link_state_p)(ifp);
2288
2289	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
2290	    ifp->if_l2com != NULL)
2291		(*ng_ether_link_state_p)(ifp, link_state);
2292	if (ifp->if_carp)
2293		(*carp_linkstate_p)(ifp);
2294	if (ifp->if_bridge)
2295		ifp->if_bridge_linkstate(ifp);
2296	if (ifp->if_lagg)
2297		(*lagg_linkstate_p)(ifp, link_state);
2298
2299	if (IS_DEFAULT_VNET(curvnet))
2300		devctl_notify("IFNET", ifp->if_xname,
2301		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
2302		    NULL);
2303	if (pending > 1)
2304		if_printf(ifp, "%d link states coalesced\n", pending);
2305	if (log_link_state_change)
2306		if_printf(ifp, "link state changed to %s\n",
2307		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
2308	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state);
2309	CURVNET_RESTORE();
2310}
2311
2312/*
2313 * Mark an interface down and notify protocols of
2314 * the transition.
2315 */
2316void
2317if_down(struct ifnet *ifp)
2318{
2319
2320	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN);
2321	if_unroute(ifp, IFF_UP, AF_UNSPEC);
2322}
2323
2324/*
2325 * Mark an interface up and notify protocols of
2326 * the transition.
2327 */
2328void
2329if_up(struct ifnet *ifp)
2330{
2331
2332	if_route(ifp, IFF_UP, AF_UNSPEC);
2333	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP);
2334}
2335
2336/*
2337 * Flush an interface queue.
2338 */
2339void
2340if_qflush(struct ifnet *ifp)
2341{
2342	struct mbuf *m, *n;
2343	struct ifaltq *ifq;
2344
2345	ifq = &ifp->if_snd;
2346	IFQ_LOCK(ifq);
2347#ifdef ALTQ
2348	if (ALTQ_IS_ENABLED(ifq))
2349		ALTQ_PURGE(ifq);
2350#endif
2351	n = ifq->ifq_head;
2352	while ((m = n) != NULL) {
2353		n = m->m_nextpkt;
2354		m_freem(m);
2355	}
2356	ifq->ifq_head = 0;
2357	ifq->ifq_tail = 0;
2358	ifq->ifq_len = 0;
2359	IFQ_UNLOCK(ifq);
2360}
2361
2362/*
2363 * Map interface name to interface structure pointer, with or without
2364 * returning a reference.
2365 */
2366struct ifnet *
2367ifunit_ref(const char *name)
2368{
2369	struct epoch_tracker et;
2370	struct ifnet *ifp;
2371
2372	NET_EPOCH_ENTER(et);
2373	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
2374		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
2375		    !(ifp->if_flags & IFF_DYING))
2376			break;
2377	}
2378	if (ifp != NULL)
2379		if_ref(ifp);
2380	NET_EPOCH_EXIT(et);
2381	return (ifp);
2382}
2383
2384struct ifnet *
2385ifunit(const char *name)
2386{
2387	struct epoch_tracker et;
2388	struct ifnet *ifp;
2389
2390	NET_EPOCH_ENTER(et);
2391	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
2392		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
2393			break;
2394	}
2395	NET_EPOCH_EXIT(et);
2396	return (ifp);
2397}
2398
2399void *
2400ifr_buffer_get_buffer(void *data)
2401{
2402	union ifreq_union *ifrup;
2403
2404	ifrup = data;
2405#ifdef COMPAT_FREEBSD32
2406	if (SV_CURPROC_FLAG(SV_ILP32))
2407		return ((void *)(uintptr_t)
2408		    ifrup->ifr32.ifr_ifru.ifru_buffer.buffer);
2409#endif
2410	return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer);
2411}
2412
2413static void
2414ifr_buffer_set_buffer_null(void *data)
2415{
2416	union ifreq_union *ifrup;
2417
2418	ifrup = data;
2419#ifdef COMPAT_FREEBSD32
2420	if (SV_CURPROC_FLAG(SV_ILP32))
2421		ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0;
2422	else
2423#endif
2424		ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL;
2425}
2426
2427size_t
2428ifr_buffer_get_length(void *data)
2429{
2430	union ifreq_union *ifrup;
2431
2432	ifrup = data;
2433#ifdef COMPAT_FREEBSD32
2434	if (SV_CURPROC_FLAG(SV_ILP32))
2435		return (ifrup->ifr32.ifr_ifru.ifru_buffer.length);
2436#endif
2437	return (ifrup->ifr.ifr_ifru.ifru_buffer.length);
2438}
2439
2440static void
2441ifr_buffer_set_length(void *data, size_t len)
2442{
2443	union ifreq_union *ifrup;
2444
2445	ifrup = data;
2446#ifdef COMPAT_FREEBSD32
2447	if (SV_CURPROC_FLAG(SV_ILP32))
2448		ifrup->ifr32.ifr_ifru.ifru_buffer.length = len;
2449	else
2450#endif
2451		ifrup->ifr.ifr_ifru.ifru_buffer.length = len;
2452}
2453
2454void *
2455ifr_data_get_ptr(void *ifrp)
2456{
2457	union ifreq_union *ifrup;
2458
2459	ifrup = ifrp;
2460#ifdef COMPAT_FREEBSD32
2461	if (SV_CURPROC_FLAG(SV_ILP32))
2462		return ((void *)(uintptr_t)
2463		    ifrup->ifr32.ifr_ifru.ifru_data);
2464#endif
2465		return (ifrup->ifr.ifr_ifru.ifru_data);
2466}
2467
/*
 * Hardware specific interface ioctls.
 *
 * Handles the interface-level requests that do not depend on the socket's
 * protocol.  'cmd' selects the request, 'ifp' is the target interface,
 * 'data' is the ioctl argument (viewed as a struct ifreq for most
 * requests) and 'td' is the calling thread, used for privilege checks.
 *
 * Returns 0 or an errno value.  ENOIOCTL means the request is not handled
 * here; ifioctl() then passes it on to the protocol and the driver.
 * Requests that modify the interface are guarded by priv_check() and most
 * of them update if_lastchange on success.
 */
int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
	struct ifreq *ifr;
	int error = 0, do_ifup = 0;
	int new_flags, temp_flags;
	size_t namelen, onamelen;
	size_t descrlen;
	char *descrbuf, *odescrbuf;
	char new_name[IFNAMSIZ];
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;

	ifr = (struct ifreq *)data;
	switch (cmd) {
	case SIOCGIFINDEX:
		ifr->ifr_index = ifp->if_index;
		break;

	case SIOCGIFFLAGS:
		/*
		 * Stack-owned and driver-owned flags are reported as one
		 * word, split into a low and a high 16-bit half.
		 */
		temp_flags = ifp->if_flags | ifp->if_drv_flags;
		ifr->ifr_flags = temp_flags & 0xffff;
		ifr->ifr_flagshigh = temp_flags >> 16;
		break;

	case SIOCGIFCAP:
		ifr->ifr_reqcap = ifp->if_capabilities;
		ifr->ifr_curcap = ifp->if_capenable;
		break;

	case SIOCGIFDATA:
	{
		struct if_data ifd;

		/* Ensure uninitialised padding is not leaked. */
		memset(&ifd, 0, sizeof(ifd));

		if_data_copy(ifp, &ifd);
		error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd));
		break;
	}

#ifdef MAC
	case SIOCGIFMAC:
		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
		break;
#endif

	case SIOCGIFMETRIC:
		ifr->ifr_metric = ifp->if_metric;
		break;

	case SIOCGIFMTU:
		ifr->ifr_mtu = ifp->if_mtu;
		break;

	case SIOCGIFPHYS:
		/* XXXGL: did this ever worked? */
		ifr->ifr_phys = 0;
		break;

	case SIOCGIFDESCR:
		/*
		 * Report the description.  If the caller's buffer is too
		 * small nothing is copied and the buffer pointer in the
		 * request is cleared; the required length (including the
		 * terminating nul) is stored in the request either way.
		 */
		error = 0;
		sx_slock(&ifdescr_sx);
		if (ifp->if_description == NULL)
			error = ENOMSG;
		else {
			/* space for terminating nul */
			descrlen = strlen(ifp->if_description) + 1;
			if (ifr_buffer_get_length(ifr) < descrlen)
				ifr_buffer_set_buffer_null(ifr);
			else
				error = copyout(ifp->if_description,
				    ifr_buffer_get_buffer(ifr), descrlen);
			ifr_buffer_set_length(ifr, descrlen);
		}
		sx_sunlock(&ifdescr_sx);
		break;

	case SIOCSIFDESCR:
		error = priv_check(td, PRIV_NET_SETIFDESCR);
		if (error)
			return (error);

		/*
		 * Copy only (length-1) bytes to make sure that
		 * if_description is always nul terminated.  The
		 * length parameter is supposed to count the
		 * terminating nul in.
		 */
		if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
			return (ENAMETOOLONG);
		else if (ifr_buffer_get_length(ifr) == 0)
			descrbuf = NULL;	/* zero length removes the description */
		else {
			/* M_ZERO supplies the terminating nul. */
			descrbuf = malloc(ifr_buffer_get_length(ifr),
			    M_IFDESCR, M_WAITOK | M_ZERO);
			error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
			    ifr_buffer_get_length(ifr) - 1);
			if (error) {
				free(descrbuf, M_IFDESCR);
				break;
			}
		}

		/* Swap the pointers under the lock; free the old one after. */
		sx_xlock(&ifdescr_sx);
		odescrbuf = ifp->if_description;
		ifp->if_description = descrbuf;
		sx_xunlock(&ifdescr_sx);

		getmicrotime(&ifp->if_lastchange);
		free(odescrbuf, M_IFDESCR);
		break;

	case SIOCGIFFIB:
		ifr->ifr_fib = ifp->if_fib;
		break;

	case SIOCSIFFIB:
		error = priv_check(td, PRIV_NET_SETIFFIB);
		if (error)
			return (error);
		if (ifr->ifr_fib >= rt_numfibs)
			return (EINVAL);

		ifp->if_fib = ifr->ifr_fib;
		break;

	case SIOCSIFFLAGS:
		error = priv_check(td, PRIV_NET_SETIFFLAGS);
		if (error)
			return (error);
		/*
		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
		 * check, so we don't need special handling here yet.
		 */
		new_flags = (ifr->ifr_flags & 0xffff) |
		    (ifr->ifr_flagshigh << 16);
		/*
		 * A transition to down is performed immediately; a
		 * transition to up is only noted and carried out after the
		 * driver has seen the new flags.
		 */
		if (ifp->if_flags & IFF_UP &&
		    (new_flags & IFF_UP) == 0) {
			if_down(ifp);
		} else if (new_flags & IFF_UP &&
		    (ifp->if_flags & IFF_UP) == 0) {
			do_ifup = 1;
		}
		/* See if permanently promiscuous mode bit is about to flip */
		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
			if (new_flags & IFF_PPROMISC)
				ifp->if_flags |= IFF_PROMISC;
			else if (ifp->if_pcount == 0)
				ifp->if_flags &= ~IFF_PROMISC;
			if (log_promisc_mode_change)
                                if_printf(ifp, "permanently promiscuous mode %s\n",
                                    ((new_flags & IFF_PPROMISC) ?
                                     "enabled" : "disabled"));
		}
		/* Keep the IFF_CANTCHANGE bits, take the rest from the caller. */
		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
			(new_flags &~ IFF_CANTCHANGE);
		/* The driver's return value is deliberately ignored here. */
		if (ifp->if_ioctl) {
			(void) (*ifp->if_ioctl)(ifp, cmd, data);
		}
		if (do_ifup)
			if_up(ifp);
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFCAP:
		error = priv_check(td, PRIV_NET_SETIFCAP);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		/* Reject requests for capabilities the interface lacks. */
		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
			return (EINVAL);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

#ifdef MAC
	case SIOCSIFMAC:
		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
		break;
#endif

	case SIOCSIFNAME:
		error = priv_check(td, PRIV_NET_SETIFNAME);
		if (error)
			return (error);
		error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
		    NULL);
		if (error != 0)
			return (error);
		if (new_name[0] == '\0')
			return (EINVAL);
		/* Renaming to the current name is a successful no-op. */
		if (strcmp(new_name, ifp->if_xname) == 0)
			break;
		if (ifunit(new_name) != NULL)
			return (EEXIST);

		/*
		 * XXX: Locking.  Nothing else seems to lock if_flags,
		 * and there are numerous other races with the
		 * ifunit() checks not being atomic with namespace
		 * changes (renames, vmoves, if_attach, etc).
		 */
		ifp->if_flags |= IFF_RENAMING;

		/* Announce the departure of the interface. */
		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);

		if_printf(ifp, "changing name to '%s'\n", new_name);

		/*
		 * Update if_xname and the name stored at the start of
		 * sdl_data in the interface's link-level address.
		 */
		IF_ADDR_WLOCK(ifp);
		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
		ifa = ifp->if_addr;
		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
		namelen = strlen(new_name);
		onamelen = sdl->sdl_nlen;
		/*
		 * Move the address if needed.  This is safe because we
		 * allocate space for a name of length IFNAMSIZ when we
		 * create this in if_attach().
		 */
		if (namelen != onamelen) {
			bcopy(sdl->sdl_data + onamelen,
			    sdl->sdl_data + namelen, sdl->sdl_alen);
		}
		bcopy(new_name, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		/* Rebuild the netmask: 0xff over the new name's bytes. */
		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
		bzero(sdl->sdl_data, onamelen);
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		IF_ADDR_WUNLOCK(ifp);

		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
		/* Announce the return of the interface. */
		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);

		ifp->if_flags &= ~IFF_RENAMING;
		break;

#ifdef VIMAGE
	case SIOCSIFVNET:
		error = priv_check(td, PRIV_NET_SETIFVNET);
		if (error)
			return (error);
		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
		break;
#endif

	case SIOCSIFMETRIC:
		error = priv_check(td, PRIV_NET_SETIFMETRIC);
		if (error)
			return (error);
		ifp->if_metric = ifr->ifr_metric;
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYS:
		error = priv_check(td, PRIV_NET_SETIFPHYS);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFMTU:
	{
		/* Remembered so an actual MTU change can be detected below. */
		u_long oldmtu = ifp->if_mtu;

		error = priv_check(td, PRIV_NET_SETIFMTU);
		if (error)
			return (error);
		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
			return (EINVAL);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		/* The driver is responsible for updating if_mtu. */
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0) {
			getmicrotime(&ifp->if_lastchange);
			rt_ifmsg(ifp);
#ifdef INET
			DEBUGNET_NOTIFY_MTU(ifp);
#endif
		}
		/*
		 * If the link MTU changed, do network layer specific procedure.
		 */
		if (ifp->if_mtu != oldmtu) {
#ifdef INET6
			nd6_setmtu(ifp);
#endif
			rt_updatemtu(ifp);
		}
		break;
	}

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		if (cmd == SIOCADDMULTI)
			error = priv_check(td, PRIV_NET_ADDMULTI);
		else
			error = priv_check(td, PRIV_NET_DELMULTI);
		if (error)
			return (error);

		/* Don't allow group membership on non-multicast interfaces. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return (EOPNOTSUPP);

		/* Don't let users screw up protocols' entries. */
		if (ifr->ifr_addr.sa_family != AF_LINK)
			return (EINVAL);

		if (cmd == SIOCADDMULTI) {
			struct epoch_tracker et;
			struct ifmultiaddr *ifma;

			/*
			 * Userland is only permitted to join groups once
			 * via the if_addmulti() KPI, because it cannot hold
			 * struct ifmultiaddr * between calls. It may also
			 * lose a race while we check if the membership
			 * already exists.
			 */
			NET_EPOCH_ENTER(et);
			ifma = if_findmulti(ifp, &ifr->ifr_addr);
			NET_EPOCH_EXIT(et);
			if (ifma != NULL)
				error = EADDRINUSE;
			else
				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
		} else {
			error = if_delmulti(ifp, &ifr->ifr_addr);
		}
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	/* Privileged requests that are implemented entirely by the driver. */
	case SIOCSIFPHYADDR:
	case SIOCDIFPHYADDR:
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif
	case SIOCSIFMEDIA:
	case SIOCSIFGENERIC:
		error = priv_check(td, PRIV_NET_HWIOCTL);
		if (error)
			return (error);
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	/* Read-only requests passed to the driver without a privilege check. */
	case SIOCGIFSTATUS:
	case SIOCGIFPSRCADDR:
	case SIOCGIFPDSTADDR:
	case SIOCGIFMEDIA:
	case SIOCGIFXMEDIA:
	case SIOCGIFGENERIC:
	case SIOCGIFRSSKEY:
	case SIOCGIFRSSHASH:
	case SIOCGIFDOWNREASON:
		if (ifp->if_ioctl == NULL)
			return (EOPNOTSUPP);
		error = (*ifp->if_ioctl)(ifp, cmd, data);
		break;

	case SIOCSIFLLADDR:
		error = priv_check(td, PRIV_NET_SETLLADDR);
		if (error)
			return (error);
		error = if_setlladdr(ifp,
		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
		break;

	case SIOCGHWADDR:
		error = if_gethwaddr(ifp, ifr);
		break;

	case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP):
		error = priv_check(td, PRIV_NET_ADDIFGROUP);
		if (error)
			return (error);
		if ((error = if_addgroup(ifp,
		    ifgr_group_get((struct ifgroupreq *)data))))
			return (error);
		break;

	case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP):
	{
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		error = if_getgroup((struct ifgroupreq *)data, ifp);
		NET_EPOCH_EXIT(et);
		break;
	}

	case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP):
		error = priv_check(td, PRIV_NET_DELIFGROUP);
		if (error)
			return (error);
		if ((error = if_delgroup(ifp,
		    ifgr_group_get((struct ifgroupreq *)data))))
			return (error);
		break;

	default:
		/* Not handled here; the caller tries other layers. */
		error = ENOIOCTL;
		break;
	}
	return (error);
}
2893
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit layout of the SIOCGIFCONF argument: the buffer/request pointer
 * is carried as a 32-bit integer.  ifioctl() converts it to a native
 * struct ifconf (via PTRIN) before calling ifconf().
 */
struct ifconf32 {
	int32_t	ifc_len;
	union {
		uint32_t	ifcu_buf;
		uint32_t	ifcu_req;
	} ifc_ifcu;
};
/* Request code as encoded with the size of the 32-bit structure. */
#define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
#endif
2904
2905#ifdef COMPAT_FREEBSD32
2906static void
2907ifmr_init(struct ifmediareq *ifmr, caddr_t data)
2908{
2909	struct ifmediareq32 *ifmr32;
2910
2911	ifmr32 = (struct ifmediareq32 *)data;
2912	memcpy(ifmr->ifm_name, ifmr32->ifm_name,
2913	    sizeof(ifmr->ifm_name));
2914	ifmr->ifm_current = ifmr32->ifm_current;
2915	ifmr->ifm_mask = ifmr32->ifm_mask;
2916	ifmr->ifm_status = ifmr32->ifm_status;
2917	ifmr->ifm_active = ifmr32->ifm_active;
2918	ifmr->ifm_count = ifmr32->ifm_count;
2919	ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist;
2920}
2921
2922static void
2923ifmr_update(const struct ifmediareq *ifmr, caddr_t data)
2924{
2925	struct ifmediareq32 *ifmr32;
2926
2927	ifmr32 = (struct ifmediareq32 *)data;
2928	ifmr32->ifm_current = ifmr->ifm_current;
2929	ifmr32->ifm_mask = ifmr->ifm_mask;
2930	ifmr32->ifm_status = ifmr->ifm_status;
2931	ifmr32->ifm_active = ifmr->ifm_active;
2932	ifmr32->ifm_count = ifmr->ifm_count;
2933}
2934#endif
2935
/*
 * Interface ioctls.
 *
 * Entry point for interface ioctls issued on socket 'so'; 'cmd' and
 * 'data' are the request and its argument, 'td' the calling thread.
 * Requests are tried in stages:
 *  1. SIOCGIFCONF (and its 32-bit variant), handled by ifconf();
 *  2. requests that need no interface reference (cloning, group
 *     membership listing, CARP, vnet reclaim);
 *  3. ifhwioctl() on the interface named in the request;
 *  4. the socket protocol's pru_control method and, if that returns
 *     EOPNOTSUPP, the driver's if_ioctl (except for legacy SIOCSIF*
 *     address requests).
 *
 * Returns 0 or an errno value; ENXIO if the named interface is not found
 * and, with VIMAGE, EBUSY if the socket's vnet is shutting down.
 */
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
#ifdef COMPAT_FREEBSD32
	caddr_t saved_data = NULL;
	struct ifmediareq ifmr;
	struct ifmediareq *ifmrp = NULL;
#endif
	struct ifnet *ifp;
	struct ifreq *ifr;
	int error;
	int oif_flags;
#ifdef VIMAGE
	bool shutdown;
#endif

	CURVNET_SET(so->so_vnet);
#ifdef VIMAGE
	/* Make sure the VNET is stable. */
	shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet);
	if (shutdown) {
		CURVNET_RESTORE();
		return (EBUSY);
	}
#endif

	switch (cmd) {
	case SIOCGIFCONF:
		error = ifconf(cmd, data);
		goto out_noref;

#ifdef COMPAT_FREEBSD32
	case SIOCGIFCONF32:
		{
			struct ifconf32 *ifc32;
			struct ifconf ifc;

			/* Build a native request from the 32-bit one. */
			ifc32 = (struct ifconf32 *)data;
			ifc.ifc_len = ifc32->ifc_len;
			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);

			error = ifconf(SIOCGIFCONF, (void *)&ifc);
			if (error == 0)
				ifc32->ifc_len = ifc.ifc_len;
			goto out_noref;
		}
#endif
	}

#ifdef COMPAT_FREEBSD32
	/*
	 * 32-bit media requests are converted to a native request on the
	 * stack; 'data' is redirected to it and the original pointer is
	 * saved so the results can be copied back at out_noref.
	 */
	switch (cmd) {
	case SIOCGIFMEDIA32:
	case SIOCGIFXMEDIA32:
		ifmrp = &ifmr;
		ifmr_init(ifmrp, data);
		cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
		saved_data = data;
		data = (caddr_t)ifmrp;
	}
#endif

	ifr = (struct ifreq *)data;
	/* Requests that do not need a referenced interface. */
	switch (cmd) {
#ifdef VIMAGE
	case SIOCSIFRVNET:
		error = priv_check(td, PRIV_NET_SETIFVNET);
		if (error == 0)
			error = if_vmove_reclaim(td, ifr->ifr_name,
			    ifr->ifr_jid);
		goto out_noref;
#endif
	case SIOCIFCREATE:
	case SIOCIFCREATE2:
		error = priv_check(td, PRIV_NET_IFCREATE);
		/* Only SIOCIFCREATE2 passes creation parameters along. */
		if (error == 0)
			error = if_clone_create(ifr->ifr_name,
			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
			    ifr_data_get_ptr(ifr) : NULL);
		goto out_noref;
	case SIOCIFDESTROY:
		error = priv_check(td, PRIV_NET_IFDESTROY);

		if (error == 0) {
			sx_xlock(&ifnet_detach_sxlock);
			error = if_clone_destroy(ifr->ifr_name);
			sx_xunlock(&ifnet_detach_sxlock);
		}
		goto out_noref;

	case SIOCIFGCLONERS:
		error = if_clone_list((struct if_clonereq *)data);
		goto out_noref;

	case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB):
		error = if_getgroupmembers((struct ifgroupreq *)data);
		goto out_noref;

#if defined(INET) || defined(INET6)
	case SIOCSVH:
	case SIOCGVH:
		if (carp_ioctl_p == NULL)
			error = EPROTONOSUPPORT;
		else
			error = (*carp_ioctl_p)(ifr, cmd, td);
		goto out_noref;
#endif
	}

	/* Everything else operates on the interface named in the request. */
	ifp = ifunit_ref(ifr->ifr_name);
	if (ifp == NULL) {
		error = ENXIO;
		goto out_noref;
	}

	/* ENOIOCTL means ifhwioctl() did not recognise the request. */
	error = ifhwioctl(cmd, ifp, data, td);
	if (error != ENOIOCTL)
		goto out_ref;

	/* Remember the flags to detect an IFF_UP change below. */
	oif_flags = ifp->if_flags;
	if (so->so_proto == NULL) {
		error = EOPNOTSUPP;
		goto out_ref;
	}

	/*
	 * Pass the request on to the socket control method, and if the
	 * latter returns EOPNOTSUPP, directly to the interface.
	 *
	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
	 * trust SIOCSIFADDR et al to come from an already privileged
	 * layer, and do not perform any credentials checks or input
	 * validation.
	 */
	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
	    ifp, td));
	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
		error = (*ifp->if_ioctl)(ifp, cmd, data);

	/* If the request brought the interface up, let IPv6 know. */
	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
		if (ifp->if_flags & IFF_UP)
			in6_if_up(ifp);
#endif
	}

out_ref:
	/* Drop the reference taken by ifunit_ref(). */
	if_rele(ifp);
out_noref:
#ifdef COMPAT_FREEBSD32
	/* Copy media results back into the caller's 32-bit request. */
	if (ifmrp != NULL) {
		KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA),
		    ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx",
		     cmd));
		data = saved_data;
		ifmr_update(ifmrp, data);
	}
#endif
	CURVNET_RESTORE();
	return (error);
}
3101
/*
 * The code common to handling reference counted flags,
 * e.g., in ifpromisc() and if_allmulti().
 * The "pflag" argument can specify a permanent mode flag to check,
 * such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
 *
 * "flag" is the interface flag being counted, "refcount" points to its
 * counter, and "onswitch" is non-zero to take a reference and zero to
 * drop one.  The flag itself is only toggled (and the driver called with
 * SIOCSIFFLAGS) on the first reference and on the last release.
 *
 * Returns 0 on success.  If the driver has no if_ioctl (EOPNOTSUPP) or
 * its if_ioctl fails, the counter and if_flags are restored and the error
 * is returned.
 *
 * Only to be used on stack-owned flags, not driver-owned flags.
 */
static int
if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
{
	struct ifreq ifr;
	int error;
	int oldflags, oldcount;

	/* Sanity checks to catch programming errors */
	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
	    ("%s: setting driver-owned flag %d", __func__, flag));

	if (onswitch)
		KASSERT(*refcount >= 0,
		    ("%s: increment negative refcount %d for flag %d",
		    __func__, *refcount, flag));
	else
		KASSERT(*refcount > 0,
		    ("%s: decrement non-positive refcount %d for flag %d",
		    __func__, *refcount, flag));

	/* In case this mode is permanent, just touch refcount */
	if (ifp->if_flags & pflag) {
		*refcount += onswitch ? 1 : -1;
		return (0);
	}

	/* Save ifnet parameters for if_ioctl() may fail */
	oldcount = *refcount;
	oldflags = ifp->if_flags;

	/*
	 * See if we aren't the only and touching refcount is enough.
	 * Actually toggle interface flag if we are the first or last.
	 */
	if (onswitch) {
		/* Post-increment: non-zero means someone held it already. */
		if ((*refcount)++)
			return (0);
		ifp->if_flags |= flag;
	} else {
		/* Pre-decrement: non-zero means references remain. */
		if (--(*refcount))
			return (0);
		ifp->if_flags &= ~flag;
	}

	/* Call down the driver since we've changed interface flags */
	if (ifp->if_ioctl == NULL) {
		error = EOPNOTSUPP;
		goto recover;
	}
	/* Only the two flag fields of the on-stack request are filled in. */
	ifr.ifr_flags = ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = ifp->if_flags >> 16;
	error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
	if (error)
		goto recover;
	/* Notify userland that interface flags have changed */
	rt_ifmsg(ifp);
	return (0);

recover:
	/* Recover after driver error */
	*refcount = oldcount;
	ifp->if_flags = oldflags;
	return (error);
}
3174
3175/*
3176 * Set/clear promiscuous mode on interface ifp based on the truth value
3177 * of pswitch.  The calls are reference counted so that only the first
3178 * "on" request actually has an effect, as does the final "off" request.
3179 * Results are undefined if the "off" and "on" requests are not matched.
3180 */
3181int
3182ifpromisc(struct ifnet *ifp, int pswitch)
3183{
3184	int error;
3185	int oldflags = ifp->if_flags;
3186
3187	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
3188			   &ifp->if_pcount, pswitch);
3189	/* If promiscuous mode status has changed, log a message */
3190	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
3191            log_promisc_mode_change)
3192		if_printf(ifp, "promiscuous mode %s\n",
3193		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
3194	return (error);
3195}
3196
/*
 * Return interface configuration
 * of system.  List may be used
 * in later ioctl's (above) to get
 * other information.
 *
 * 'data' points to a struct ifconf whose ifc_len gives the size of the
 * user buffer ifc_req.  One struct ifreq is produced per visible address
 * of every interface (an address longer than a struct sockaddr is
 * appended in full after the ifreq header), or a single ifreq with an
 * empty address for an interface with no visible addresses.  On return
 * ifc_len holds the number of bytes copied out.
 *
 * Returns 0 or an errno value: EINVAL for a non-positive ifc_len,
 * ENAMETOOLONG if an interface name does not fit, or the copyout() error.
 */
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
	struct ifconf *ifc = (struct ifconf *)data;
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct ifreq ifr;
	struct sbuf *sb;
	int error, full = 0, valid_len, max_len;

	/* Limit initial buffer size to maxphys to avoid DoS from userspace. */
	max_len = maxphys - 1;

	/* Prevent hostile input from being able to crash the system */
	if (ifc->ifc_len <= 0)
		return (EINVAL);

again:
	/*
	 * 'full' records that the staging buffer is as large as the user's
	 * buffer, so retrying with a larger one would be pointless.
	 */
	if (ifc->ifc_len <= max_len) {
		max_len = ifc->ifc_len;
		full = 1;
	}
	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
	/*
	 * From here on max_len counts the bytes that were attempted and
	 * valid_len the bytes that actually fit into the sbuf.
	 */
	max_len = 0;
	valid_len = 0;

	IFNET_RLOCK();
	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
		struct epoch_tracker et;
		int addrs;

		/*
		 * Zero the ifr to make sure we don't disclose the contents
		 * of the stack.
		 */
		memset(&ifr, 0, sizeof(ifr));

		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
		    >= sizeof(ifr.ifr_name)) {
			sbuf_delete(sb);
			IFNET_RUNLOCK();
			return (ENAMETOOLONG);
		}

		addrs = 0;
		NET_EPOCH_ENTER(et);
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			struct sockaddr *sa = ifa->ifa_addr;

			/* Skip addresses prison_if() rejects for this credential. */
			if (prison_if(curthread->td_ucred, sa) != 0)
				continue;
			addrs++;
			if (sa->sa_len <= sizeof(*sa)) {
				/* Address fits inside the ifreq itself. */
				if (sa->sa_len < sizeof(*sa)) {
					memset(&ifr.ifr_ifru.ifru_addr, 0,
					    sizeof(ifr.ifr_ifru.ifru_addr));
					memcpy(&ifr.ifr_ifru.ifru_addr, sa,
					    sa->sa_len);
				} else
					ifr.ifr_ifru.ifru_addr = *sa;
				sbuf_bcat(sb, &ifr, sizeof(ifr));
				max_len += sizeof(ifr);
			} else {
				/* Oversized address: header, then full sockaddr. */
				sbuf_bcat(sb, &ifr,
				    offsetof(struct ifreq, ifr_addr));
				max_len += offsetof(struct ifreq, ifr_addr);
				sbuf_bcat(sb, sa, sa->sa_len);
				max_len += sa->sa_len;
			}

			/* Only advance valid_len while the sbuf has not overflowed. */
			if (sbuf_error(sb) == 0)
				valid_len = sbuf_len(sb);
		}
		NET_EPOCH_EXIT(et);
		if (addrs == 0) {
			/* No visible address: emit a name-only entry. */
			sbuf_bcat(sb, &ifr, sizeof(ifr));
			max_len += sizeof(ifr);

			if (sbuf_error(sb) == 0)
				valid_len = sbuf_len(sb);
		}
	}
	IFNET_RUNLOCK();

	/*
	 * If we didn't allocate enough space (uncommon), try again.  If
	 * we have already allocated as much space as we are allowed,
	 * return what we've got.
	 */
	if (valid_len != max_len && !full) {
		sbuf_delete(sb);
		goto again;
	}

	ifc->ifc_len = valid_len;
	sbuf_finish(sb);
	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
	sbuf_delete(sb);
	return (error);
}
3304
3305/*
3306 * Just like ifpromisc(), but for all-multicast-reception mode.
3307 */
3308int
3309if_allmulti(struct ifnet *ifp, int onswitch)
3310{
3311
3312	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
3313}
3314
3315struct ifmultiaddr *
3316if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
3317{
3318	struct ifmultiaddr *ifma;
3319
3320	IF_ADDR_LOCK_ASSERT(ifp);
3321
3322	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
3323		if (sa->sa_family == AF_LINK) {
3324			if (sa_dl_equal(ifma->ifma_addr, sa))
3325				break;
3326		} else {
3327			if (sa_equal(ifma->ifma_addr, sa))
3328				break;
3329		}
3330	}
3331
3332	return ifma;
3333}
3334
3335/*
3336 * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
3337 * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
3338 * the ifnet multicast address list here, so the caller must do that and
3339 * other setup work (such as notifying the device driver).  The reference
3340 * count is initialized to 1.
3341 */
3342static struct ifmultiaddr *
3343if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
3344    int mflags)
3345{
3346	struct ifmultiaddr *ifma;
3347	struct sockaddr *dupsa;
3348
3349	ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
3350	    M_ZERO);
3351	if (ifma == NULL)
3352		return (NULL);
3353
3354	dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
3355	if (dupsa == NULL) {
3356		free(ifma, M_IFMADDR);
3357		return (NULL);
3358	}
3359	bcopy(sa, dupsa, sa->sa_len);
3360	ifma->ifma_addr = dupsa;
3361
3362	ifma->ifma_ifp = ifp;
3363	ifma->ifma_refcount = 1;
3364	ifma->ifma_protospec = NULL;
3365
3366	if (llsa == NULL) {
3367		ifma->ifma_lladdr = NULL;
3368		return (ifma);
3369	}
3370
3371	dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
3372	if (dupsa == NULL) {
3373		free(ifma->ifma_addr, M_IFMADDR);
3374		free(ifma, M_IFMADDR);
3375		return (NULL);
3376	}
3377	bcopy(llsa, dupsa, llsa->sa_len);
3378	ifma->ifma_lladdr = dupsa;
3379
3380	return (ifma);
3381}
3382
3383/*
3384 * if_freemulti: free ifmultiaddr structure and possibly attached related
3385 * addresses.  The caller is responsible for implementing reference
3386 * counting, notifying the driver, handling routing messages, and releasing
3387 * any dependent link layer state.
3388 */
3389#ifdef MCAST_VERBOSE
3390extern void kdb_backtrace(void);
3391#endif
3392static void
3393if_freemulti_internal(struct ifmultiaddr *ifma)
3394{
3395
3396	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
3397	    ifma->ifma_refcount));
3398
3399	if (ifma->ifma_lladdr != NULL)
3400		free(ifma->ifma_lladdr, M_IFMADDR);
3401#ifdef MCAST_VERBOSE
3402	kdb_backtrace();
3403	printf("%s freeing ifma: %p\n", __func__, ifma);
3404#endif
3405	free(ifma->ifma_addr, M_IFMADDR);
3406	free(ifma, M_IFMADDR);
3407}
3408
3409static void
3410if_destroymulti(epoch_context_t ctx)
3411{
3412	struct ifmultiaddr *ifma;
3413
3414	ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx);
3415	if_freemulti_internal(ifma);
3416}
3417
3418void
3419if_freemulti(struct ifmultiaddr *ifma)
3420{
3421	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d",
3422	    ifma->ifma_refcount));
3423
3424	NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx);
3425}
3426
/*
 * Register an additional multicast address with a network interface.
 *
 * - If the address is already present, bump the reference count on the
 *   address and return.  No routing message is generated and the driver
 *   is not notified in that case.
 * - If the address is not link-layer, look up a link layer address.
 * - Allocate address structures for one or both addresses, and attach to the
 *   multicast address list on the interface.  If automatically adding a link
 *   layer address, the protocol address will own a reference to the link
 *   layer address, to be freed when it is freed.
 * - Notify the network device driver of an addition to the multicast address
 *   list.
 *
 * 'sa' points to caller-owned memory with the desired multicast address.
 *
 * 'retifma' will be used to return a pointer to the resulting multicast
 * address reference, if desired.  It may be NULL.
 *
 * Returns 0 on success, the error reported by the interface's
 * if_resolvemulti method if resolution fails, or ENOMEM if an
 * ifmultiaddr cannot be allocated.
 */
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
    struct ifmultiaddr **retifma)
{
	struct ifmultiaddr *ifma, *ll_ifma;
	struct sockaddr *llsa;
	struct sockaddr_dl sdl;
	int error;

#ifdef INET
	IN_MULTI_LIST_UNLOCK_ASSERT();
#endif
#ifdef INET6
	IN6_MULTI_LIST_UNLOCK_ASSERT();
#endif
	/*
	 * If the address is already present, return a new reference to it;
	 * otherwise, allocate storage and set up a new address.
	 */
	IF_ADDR_WLOCK(ifp);
	ifma = if_findmulti(ifp, sa);
	if (ifma != NULL) {
		ifma->ifma_refcount++;
		if (retifma != NULL)
			*retifma = ifma;
		IF_ADDR_WUNLOCK(ifp);
		return (0);
	}

	/*
	 * The address isn't already present; resolve the protocol address
	 * into a link layer address, and then look that up, bump its
	 * refcount or allocate an ifma for that also.
	 * Most link layer resolving functions return address data which
	 * fits inside the default sockaddr_dl structure.  However, the
	 * callback can allocate another sockaddr structure; in that case we
	 * need to free it later.
	 */
	llsa = NULL;
	ll_ifma = NULL;
	if (ifp->if_resolvemulti != NULL) {
		/* Provide called function with buffer size information */
		sdl.sdl_len = sizeof(sdl);
		llsa = (struct sockaddr *)&sdl;
		error = ifp->if_resolvemulti(ifp, &llsa, sa);
		if (error)
			goto unlock_out;
	}

	/*
	 * Allocate the new address.  Don't hook it up yet, as we may also
	 * need to allocate a link layer multicast address.
	 */
	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
	if (ifma == NULL) {
		error = ENOMEM;
		goto free_llsa_out;
	}

	/*
	 * If a link layer address is found, we'll need to see if it's
	 * already present in the address list, or allocate it as well.
	 * When this block finishes, the link layer address will be on the
	 * list.
	 */
	if (llsa != NULL) {
		ll_ifma = if_findmulti(ifp, llsa);
		if (ll_ifma == NULL) {
			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
			if (ll_ifma == NULL) {
				/*
				 * Drop the initial reference so that
				 * if_freemulti()'s refcount assertion holds.
				 */
				--ifma->ifma_refcount;
				if_freemulti(ifma);
				error = ENOMEM;
				goto free_llsa_out;
			}
			ll_ifma->ifma_flags |= IFMA_F_ENQUEUED;
			CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
			    ifma_link);
		} else
			ll_ifma->ifma_refcount++;
		ifma->ifma_llifma = ll_ifma;
	}

	/*
	 * We now have a new multicast address, ifma, and possibly a new or
	 * referenced link layer address.  Add the primary address to the
	 * ifnet address list.
	 */
	ifma->ifma_flags |= IFMA_F_ENQUEUED;
	CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);

	if (retifma != NULL)
		*retifma = ifma;

	/*
	 * Must generate the message while holding the lock so that 'ifma'
	 * pointer is still valid.
	 */
	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
	IF_ADDR_WUNLOCK(ifp);

	/*
	 * We are certain we have added something, so call down to the
	 * interface to let them know about it.  If this thread cannot
	 * sleep, the ioctl is deferred to the if_addmultitask task instead
	 * of being issued directly.
	 */
	if (ifp->if_ioctl != NULL) {
		if (THREAD_CAN_SLEEP())
			(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
		else
			taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask);
	}

	/* Free the link layer sockaddr only if the resolver replaced 'sdl'. */
	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

	return (0);

free_llsa_out:
	/* Same ownership rule as above: never free the on-stack 'sdl'. */
	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
		link_free_sdl(llsa);

unlock_out:
	IF_ADDR_WUNLOCK(ifp);
	return (error);
}
3570
3571static void
3572if_siocaddmulti(void *arg, int pending)
3573{
3574	struct ifnet *ifp;
3575
3576	ifp = arg;
3577#ifdef DIAGNOSTIC
3578	if (pending > 1)
3579		if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending);
3580#endif
3581	CURVNET_SET(ifp->if_vnet);
3582	(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
3583	CURVNET_RESTORE();
3584}
3585
3586/*
3587 * Delete a multicast group membership by network-layer group address.
3588 *
3589 * Returns ENOENT if the entry could not be found. If ifp no longer
3590 * exists, results are undefined. This entry point should only be used
3591 * from subsystems which do appropriate locking to hold ifp for the
3592 * duration of the call.
3593 * Network-layer protocol domains must use if_delmulti_ifma().
3594 */
3595int
3596if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
3597{
3598	struct ifmultiaddr *ifma;
3599	int lastref;
3600
3601	KASSERT(ifp, ("%s: NULL ifp", __func__));
3602
3603	IF_ADDR_WLOCK(ifp);
3604	lastref = 0;
3605	ifma = if_findmulti(ifp, sa);
3606	if (ifma != NULL)
3607		lastref = if_delmulti_locked(ifp, ifma, 0);
3608	IF_ADDR_WUNLOCK(ifp);
3609
3610	if (ifma == NULL)
3611		return (ENOENT);
3612
3613	if (lastref && ifp->if_ioctl != NULL) {
3614		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
3615	}
3616
3617	return (0);
3618}
3619
3620/*
3621 * Delete all multicast group membership for an interface.
3622 * Should be used to quickly flush all multicast filters.
3623 */
3624void
3625if_delallmulti(struct ifnet *ifp)
3626{
3627	struct ifmultiaddr *ifma;
3628	struct ifmultiaddr *next;
3629
3630	IF_ADDR_WLOCK(ifp);
3631	CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
3632		if_delmulti_locked(ifp, ifma, 0);
3633	IF_ADDR_WUNLOCK(ifp);
3634}
3635
/*
 * Delete a multicast group membership by pointer; shorthand for
 * if_delmulti_ifma_flags() with a flags value of 0.
 */
void
if_delmulti_ifma(struct ifmultiaddr *ifma)
{
	const int noflags = 0;

	if_delmulti_ifma_flags(ifma, noflags);
}
3641
/*
 * Delete a multicast group membership by group membership pointer.
 * Network-layer protocol domains must use this routine.
 *
 * It is safe to call this routine if the ifp disappeared.
 *
 * 'flags' is passed through unchanged as the 'detaching' argument of
 * if_delmulti_locked().
 */
void
if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags)
{
	struct ifnet *ifp;
	int lastref;
	MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma);
#ifdef INET
	IN_MULTI_LIST_UNLOCK_ASSERT();
#endif
	ifp = ifma->ifma_ifp;
#ifdef DIAGNOSTIC
	if (ifp == NULL) {
		printf("%s: ifma_ifp seems to be detached\n", __func__);
	} else {
		struct epoch_tracker et;
		struct ifnet *oifp;

		/*
		 * Confirm that ifp is still on the V_ifnet list; if it is
		 * not, treat the interface as gone.
		 */
		NET_EPOCH_ENTER(et);
		CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
			if (ifp == oifp)
				break;
		NET_EPOCH_EXIT(et);
		if (ifp != oifp)
			ifp = NULL;
	}
#endif
	/*
	 * If and only if the ifnet instance exists: Acquire the address lock.
	 */
	if (ifp != NULL)
		IF_ADDR_WLOCK(ifp);

	/* Non-zero when the final reference to ifma was released. */
	lastref = if_delmulti_locked(ifp, ifma, flags);

	if (ifp != NULL) {
		/*
		 * If and only if the ifnet instance exists:
		 *  Release the address lock.
		 *  If the group was left: update the hardware hash filter.
		 */
		IF_ADDR_WUNLOCK(ifp);
		if (lastref && ifp->if_ioctl != NULL) {
			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
		}
	}
}
3694
/*
 * Perform deletion of network-layer and/or link-layer multicast address.
 *
 * 'ifp' may be NULL; when both it and ifma->ifma_ifp are non-NULL they
 * must match and the interface address write lock must be held.  From
 * then on the function works with ifma->ifma_ifp rather than the passed
 * 'ifp'.  'detaching' is non-zero when the ifnet is being detached.
 *
 * Return 0 if the reference count was decremented.
 * Return 1 if the final reference was released, indicating that the
 * hardware hash filter should be reprogrammed.
 */
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
	struct ifmultiaddr *ll_ifma;

	if (ifp != NULL && ifma->ifma_ifp != NULL) {
		KASSERT(ifma->ifma_ifp == ifp,
		    ("%s: inconsistent ifp %p", __func__, ifp));
		IF_ADDR_WLOCK_ASSERT(ifp);
	}

	/* Use the ifma's own idea of its interface, which may be NULL. */
	ifp = ifma->ifma_ifp;
	MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");

	/*
	 * If the ifnet is detaching, null out references to ifnet,
	 * so that upper protocol layers will notice, and not attempt
	 * to obtain locks for an ifnet which no longer exists. The
	 * routing socket announcement must happen before the ifnet
	 * instance is detached from the system.
	 */
	if (detaching) {
#ifdef DIAGNOSTIC
		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
		/*
		 * ifp may already be nulled out if we are being reentered
		 * to delete the ll_ifma.
		 */
		if (ifp != NULL) {
			rt_newmaddrmsg(RTM_DELMADDR, ifma);
			ifma->ifma_ifp = NULL;
		}
	}

	/* Other references remain: nothing more to tear down. */
	if (--ifma->ifma_refcount > 0)
		return 0;

	/*
	 * Last reference: unlink from the interface list, but only when
	 * not detaching and the entry is actually enqueued.
	 */
	if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) {
		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
	}
	/*
	 * If this ifma is a network-layer ifma, a link-layer ifma may
	 * have been associated with it. Release it first if so.
	 */
	ll_ifma = ifma->ifma_llifma;
	if (ll_ifma != NULL) {
		KASSERT(ifma->ifma_lladdr != NULL,
		    ("%s: llifma w/o lladdr", __func__));
		if (detaching)
			ll_ifma->ifma_ifp = NULL;	/* XXX */
		if (--ll_ifma->ifma_refcount == 0) {
			if (ifp != NULL) {
				if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
					CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
						ifma_link);
					ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
				}
			}
			if_freemulti(ll_ifma);
		}
	}
#ifdef INVARIANTS
	/* Sanity check: ifma must no longer be on the interface's list. */
	if (ifp) {
		struct ifmultiaddr *ifmatmp;

		CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
			MPASS(ifma != ifmatmp);
	}
#endif
	if_freemulti(ifma);
	/*
	 * The last reference to this instance of struct ifmultiaddr
	 * was released; the hardware should be notified of this change.
	 */
	return 1;
}
3780
3781/*
3782 * Set the link layer address on an interface.
3783 *
3784 * At this time we only support certain types of interfaces,
3785 * and we don't allow the length of the address to change.
3786 *
3787 * Set noinline to be dtrace-friendly
3788 */
3789__noinline int
3790if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
3791{
3792	struct sockaddr_dl *sdl;
3793	struct ifaddr *ifa;
3794	struct ifreq ifr;
3795
3796	ifa = ifp->if_addr;
3797	if (ifa == NULL)
3798		return (EINVAL);
3799
3800	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
3801	if (sdl == NULL)
3802		return (EINVAL);
3803
3804	if (len != sdl->sdl_alen)	/* don't allow length to change */
3805		return (EINVAL);
3806
3807	switch (ifp->if_type) {
3808	case IFT_ETHER:
3809	case IFT_XETHER:
3810	case IFT_L2VLAN:
3811	case IFT_BRIDGE:
3812	case IFT_IEEE8023ADLAG:
3813		bcopy(lladdr, LLADDR(sdl), len);
3814		break;
3815	default:
3816		return (ENODEV);
3817	}
3818
3819	/*
3820	 * If the interface is already up, we need
3821	 * to re-init it in order to reprogram its
3822	 * address filter.
3823	 */
3824	if ((ifp->if_flags & IFF_UP) != 0) {
3825		if (ifp->if_ioctl) {
3826			ifp->if_flags &= ~IFF_UP;
3827			ifr.ifr_flags = ifp->if_flags & 0xffff;
3828			ifr.ifr_flagshigh = ifp->if_flags >> 16;
3829			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
3830			ifp->if_flags |= IFF_UP;
3831			ifr.ifr_flags = ifp->if_flags & 0xffff;
3832			ifr.ifr_flagshigh = ifp->if_flags >> 16;
3833			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
3834		}
3835	}
3836	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
3837
3838	return (0);
3839}
3840
3841/*
3842 * Compat function for handling basic encapsulation requests.
3843 * Not converted stacks (FDDI, IB, ..) supports traditional
3844 * output model: ARP (and other similar L2 protocols) are handled
3845 * inside output routine, arpresolve/nd6_resolve() returns MAC
3846 * address instead of full prepend.
3847 *
3848 * This function creates calculated header==MAC for IPv4/IPv6 and
3849 * returns EAFNOSUPPORT (which is then handled in ARP code) for other
3850 * address families.
3851 */
3852static int
3853if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
3854{
3855
3856	if (req->rtype != IFENCAP_LL)
3857		return (EOPNOTSUPP);
3858
3859	if (req->bufsize < req->lladdr_len)
3860		return (ENOMEM);
3861
3862	switch (req->family) {
3863	case AF_INET:
3864	case AF_INET6:
3865		break;
3866	default:
3867		return (EAFNOSUPPORT);
3868	}
3869
3870	/* Copy lladdr to storage as is */
3871	memmove(req->buf, req->lladdr, req->lladdr_len);
3872	req->bufsize = req->lladdr_len;
3873	req->lladdr_off = 0;
3874
3875	return (0);
3876}
3877
3878/*
3879 * Tunnel interfaces can nest, also they may cause infinite recursion
3880 * calls when misconfigured. We'll prevent this by detecting loops.
3881 * High nesting level may cause stack exhaustion. We'll prevent this
3882 * by introducing upper limit.
3883 *
3884 * Return 0, if tunnel nesting count is equal or less than limit.
3885 */
3886int
3887if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie,
3888    int limit)
3889{
3890	struct m_tag *mtag;
3891	int count;
3892
3893	count = 1;
3894	mtag = NULL;
3895	while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) {
3896		if (*(struct ifnet **)(mtag + 1) == ifp) {
3897			log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
3898			return (EIO);
3899		}
3900		count++;
3901	}
3902	if (count > limit) {
3903		log(LOG_NOTICE,
3904		    "%s: if_output recursively called too many times(%d)\n",
3905		    if_name(ifp), count);
3906		return (EIO);
3907	}
3908	mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT);
3909	if (mtag == NULL)
3910		return (ENOMEM);
3911	*(struct ifnet **)(mtag + 1) = ifp;
3912	m_tag_prepend(m, mtag);
3913	return (0);
3914}
3915
3916/*
3917 * Get the link layer address that was read from the hardware at attach.
3918 *
3919 * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type
3920 * their component interfaces as IFT_IEEE8023ADLAG.
3921 */
3922int
3923if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr)
3924{
3925
3926	if (ifp->if_hw_addr == NULL)
3927		return (ENODEV);
3928
3929	switch (ifp->if_type) {
3930	case IFT_ETHER:
3931	case IFT_IEEE8023ADLAG:
3932		bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen);
3933		return (0);
3934	default:
3935		return (ENODEV);
3936	}
3937}
3938
3939/*
3940 * The name argument must be a pointer to storage which will last as
3941 * long as the interface does.  For physical devices, the result of
3942 * device_get_name(dev) is a good choice and for pseudo-devices a
3943 * static string works well.
3944 */
3945void
3946if_initname(struct ifnet *ifp, const char *name, int unit)
3947{
3948	ifp->if_dname = name;
3949	ifp->if_dunit = unit;
3950	if (unit != IF_DUNIT_NONE)
3951		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
3952	else
3953		strlcpy(ifp->if_xname, name, IFNAMSIZ);
3954}
3955
3956int
3957if_printf(struct ifnet *ifp, const char *fmt, ...)
3958{
3959	char if_fmt[256];
3960	va_list ap;
3961
3962	snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt);
3963	va_start(ap, fmt);
3964	vlog(LOG_INFO, if_fmt, ap);
3965	va_end(ap);
3966	return (0);
3967}
3968
3969void
3970if_start(struct ifnet *ifp)
3971{
3972
3973	(*(ifp)->if_start)(ifp);
3974}
3975
/*
 * Backwards-compatible transmit routine for drivers that do not supply
 * one of their own: hand the mbuf off through IFQ_HANDOFF() and return
 * the status it reports.
 */
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
	int rc;

	IFQ_HANDOFF(ifp, m, rc);
	return (rc);
}
3988
/*
 * Input routine that ignores the interface argument and simply frees
 * the mbuf it is handed.
 */
static void
if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
{

	m_freem(m);
}
3995
3996int
3997if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
3998{
3999	int active = 0;
4000
4001	IF_LOCK(ifq);
4002	if (_IF_QFULL(ifq)) {
4003		IF_UNLOCK(ifq);
4004		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4005		m_freem(m);
4006		return (0);
4007	}
4008	if (ifp != NULL) {
4009		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
4010		if (m->m_flags & (M_BCAST|M_MCAST))
4011			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
4012		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
4013	}
4014	_IF_ENQUEUE(ifq, m);
4015	IF_UNLOCK(ifq);
4016	if (ifp != NULL && !active)
4017		(*(ifp)->if_start)(ifp);
4018	return (1);
4019}
4020
/*
 * Register the allocator 'a' and matching free routine 'f' for interface
 * type 'type' in the if_com_alloc[] / if_com_free[] tables.  Both slots
 * are asserted to be empty beforehand.
 */
void
if_register_com_alloc(u_char type,
    if_com_alloc_t *a, if_com_free_t *f)
{

	KASSERT(if_com_alloc[type] == NULL,
	    ("if_register_com_alloc: %d already registered", type));
	KASSERT(if_com_free[type] == NULL,
	    ("if_register_com_alloc: %d free already registered", type));

	if_com_alloc[type] = a;
	if_com_free[type] = f;
}
4034
/*
 * Remove the allocator and free routine registered for interface type
 * 'type'.  Both slots are asserted to be populated beforehand.
 */
void
if_deregister_com_alloc(u_char type)
{

	KASSERT(if_com_alloc[type] != NULL,
	    ("if_deregister_com_alloc: %d not registered", type));
	KASSERT(if_com_free[type] != NULL,
	    ("if_deregister_com_alloc: %d free not registered", type));

	/*
	 * Ensure all pending EPOCH(9) callbacks have been executed.  This
	 * avoids a late invocation of if_destroy() after the table entries
	 * are cleared, which would leak the if_l2com that was allocated
	 * through if_com_alloc[type].
	 */
	epoch_drain_callbacks(net_epoch_preempt);

	if_com_alloc[type] = NULL;
	if_com_free[type] = NULL;
}
4054
/* API for driver access to the network-stack-owned ifnet. */
4056uint64_t
4057if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
4058{
4059	uint64_t oldbrate;
4060
4061	oldbrate = ifp->if_baudrate;
4062	ifp->if_baudrate = baudrate;
4063	return (oldbrate);
4064}
4065
4066uint64_t
4067if_getbaudrate(if_t ifp)
4068{
4069
4070	return (((struct ifnet *)ifp)->if_baudrate);
4071}
4072
4073int
4074if_setcapabilities(if_t ifp, int capabilities)
4075{
4076	((struct ifnet *)ifp)->if_capabilities = capabilities;
4077	return (0);
4078}
4079
4080int
4081if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
4082{
4083	((struct ifnet *)ifp)->if_capabilities |= setbit;
4084	((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
4085
4086	return (0);
4087}
4088
4089int
4090if_getcapabilities(if_t ifp)
4091{
4092	return ((struct ifnet *)ifp)->if_capabilities;
4093}
4094
4095int
4096if_setcapenable(if_t ifp, int capabilities)
4097{
4098	((struct ifnet *)ifp)->if_capenable = capabilities;
4099	return (0);
4100}
4101
4102int
4103if_setcapenablebit(if_t ifp, int setcap, int clearcap)
4104{
4105	if(setcap)
4106		((struct ifnet *)ifp)->if_capenable |= setcap;
4107	if(clearcap)
4108		((struct ifnet *)ifp)->if_capenable &= ~clearcap;
4109
4110	return (0);
4111}
4112
4113const char *
4114if_getdname(if_t ifp)
4115{
4116	return ((struct ifnet *)ifp)->if_dname;
4117}
4118
4119int
4120if_togglecapenable(if_t ifp, int togglecap)
4121{
4122	((struct ifnet *)ifp)->if_capenable ^= togglecap;
4123	return (0);
4124}
4125
4126int
4127if_getcapenable(if_t ifp)
4128{
4129	return ((struct ifnet *)ifp)->if_capenable;
4130}
4131
/*
 * This is largely undesirable because it ties ifnet to a device, but does
 * provide flexibility for an embedded product vendor. Should be used with
 * the understanding that it violates the interface boundaries, and should be
 * a last resort only.
 *
 * As written here the function ignores both arguments and returns 0.
 */
int
if_setdev(if_t ifp, void *dev)
{
	return (0);
}
4143
4144int
4145if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
4146{
4147	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
4148	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
4149
4150	return (0);
4151}
4152
4153int
4154if_getdrvflags(if_t ifp)
4155{
4156	return ((struct ifnet *)ifp)->if_drv_flags;
4157}
4158
4159int
4160if_setdrvflags(if_t ifp, int flags)
4161{
4162	((struct ifnet *)ifp)->if_drv_flags = flags;
4163	return (0);
4164}
4165
4166int
4167if_setflags(if_t ifp, int flags)
4168{
4169
4170	ifp->if_flags = flags;
4171	return (0);
4172}
4173
4174int
4175if_setflagbits(if_t ifp, int set, int clear)
4176{
4177	((struct ifnet *)ifp)->if_flags |= set;
4178	((struct ifnet *)ifp)->if_flags &= ~clear;
4179
4180	return (0);
4181}
4182
4183int
4184if_getflags(if_t ifp)
4185{
4186	return ((struct ifnet *)ifp)->if_flags;
4187}
4188
4189int
4190if_clearhwassist(if_t ifp)
4191{
4192	((struct ifnet *)ifp)->if_hwassist = 0;
4193	return (0);
4194}
4195
4196int
4197if_sethwassistbits(if_t ifp, int toset, int toclear)
4198{
4199	((struct ifnet *)ifp)->if_hwassist |= toset;
4200	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
4201
4202	return (0);
4203}
4204
4205int
4206if_sethwassist(if_t ifp, int hwassist_bit)
4207{
4208	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
4209	return (0);
4210}
4211
4212int
4213if_gethwassist(if_t ifp)
4214{
4215	return ((struct ifnet *)ifp)->if_hwassist;
4216}
4217
4218int
4219if_setmtu(if_t ifp, int mtu)
4220{
4221	((struct ifnet *)ifp)->if_mtu = mtu;
4222	return (0);
4223}
4224
4225int
4226if_getmtu(if_t ifp)
4227{
4228	return ((struct ifnet *)ifp)->if_mtu;
4229}
4230
4231int
4232if_getmtu_family(if_t ifp, int family)
4233{
4234	struct domain *dp;
4235
4236	for (dp = domains; dp; dp = dp->dom_next) {
4237		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
4238			return (dp->dom_ifmtu((struct ifnet *)ifp));
4239	}
4240
4241	return (((struct ifnet *)ifp)->if_mtu);
4242}
4243
/*
 * Methods for drivers to access interface unicast and multicast
 * link level addresses.  Drivers shall know neither 'struct ifaddr' nor
 * 'struct ifmultiaddr'.
 */
4249u_int
4250if_lladdr_count(if_t ifp)
4251{
4252	struct epoch_tracker et;
4253	struct ifaddr *ifa;
4254	u_int count;
4255
4256	count = 0;
4257	NET_EPOCH_ENTER(et);
4258	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
4259		if (ifa->ifa_addr->sa_family == AF_LINK)
4260			count++;
4261	NET_EPOCH_EXIT(et);
4262
4263	return (count);
4264}
4265
4266u_int
4267if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
4268{
4269	struct epoch_tracker et;
4270	struct ifaddr *ifa;
4271	u_int count;
4272
4273	MPASS(cb);
4274
4275	count = 0;
4276	NET_EPOCH_ENTER(et);
4277	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
4278		if (ifa->ifa_addr->sa_family != AF_LINK)
4279			continue;
4280		count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr,
4281		    count);
4282	}
4283	NET_EPOCH_EXIT(et);
4284
4285	return (count);
4286}
4287
4288u_int
4289if_llmaddr_count(if_t ifp)
4290{
4291	struct epoch_tracker et;
4292	struct ifmultiaddr *ifma;
4293	int count;
4294
4295	count = 0;
4296	NET_EPOCH_ENTER(et);
4297	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
4298		if (ifma->ifma_addr->sa_family == AF_LINK)
4299			count++;
4300	NET_EPOCH_EXIT(et);
4301
4302	return (count);
4303}
4304
4305u_int
4306if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
4307{
4308	struct epoch_tracker et;
4309	struct ifmultiaddr *ifma;
4310	u_int count;
4311
4312	MPASS(cb);
4313
4314	count = 0;
4315	NET_EPOCH_ENTER(et);
4316	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
4317		if (ifma->ifma_addr->sa_family != AF_LINK)
4318			continue;
4319		count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr,
4320		    count);
4321	}
4322	NET_EPOCH_EXIT(et);
4323
4324	return (count);
4325}
4326
4327int
4328if_setsoftc(if_t ifp, void *softc)
4329{
4330	((struct ifnet *)ifp)->if_softc = softc;
4331	return (0);
4332}
4333
4334void *
4335if_getsoftc(if_t ifp)
4336{
4337	return ((struct ifnet *)ifp)->if_softc;
4338}
4339
4340void
4341if_setrcvif(struct mbuf *m, if_t ifp)
4342{
4343
4344	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
4345	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
4346}
4347
4348void
4349if_setvtag(struct mbuf *m, uint16_t tag)
4350{
4351	m->m_pkthdr.ether_vtag = tag;
4352}
4353
4354uint16_t
4355if_getvtag(struct mbuf *m)
4356{
4357
4358	return (m->m_pkthdr.ether_vtag);
4359}
4360
4361int
4362if_sendq_empty(if_t ifp)
4363{
4364	return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
4365}
4366
4367struct ifaddr *
4368if_getifaddr(if_t ifp)
4369{
4370	return ((struct ifnet *)ifp)->if_addr;
4371}
4372
4373int
4374if_getamcount(if_t ifp)
4375{
4376	return ((struct ifnet *)ifp)->if_amcount;
4377}
4378
4379int
4380if_setsendqready(if_t ifp)
4381{
4382	IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
4383	return (0);
4384}
4385
4386int
4387if_setsendqlen(if_t ifp, int tx_desc_count)
4388{
4389	IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
4390	((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
4391
4392	return (0);
4393}
4394
4395int
4396if_vlantrunkinuse(if_t ifp)
4397{
4398	return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
4399}
4400
4401int
4402if_input(if_t ifp, struct mbuf* sendmp)
4403{
4404	(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
4405	return (0);
4406
4407}
4408
4409struct mbuf *
4410if_dequeue(if_t ifp)
4411{
4412	struct mbuf *m;
4413	IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
4414
4415	return (m);
4416}
4417
4418int
4419if_sendq_prepend(if_t ifp, struct mbuf *m)
4420{
4421	IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
4422	return (0);
4423}
4424
4425int
4426if_setifheaderlen(if_t ifp, int len)
4427{
4428	((struct ifnet *)ifp)->if_hdrlen = len;
4429	return (0);
4430}
4431
4432caddr_t
4433if_getlladdr(if_t ifp)
4434{
4435	return (IF_LLADDR((struct ifnet *)ifp));
4436}
4437
4438void *
4439if_gethandle(u_char type)
4440{
4441	return (if_alloc(type));
4442}
4443
4444void
4445if_bpfmtap(if_t ifh, struct mbuf *m)
4446{
4447	struct ifnet *ifp = (struct ifnet *)ifh;
4448
4449	BPF_MTAP(ifp, m);
4450}
4451
4452void
4453if_etherbpfmtap(if_t ifh, struct mbuf *m)
4454{
4455	struct ifnet *ifp = (struct ifnet *)ifh;
4456
4457	ETHER_BPF_MTAP(ifp, m);
4458}
4459
4460void
4461if_vlancap(if_t ifh)
4462{
4463	struct ifnet *ifp = (struct ifnet *)ifh;
4464	VLAN_CAPABILITIES(ifp);
4465}
4466
4467int
4468if_sethwtsomax(if_t ifp, u_int if_hw_tsomax)
4469{
4470
4471	((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax;
4472        return (0);
4473}
4474
4475int
4476if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount)
4477{
4478
4479	((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount;
4480        return (0);
4481}
4482
4483int
4484if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
4485{
4486
4487	((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
4488        return (0);
4489}
4490
4491u_int
4492if_gethwtsomax(if_t ifp)
4493{
4494
4495	return (((struct ifnet *)ifp)->if_hw_tsomax);
4496}
4497
4498u_int
4499if_gethwtsomaxsegcount(if_t ifp)
4500{
4501
4502	return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount);
4503}
4504
4505u_int
4506if_gethwtsomaxsegsize(if_t ifp)
4507{
4508
4509	return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize);
4510}
4511
4512void
4513if_setinitfn(if_t ifp, void (*init_fn)(void *))
4514{
4515	((struct ifnet *)ifp)->if_init = init_fn;
4516}
4517
4518void
4519if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
4520{
4521	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
4522}
4523
4524void
4525if_setstartfn(if_t ifp, void (*start_fn)(if_t))
4526{
4527	((struct ifnet *)ifp)->if_start = (void *)start_fn;
4528}
4529
4530void
4531if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
4532{
4533	((struct ifnet *)ifp)->if_transmit = start_fn;
4534}
4535
4536void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
4537{
4538	((struct ifnet *)ifp)->if_qflush = flush_fn;
4539
4540}
4541
4542void
4543if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
4544{
4545
4546	ifp->if_get_counter = fn;
4547}
4548
/* Revisit these: they were originally inline functions. */
4550int
4551drbr_inuse_drv(if_t ifh, struct buf_ring *br)
4552{
4553	return drbr_inuse(ifh, br);
4554}
4555
4556struct mbuf*
4557drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
4558{
4559	return drbr_dequeue(ifh, br);
4560}
4561
4562int
4563drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
4564{
4565	return drbr_needs_enqueue(ifh, br);
4566}
4567
4568int
4569drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
4570{
4571	return drbr_enqueue(ifh, br, m);
4572
4573}
4574