1/*
2 * Copyright (c) 1999-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections.  This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/malloc.h>
39#include <sys/mbuf.h>
40#include <sys/socket.h>
41#include <sys/domain.h>
42#include <sys/user.h>
43#include <sys/random.h>
44#include <sys/socketvar.h>
45#include <net/if_dl.h>
46#include <net/if.h>
47#include <net/route.h>
48#include <net/if_var.h>
49#include <net/dlil.h>
50#include <net/if_arp.h>
51#include <net/iptap.h>
52#include <sys/kern_event.h>
53#include <sys/kdebug.h>
54#include <sys/mcache.h>
55
56#include <kern/assert.h>
57#include <kern/task.h>
58#include <kern/thread.h>
59#include <kern/sched_prim.h>
60#include <kern/locks.h>
61#include <kern/zalloc.h>
62#include <net/kpi_protocol.h>
63
64#include <net/if_types.h>
65#include <net/if_llreach.h>
66#include <net/kpi_interfacefilter.h>
67#include <net/classq/classq.h>
68#include <net/classq/classq_sfb.h>
69
70#if INET
71#include <netinet/in_var.h>
72#include <netinet/igmp_var.h>
73#include <netinet/ip_var.h>
74#include <netinet/tcp.h>
75#include <netinet/tcp_var.h>
76#include <netinet/udp.h>
77#include <netinet/udp_var.h>
78#include <netinet/if_ether.h>
79#include <netinet/in_pcb.h>
80#endif /* INET */
81
82#if INET6
83#include <netinet6/in6_var.h>
84#include <netinet6/nd6.h>
85#include <netinet6/mld6_var.h>
86#endif /* INET6 */
87
88#if NETAT
89#include <netat/at_var.h>
90#endif /* NETAT */
91
92#include <libkern/OSAtomic.h>
93
94#include <machine/machine_routines.h>
95
96#include <mach/thread_act.h>
97#include <mach/sdt.h>
98
99#if CONFIG_MACF_NET
100#include <security/mac_framework.h>
101#endif /* MAC_NET */
102
103#if PF
104#include <net/pfvar.h>
105#endif /* PF */
106#if PF_ALTQ
107#include <net/altq/altq.h>
108#endif /* PF_ALTQ */
109#include <net/pktsched/pktsched.h>
110
111#define DBG_LAYER_BEG		DLILDBG_CODE(DBG_DLIL_STATIC, 0)
112#define DBG_LAYER_END		DLILDBG_CODE(DBG_DLIL_STATIC, 2)
113#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
114#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
115#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
116
117
118#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
119#define MAX_LINKADDR	    4 /* LONGWORDS */
120#define M_NKE M_IFADDR
121
122#if 1
123#define DLIL_PRINTF	printf
124#else
125#define DLIL_PRINTF	kprintf
126#endif
127
128#define	IF_DATA_REQUIRE_ALIGNED_64(f)	\
129	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
130
131#define	IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)	\
132	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
133
134enum {
135	kProtoKPI_v1	= 1,
136	kProtoKPI_v2	= 2
137};
138
139/*
140 * List of if_proto structures in if_proto_hash[] is protected by
141 * the ifnet lock.  The rest of the fields are initialized at protocol
142 * attach time and never change, thus no lock required as long as
143 * a reference to it is valid, via if_proto_ref().
144 */
145struct if_proto {
146    SLIST_ENTRY(if_proto)	next_hash;
147    u_int32_t			refcount;
148    u_int32_t			detached;
149    struct ifnet		*ifp;
150    protocol_family_t		protocol_family;
151    int				proto_kpi;
152    union {
153		struct {
154			proto_media_input		input;
155			proto_media_preout		pre_output;
156			proto_media_event		event;
157			proto_media_ioctl		ioctl;
158			proto_media_detached		detached;
159			proto_media_resolve_multi	resolve_multi;
160			proto_media_send_arp		send_arp;
161		} v1;
162		struct {
163			proto_media_input_v2		input;
164			proto_media_preout		pre_output;
165			proto_media_event		event;
166			proto_media_ioctl		ioctl;
167			proto_media_detached		detached;
168			proto_media_resolve_multi	resolve_multi;
169			proto_media_send_arp		send_arp;
170		} v2;
171	} kpi;
172};
173
174SLIST_HEAD(proto_hash_entry, if_proto);
175
176#define	DLIL_SDLMAXLEN	64
177#define	DLIL_SDLDATALEN	\
178	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
179
180struct dlil_ifnet {
181	struct ifnet	dl_if;			/* public ifnet */
182	/*
183	 * DLIL private fields, protected by dl_if_lock
184	 */
185	decl_lck_mtx_data(, dl_if_lock);
186	TAILQ_ENTRY(dlil_ifnet) dl_if_link;	/* dlil_ifnet link */
187	u_int32_t dl_if_flags;			/* flags (below) */
188	u_int32_t dl_if_refcnt;			/* refcnt */
189	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
190	void	*dl_if_uniqueid;		/* unique interface id */
191	size_t	dl_if_uniqueid_len;		/* length of the unique id */
192	char	dl_if_namestorage[IFNAMSIZ];	/* interface name storage */
193	struct {
194		struct ifaddr	ifa;		/* lladdr ifa */
195		u_int8_t	asdl[DLIL_SDLMAXLEN]; /* addr storage */
196		u_int8_t	msdl[DLIL_SDLMAXLEN]; /* mask storage */
197	} dl_if_lladdr;
198	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
199	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
200	ctrace_t	dl_if_attach;		/* attach PC stacktrace */
201	ctrace_t	dl_if_detach;		/* detach PC stacktrace */
202};
203
204/* Values for dl_if_flags (private to DLIL) */
205#define	DLIF_INUSE	0x1	/* DLIL ifnet recycler, ifnet in use */
206#define	DLIF_REUSE	0x2	/* DLIL ifnet recycles, ifnet is not new */
207#define	DLIF_DEBUG	0x4	/* has debugging info */
208
209#define	IF_REF_TRACE_HIST_SIZE	8	/* size of ref trace history */
210
211/* For gdb */
212__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
213
214struct dlil_ifnet_dbg {
215	struct dlil_ifnet	dldbg_dlif;		/* dlil_ifnet */
216	u_int16_t		dldbg_if_refhold_cnt;	/* # ifnet references */
217	u_int16_t		dldbg_if_refrele_cnt;	/* # ifnet releases */
218	/*
219	 * Circular lists of ifnet_{reference,release} callers.
220	 */
221	ctrace_t		dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
222	ctrace_t		dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
223};
224
225#define	DLIL_TO_IFP(s)	(&s->dl_if)
226#define	IFP_TO_DLIL(s)	((struct dlil_ifnet *)s)
227
228struct ifnet_filter {
229	TAILQ_ENTRY(ifnet_filter)	filt_next;
230	u_int32_t			filt_skip;
231	ifnet_t				filt_ifp;
232	const char			*filt_name;
233	void				*filt_cookie;
234	protocol_family_t		filt_protocol;
235	iff_input_func			filt_input;
236	iff_output_func			filt_output;
237	iff_event_func			filt_event;
238	iff_ioctl_func			filt_ioctl;
239	iff_detached_func		filt_detached;
240};
241
242struct proto_input_entry;
243
244static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
245static lck_grp_t *dlil_lock_group;
246lck_grp_t *ifnet_lock_group;
247static lck_grp_t *ifnet_head_lock_group;
248static lck_grp_t *ifnet_snd_lock_group;
249static lck_grp_t *ifnet_rcv_lock_group;
250lck_attr_t *ifnet_lock_attr;
251decl_lck_rw_data(static, ifnet_head_lock);
252decl_lck_mtx_data(static, dlil_ifnet_lock);
253u_int32_t dlil_filter_count = 0;
254extern u_int32_t	ipv4_ll_arp_aware;
255
256struct sfb_fc_list ifnet_fclist;
257decl_lck_mtx_data(static, ifnet_fclist_lock);
258
259static unsigned int ifnet_fcezone_size;		/* size of ifnet_fce */
260static struct zone *ifnet_fcezone;		/* zone for ifnet_fce */
261
262#define IFNET_FCEZONE_MAX	32		/* maximum elements in zone */
263#define IFNET_FCEZONE_NAME	"ifnet_fcezone"	/* zone name */
264
265static void ifnet_fc_thread_func(void *, wait_result_t);
266static void ifnet_fc_init(void);
267
268#if DEBUG
269static unsigned int ifnet_debug = 1;	/* debugging (enabled) */
270#else
271static unsigned int ifnet_debug;	/* debugging (disabled) */
272#endif /* !DEBUG */
273static unsigned int dlif_size;		/* size of dlil_ifnet to allocate */
274static unsigned int dlif_bufsize;	/* size of dlif_size + headroom */
275static struct zone *dlif_zone;		/* zone for dlil_ifnet */
276
277#define	DLIF_ZONE_MAX		64		/* maximum elements in zone */
278#define	DLIF_ZONE_NAME		"ifnet"		/* zone name */
279
280static unsigned int dlif_filt_size;	/* size of ifnet_filter */
281static struct zone *dlif_filt_zone;	/* zone for ifnet_filter */
282
283#define	DLIF_FILT_ZONE_MAX	8		/* maximum elements in zone */
284#define	DLIF_FILT_ZONE_NAME	"ifnet_filter"	/* zone name */
285
286static unsigned int dlif_phash_size;	/* size of ifnet proto hash table */
287static struct zone *dlif_phash_zone;	/* zone for ifnet proto hash table */
288
289#define	DLIF_PHASH_ZONE_MAX	DLIF_ZONE_MAX	/* maximum elements in zone */
290#define	DLIF_PHASH_ZONE_NAME	"ifnet_proto_hash" /* zone name */
291
292static unsigned int dlif_proto_size;	/* size of if_proto */
293static struct zone *dlif_proto_zone;	/* zone for if_proto */
294
295#define	DLIF_PROTO_ZONE_MAX	(DLIF_ZONE_MAX*2) /* maximum elements in zone */
296#define	DLIF_PROTO_ZONE_NAME	"ifnet_proto"	/* zone name */
297
298static unsigned int dlif_tcpstat_size;		/* size of tcpstat_local to allocate */
299static unsigned int dlif_tcpstat_bufsize;	/* size of dlif_tcpstat_size + headroom */
300static struct zone *dlif_tcpstat_zone;		/* zone for tcpstat_local */
301
302#define	DLIF_TCPSTAT_ZONE_MAX	1		/* maximum elements in zone */
303#define	DLIF_TCPSTAT_ZONE_NAME	"ifnet_tcpstat"	/* zone name */
304
305static unsigned int dlif_udpstat_size;		/* size of udpstat_local to allocate */
306static unsigned int dlif_udpstat_bufsize;	/* size of dlif_udpstat_size + headroom */
307static struct zone *dlif_udpstat_zone;		/* zone for udpstat_local */
308
309#define	DLIF_UDPSTAT_ZONE_MAX	1		/* maximum elements in zone */
310#define	DLIF_UDPSTAT_ZONE_NAME	"ifnet_udpstat"	/* zone name */
311
312/*
313 * Updating this variable should be done by first acquiring the global
314 * radix node head (rnh_lock), in tandem with settting/clearing the
315 * PR_AGGDRAIN for routedomain.
316 */
317u_int32_t ifnet_aggressive_drainers;
318static u_int32_t net_rtref;
319
320static struct dlil_main_threading_info dlil_main_input_thread_info;
321__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
322    (struct dlil_threading_info *)&dlil_main_input_thread_info;
323
324static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg);
325static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
326static void dlil_if_trace(struct dlil_ifnet *, int);
327static void if_proto_ref(struct if_proto *);
328static void if_proto_free(struct if_proto *);
329static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
330static int dlil_ifp_proto_count(struct ifnet *);
331static void if_flt_monitor_busy(struct ifnet *);
332static void if_flt_monitor_unbusy(struct ifnet *);
333static void if_flt_monitor_enter(struct ifnet *);
334static void if_flt_monitor_leave(struct ifnet *);
335static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
336    char **, protocol_family_t);
337static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
338    protocol_family_t);
339static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
340    const struct sockaddr_dl *);
341static int ifnet_lookup(struct ifnet *);
342static void if_purgeaddrs(struct ifnet *);
343
344static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
345    struct mbuf *, char *);
346static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
347    struct mbuf *);
348static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
349    mbuf_t *, const struct sockaddr *, void *, char *, char *);
350static void ifproto_media_event(struct ifnet *, protocol_family_t,
351    const struct kev_msg *);
352static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
353    unsigned long, void *);
354static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
355    struct sockaddr_dl *, size_t);
356static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
357    const struct sockaddr_dl *, const struct sockaddr *,
358    const struct sockaddr_dl *, const struct sockaddr *);
359
360static errno_t ifp_if_output(struct ifnet *, struct mbuf *);
361static void ifp_if_start(struct ifnet *);
362static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
363    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
364static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
365static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
366    protocol_family_t *);
367static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
368    const struct ifnet_demux_desc *, u_int32_t);
369static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
370static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
371static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
372    const struct sockaddr *, const char *, const char *
373#if CONFIG_EMBEDDED
374    ,
375    u_int32_t *, u_int32_t *
376#endif /* CONFIG_EMBEDDED */
377    );
378static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
379static void ifp_if_free(struct ifnet *);
380static void ifp_if_event(struct ifnet *, const struct kev_msg *);
381static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
382static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
383
384static void dlil_main_input_thread_func(void *, wait_result_t);
385static void dlil_input_thread_func(void *, wait_result_t);
386static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
387static void dlil_rxpoll_calc_limits(struct dlil_threading_info *);
388static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *);
389static void dlil_terminate_input_thread(struct dlil_threading_info *);
390static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
391    struct dlil_threading_info *, boolean_t);
392static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *);
393static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
394    u_int32_t, ifnet_model_t, boolean_t);
395static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
396    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
397
398static void ifnet_detacher_thread_func(void *, wait_result_t);
399static int ifnet_detacher_thread_cont(int);
400static void ifnet_detach_final(struct ifnet *);
401static void ifnet_detaching_enqueue(struct ifnet *);
402static struct ifnet *ifnet_detaching_dequeue(void);
403
404static void ifnet_start_thread_fn(void *, wait_result_t);
405static void ifnet_poll_thread_fn(void *, wait_result_t);
406static void ifnet_poll(struct ifnet *);
407
408static void ifp_src_route_copyout(struct ifnet *, struct route *);
409static void ifp_src_route_copyin(struct ifnet *, struct route *);
410#if INET6
411static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
412static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
413#endif /* INET6 */
414
415static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
416static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
417static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
418
419/* The following are protected by dlil_ifnet_lock */
420static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
421static u_int32_t ifnet_detaching_cnt;
422static void *ifnet_delayed_run;	/* wait channel for detaching thread */
423
424extern void bpfdetach(struct ifnet*);
425extern void proto_input_run(void);
426
427extern uint32_t udp_count_opportunistic(unsigned int ifindex,
428	u_int32_t flags);
429extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
430	u_int32_t flags);
431
432__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
433
434#if DEBUG
435static int dlil_verbose = 1;
436#else
437static int dlil_verbose = 0;
438#endif /* DEBUG */
439#if IFNET_INPUT_SANITY_CHK
440/* sanity checking of input packet lists received */
441static u_int32_t dlil_input_sanity_check = 0;
442#endif /* IFNET_INPUT_SANITY_CHK */
443/* rate limit debug messages */
444struct timespec dlil_dbgrate = { 1, 0 };
445
446SYSCTL_DECL(_net_link_generic_system);
447
448SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
449    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
450
451#define	IF_SNDQ_MINLEN	32
452u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
453SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
454    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
455    sysctl_sndq_maxlen, "I", "Default transmit queue max length");
456
457#define	IF_RCVQ_MINLEN	32
458#define IF_RCVQ_MAXLEN	256
459u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
460SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
461    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
462    sysctl_rcvq_maxlen, "I", "Default receive queue max length");
463
464#define	IF_RXPOLL_DECAY	2		/* ilog2 of EWMA decay rate (4) */
465static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
466SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
467    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
468    "ilog2 of EWMA decay rate of avg inbound packets");
469
470#define	IF_RXPOLL_MODE_HOLDTIME	(1000ULL * 1000 * 1000)	/* 1 sec */
471static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
472SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
473    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
474    "input poll mode freeze time");
475
476#define	IF_RXPOLL_SAMPLETIME	(10ULL * 1000 * 1000)	/* 10 ms */
477static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
478SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
479    CTLFLAG_RD | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
480    "input poll sampling time");
481
482#define	IF_RXPOLL_INTERVAL_TIME	(1ULL * 1000 * 1000)	/* 1 ms */
483static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVAL_TIME;
484SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
485    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
486    "input poll interval (time)");
487
488#define	IF_RXPOLL_INTERVAL_PKTS	0			/* 0 (disabled) */
489static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
490SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
491    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
492    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
493
494#define	IF_RXPOLL_WLOWAT		5
495static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
496SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
497    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat, IF_RXPOLL_WLOWAT,
498    "input poll wakeup low watermark");
499
500#define	IF_RXPOLL_WHIWAT		100
501static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
502SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
503    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat, IF_RXPOLL_WHIWAT,
504    "input poll wakeup high watermark");
505
506static u_int32_t if_rxpoll_max = 0;			/* 0 (automatic) */
507SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
508    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
509    "max packets per poll call");
510
511static u_int32_t if_rxpoll = 1;
512SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
513    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
514    sysctl_rxpoll, "I", "enable opportunistic input polling");
515
516u_int32_t if_bw_smoothing_val = 3;
517SYSCTL_UINT(_net_link_generic_system, OID_AUTO, if_bw_smoothing_val,
518    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_smoothing_val, 0, "");
519
520u_int32_t if_bw_measure_size = 10;
521SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size,
522    CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_measure_size, 0, "");
523
524static u_int32_t cur_dlil_input_threads = 0;
525SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
526    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0,
527    "Current number of DLIL input threads");
528
529#if IFNET_INPUT_SANITY_CHK
530SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
531    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0,
532    "Turn on sanity checking in DLIL input");
533#endif /* IFNET_INPUT_SANITY_CHK */
534
535static u_int32_t if_flowadv = 1;
536SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
537    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
538    "enable flow-advisory mechanism");
539
540unsigned int net_rxpoll = 1;
541unsigned int net_affinity = 1;
542static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
543
544extern u_int32_t	inject_buckets;
545
546static	lck_grp_attr_t	*dlil_grp_attributes = NULL;
547static	lck_attr_t	*dlil_lck_attributes = NULL;
548
549#define PROTO_HASH_SLOTS	0x5
550
551#define	DLIL_INPUT_CHECK(m, ifp) {					\
552	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);			\
553	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||	\
554	    !(mbuf_flags(m) & MBUF_PKTHDR)) {				\
555		panic_plain("%s: invalid mbuf %p\n", __func__, m);	\
556		/* NOTREACHED */					\
557	}								\
558}
559
560#define	DLIL_EWMA(old, new, decay) do {					\
561	u_int32_t _avg;							\
562	if ((_avg = (old)) > 0)						\
563		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay);	\
564	else								\
565		_avg = (new);						\
566	(old) = _avg;							\
567} while (0)
568
569#define	MBPS	(1ULL * 1000 * 1000)
570#define	GBPS	(MBPS * 1000)
571
572struct rxpoll_time_tbl {
573	u_int64_t	speed;		/* downlink speed */
574	u_int32_t	plowat;		/* packets low watermark */
575	u_int32_t	phiwat;		/* packets high watermark */
576	u_int32_t	blowat;		/* bytes low watermark */
577	u_int32_t	bhiwat;		/* bytes high watermark */
578};
579
580static struct rxpoll_time_tbl rxpoll_tbl[] = {
581	{  10 * MBPS,	2,	8,	(1 * 1024),	(6 * 1024)	},
582	{ 100 * MBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
583	{   1 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
584	{  10 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
585	{ 100 * GBPS,	10,	40,	(4 * 1024),	(64 * 1024)	},
586	{ 0, 0, 0, 0, 0 }
587};
588
589/*
590 * Internal functions.
591 */
592
593static int
594proto_hash_value(u_int32_t protocol_family)
595{
596	/*
597	 * dlil_proto_unplumb_all() depends on the mapping between
598	 * the hash bucket index and the protocol family defined
599	 * here; future changes must be applied there as well.
600	 */
601	switch(protocol_family) {
602		case PF_INET:
603			return (0);
604		case PF_INET6:
605			return (1);
606		case PF_APPLETALK:
607			return (2);
608		case PF_VLAN:
609			return (3);
610		case PF_UNSPEC:
611		default:
612			return (4);
613	}
614}
615
616/*
617 * Caller must already be holding ifnet lock.
618 */
619static struct if_proto *
620find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
621{
622	struct if_proto *proto = NULL;
623	u_int32_t i = proto_hash_value(protocol_family);
624
625	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
626
627	if (ifp->if_proto_hash != NULL)
628		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
629
630	while (proto != NULL && proto->protocol_family != protocol_family)
631		proto = SLIST_NEXT(proto, next_hash);
632
633	if (proto != NULL)
634		if_proto_ref(proto);
635
636	return (proto);
637}
638
639static void
640if_proto_ref(struct if_proto *proto)
641{
642	atomic_add_32(&proto->refcount, 1);
643}
644
645extern void if_rtproto_del(struct ifnet *ifp, int protocol);
646
647static void
648if_proto_free(struct if_proto *proto)
649{
650	u_int32_t oldval;
651	struct ifnet *ifp = proto->ifp;
652	u_int32_t proto_family = proto->protocol_family;
653	struct kev_dl_proto_data ev_pr_data;
654
655	oldval = atomic_add_32_ov(&proto->refcount, -1);
656	if (oldval > 1)
657		return;
658
659	/* No more reference on this, protocol must have been detached */
660	VERIFY(proto->detached);
661
662	if (proto->proto_kpi == kProtoKPI_v1) {
663		if (proto->kpi.v1.detached)
664			proto->kpi.v1.detached(ifp, proto->protocol_family);
665	}
666	if (proto->proto_kpi == kProtoKPI_v2) {
667		if (proto->kpi.v2.detached)
668			proto->kpi.v2.detached(ifp, proto->protocol_family);
669	}
670
671	/*
672	 * Cleanup routes that may still be in the routing table for that
673	 * interface/protocol pair.
674	 */
675	if_rtproto_del(ifp, proto_family);
676
677	/*
678	 * The reserved field carries the number of protocol still attached
679	 * (subject to change)
680	 */
681	ifnet_lock_shared(ifp);
682	ev_pr_data.proto_family = proto_family;
683	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
684	ifnet_lock_done(ifp);
685
686	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
687	    (struct net_event_data *)&ev_pr_data,
688	    sizeof(struct kev_dl_proto_data));
689
690	zfree(dlif_proto_zone, proto);
691}
692
693__private_extern__ void
694ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
695{
696	unsigned int type = 0;
697	int ass = 1;
698
699	switch (what) {
700	case IFNET_LCK_ASSERT_EXCLUSIVE:
701		type = LCK_RW_ASSERT_EXCLUSIVE;
702		break;
703
704	case IFNET_LCK_ASSERT_SHARED:
705		type = LCK_RW_ASSERT_SHARED;
706		break;
707
708	case IFNET_LCK_ASSERT_OWNED:
709		type = LCK_RW_ASSERT_HELD;
710		break;
711
712	case IFNET_LCK_ASSERT_NOTOWNED:
713		/* nothing to do here for RW lock; bypass assert */
714		ass = 0;
715		break;
716
717	default:
718		panic("bad ifnet assert type: %d", what);
719		/* NOTREACHED */
720	}
721	if (ass)
722		lck_rw_assert(&ifp->if_lock, type);
723}
724
725__private_extern__ void
726ifnet_lock_shared(struct ifnet *ifp)
727{
728	lck_rw_lock_shared(&ifp->if_lock);
729}
730
731__private_extern__ void
732ifnet_lock_exclusive(struct ifnet *ifp)
733{
734	lck_rw_lock_exclusive(&ifp->if_lock);
735}
736
737__private_extern__ void
738ifnet_lock_done(struct ifnet *ifp)
739{
740	lck_rw_done(&ifp->if_lock);
741}
742
743__private_extern__ void
744ifnet_head_lock_shared(void)
745{
746	lck_rw_lock_shared(&ifnet_head_lock);
747}
748
749__private_extern__ void
750ifnet_head_lock_exclusive(void)
751{
752	lck_rw_lock_exclusive(&ifnet_head_lock);
753}
754
755__private_extern__ void
756ifnet_head_done(void)
757{
758	lck_rw_done(&ifnet_head_lock);
759}
760
761/*
762 * Caller must already be holding ifnet lock.
763 */
764static int
765dlil_ifp_proto_count(struct ifnet * ifp)
766{
767	int i, count = 0;
768
769	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
770
771	if (ifp->if_proto_hash == NULL)
772		goto done;
773
774	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
775		struct if_proto *proto;
776		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
777			count++;
778		}
779	}
780done:
781	return (count);
782}
783
784__private_extern__ void
785dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
786    u_int32_t event_code, struct net_event_data *event_data,
787    u_int32_t event_data_len)
788{
789	struct net_event_data ev_data;
790	struct kev_msg ev_msg;
791
792	bzero(&ev_msg, sizeof (ev_msg));
793	bzero(&ev_data, sizeof (ev_data));
794	/*
795	 * a net event always starts with a net_event_data structure
796	 * but the caller can generate a simple net event or
797	 * provide a longer event structure to post
798	 */
799	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
800	ev_msg.kev_class	= KEV_NETWORK_CLASS;
801	ev_msg.kev_subclass	= event_subclass;
802	ev_msg.event_code	= event_code;
803
804	if (event_data == NULL) {
805		event_data = &ev_data;
806		event_data_len = sizeof(struct net_event_data);
807	}
808
809	strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
810	event_data->if_family = ifp->if_family;
811	event_data->if_unit   = (u_int32_t) ifp->if_unit;
812
813	ev_msg.dv[0].data_length = event_data_len;
814	ev_msg.dv[0].data_ptr    = event_data;
815	ev_msg.dv[1].data_length = 0;
816
817	dlil_event_internal(ifp, &ev_msg);
818}
819
820__private_extern__ int
821dlil_alloc_local_stats(struct ifnet *ifp)
822{
823	int ret = EINVAL;
824	void *buf, *base, **pbuf;
825
826	if (ifp == NULL)
827		goto end;
828
829	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
830		/* allocate tcpstat_local structure */
831		buf = zalloc(dlif_tcpstat_zone);
832		if (buf == NULL) {
833			ret = ENOMEM;
834			goto end;
835		}
836		bzero(buf, dlif_tcpstat_bufsize);
837
838		/* Get the 64-bit aligned base address for this object */
839		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
840		    sizeof (u_int64_t));
841		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
842		    ((intptr_t)buf + dlif_tcpstat_bufsize));
843
844		/*
845		 * Wind back a pointer size from the aligned base and
846		 * save the original address so we can free it later.
847		 */
848		pbuf = (void **)((intptr_t)base - sizeof (void *));
849		*pbuf = buf;
850		ifp->if_tcp_stat = base;
851
852		/* allocate udpstat_local structure */
853		buf = zalloc(dlif_udpstat_zone);
854		if (buf == NULL) {
855			ret = ENOMEM;
856			goto end;
857		}
858		bzero(buf, dlif_udpstat_bufsize);
859
860		/* Get the 64-bit aligned base address for this object */
861		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
862		    sizeof (u_int64_t));
863		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
864		    ((intptr_t)buf + dlif_udpstat_bufsize));
865
866		/*
867		 * Wind back a pointer size from the aligned base and
868		 * save the original address so we can free it later.
869		 */
870		pbuf = (void **)((intptr_t)base - sizeof (void *));
871		*pbuf = buf;
872		ifp->if_udp_stat = base;
873
874		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof (u_int64_t)) &&
875		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof (u_int64_t)));
876
877		ret = 0;
878	}
879
880end:
881	if (ret != 0) {
882		if (ifp->if_tcp_stat != NULL) {
883			pbuf = (void **)
884			    ((intptr_t)ifp->if_tcp_stat - sizeof (void *));
885			zfree(dlif_tcpstat_zone, *pbuf);
886			ifp->if_tcp_stat = NULL;
887		}
888		if (ifp->if_udp_stat != NULL) {
889			pbuf = (void **)
890			    ((intptr_t)ifp->if_udp_stat - sizeof (void *));
891			zfree(dlif_udpstat_zone, *pbuf);
892			ifp->if_udp_stat = NULL;
893		}
894	}
895
896	return (ret);
897}
898
899static int
900dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp)
901{
902	thread_continue_t func;
903	u_int32_t limit;
904	int error;
905
906	/* NULL ifp indicates the main input thread, called at dlil_init time */
907	if (ifp == NULL) {
908		func = dlil_main_input_thread_func;
909		VERIFY(inp == dlil_main_input_thread);
910		(void) strlcat(inp->input_name,
911		    "main_input", DLIL_THREADNAME_LEN);
912	} else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
913		func = dlil_rxpoll_input_thread_func;
914		VERIFY(inp != dlil_main_input_thread);
915		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
916		    "%s%d_input_poll", ifp->if_name, ifp->if_unit);
917	} else {
918		func = dlil_input_thread_func;
919		VERIFY(inp != dlil_main_input_thread);
920		(void) snprintf(inp->input_name, DLIL_THREADNAME_LEN,
921		    "%s%d_input", ifp->if_name, ifp->if_unit);
922	}
923	VERIFY(inp->input_thr == THREAD_NULL);
924
925	inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes);
926	lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes);
927
928	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
929	inp->ifp = ifp;		/* NULL for main input thread */
930
931	net_timerclear(&inp->mode_holdtime);
932	net_timerclear(&inp->mode_lasttime);
933	net_timerclear(&inp->sample_holdtime);
934	net_timerclear(&inp->sample_lasttime);
935	net_timerclear(&inp->dbg_lasttime);
936
937	/*
938	 * For interfaces that support opportunistic polling, set the
939	 * low and high watermarks for outstanding inbound packets/bytes.
940	 * Also define freeze times for transitioning between modes
941	 * and updating the average.
942	 */
943	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
944		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
945		dlil_rxpoll_calc_limits(inp);
946	} else {
947		limit = (u_int32_t)-1;
948	}
949
950	_qinit(&inp->rcvq_pkts, Q_DROPTAIL, limit);
951	if (inp == dlil_main_input_thread) {
952		struct dlil_main_threading_info *inpm =
953		    (struct dlil_main_threading_info *)inp;
954		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit);
955	}
956
957	error = kernel_thread_start(func, inp, &inp->input_thr);
958	if (error == KERN_SUCCESS) {
959		ml_thread_policy(inp->input_thr, MACHINE_GROUP,
960		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR));
961		/*
962		 * We create an affinity set so that the matching workloop
963		 * thread or the starter thread (for loopback) can be
964		 * scheduled on the same processor set as the input thread.
965		 */
966		if (net_affinity) {
967			struct thread *tp = inp->input_thr;
968			u_int32_t tag;
969			/*
970			 * Randomize to reduce the probability
971			 * of affinity tag namespace collision.
972			 */
973			read_random(&tag, sizeof (tag));
974			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
975				thread_reference(tp);
976				inp->tag = tag;
977				inp->net_affinity = TRUE;
978			}
979		}
980	} else if (inp == dlil_main_input_thread) {
981		panic_plain("%s: couldn't create main input thread", __func__);
982		/* NOTREACHED */
983	} else {
984		panic_plain("%s: couldn't create %s%d input thread", __func__,
985		    ifp->if_name, ifp->if_unit);
986		/* NOTREACHED */
987	}
988	OSAddAtomic(1, &cur_dlil_input_threads);
989
990	return (error);
991}
992
993static void
994dlil_terminate_input_thread(struct dlil_threading_info *inp)
995{
996	struct ifnet *ifp;
997
998	VERIFY(current_thread() == inp->input_thr);
999	VERIFY(inp != dlil_main_input_thread);
1000
1001	OSAddAtomic(-1, &cur_dlil_input_threads);
1002
1003	lck_mtx_destroy(&inp->input_lck, inp->lck_grp);
1004	lck_grp_free(inp->lck_grp);
1005
1006	inp->input_waiting = 0;
1007	inp->wtot = 0;
1008	bzero(inp->input_name, sizeof (inp->input_name));
1009	ifp = inp->ifp;
1010	inp->ifp = NULL;
1011	VERIFY(qhead(&inp->rcvq_pkts) == NULL && qempty(&inp->rcvq_pkts));
1012	qlimit(&inp->rcvq_pkts) = 0;
1013	bzero(&inp->stats, sizeof (inp->stats));
1014
1015	VERIFY(!inp->net_affinity);
1016	inp->input_thr = THREAD_NULL;
1017	VERIFY(inp->wloop_thr == THREAD_NULL);
1018	VERIFY(inp->poll_thr == THREAD_NULL);
1019	VERIFY(inp->tag == 0);
1020
1021	inp->mode = IFNET_MODEL_INPUT_POLL_OFF;
1022	bzero(&inp->tstats, sizeof (inp->tstats));
1023	bzero(&inp->pstats, sizeof (inp->pstats));
1024	bzero(&inp->sstats, sizeof (inp->sstats));
1025
1026	net_timerclear(&inp->mode_holdtime);
1027	net_timerclear(&inp->mode_lasttime);
1028	net_timerclear(&inp->sample_holdtime);
1029	net_timerclear(&inp->sample_lasttime);
1030	net_timerclear(&inp->dbg_lasttime);
1031
1032#if IFNET_INPUT_SANITY_CHK
1033	inp->input_mbuf_cnt = 0;
1034#endif /* IFNET_INPUT_SANITY_CHK */
1035
1036	if (dlil_verbose) {
1037		printf("%s%d: input thread terminated\n",
1038		    ifp->if_name, ifp->if_unit);
1039	}
1040
1041	/* for the extra refcnt from kernel_thread_start() */
1042	thread_deallocate(current_thread());
1043
1044	/* this is the end */
1045	thread_terminate(current_thread());
1046	/* NOTREACHED */
1047}
1048
1049static kern_return_t
1050dlil_affinity_set(struct thread *tp, u_int32_t tag)
1051{
1052	thread_affinity_policy_data_t policy;
1053
1054	bzero(&policy, sizeof (policy));
1055	policy.affinity_tag = tag;
1056	return (thread_policy_set(tp, THREAD_AFFINITY_POLICY,
1057	    (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT));
1058}
1059
1060void
1061dlil_init(void)
1062{
1063	thread_t thread = THREAD_NULL;
1064
1065	/*
1066	 * The following fields must be 64-bit aligned for atomic operations.
1067	 */
1068	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1069	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1070	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1071	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1072	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1073	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1074	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1075	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1076	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1077	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1078	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1079	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1080
1081	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
1082	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors)
1083	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
1084	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
1085	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
1086	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
1087	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
1088	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
1089	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
1090	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
1091	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
1092	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
1093
1094	/*
1095	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
1096	 */
1097	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
1098	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
1099	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
1100	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
1101	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
1102	_CASSERT(IF_HWASSIST_CSUM_TCP_SUM16 == IFNET_CSUM_SUM16);
1103	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
1104	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
1105	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
1106	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
1107
1108	/*
1109	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
1110	 */
1111	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
1112	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
1113
1114	PE_parse_boot_argn("net_affinity", &net_affinity,
1115	    sizeof (net_affinity));
1116
1117	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof (net_rxpoll));
1118
1119	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref));
1120
1121	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug));
1122
1123	dlif_size = (ifnet_debug == 0) ? sizeof (struct dlil_ifnet) :
1124	    sizeof (struct dlil_ifnet_dbg);
1125	/* Enforce 64-bit alignment for dlil_ifnet structure */
1126	dlif_bufsize = dlif_size + sizeof (void *) + sizeof (u_int64_t);
1127	dlif_bufsize = P2ROUNDUP(dlif_bufsize, sizeof (u_int64_t));
1128	dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize,
1129	    0, DLIF_ZONE_NAME);
1130	if (dlif_zone == NULL) {
1131		panic_plain("%s: failed allocating %s", __func__,
1132		    DLIF_ZONE_NAME);
1133		/* NOTREACHED */
1134	}
1135	zone_change(dlif_zone, Z_EXPAND, TRUE);
1136	zone_change(dlif_zone, Z_CALLERACCT, FALSE);
1137
1138	dlif_filt_size = sizeof (struct ifnet_filter);
1139	dlif_filt_zone = zinit(dlif_filt_size,
1140	    DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME);
1141	if (dlif_filt_zone == NULL) {
1142		panic_plain("%s: failed allocating %s", __func__,
1143		    DLIF_FILT_ZONE_NAME);
1144		/* NOTREACHED */
1145	}
1146	zone_change(dlif_filt_zone, Z_EXPAND, TRUE);
1147	zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE);
1148
1149	dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS;
1150	dlif_phash_zone = zinit(dlif_phash_size,
1151	    DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME);
1152	if (dlif_phash_zone == NULL) {
1153		panic_plain("%s: failed allocating %s", __func__,
1154		    DLIF_PHASH_ZONE_NAME);
1155		/* NOTREACHED */
1156	}
1157	zone_change(dlif_phash_zone, Z_EXPAND, TRUE);
1158	zone_change(dlif_phash_zone, Z_CALLERACCT, FALSE);
1159
1160	dlif_proto_size = sizeof (struct if_proto);
1161	dlif_proto_zone = zinit(dlif_proto_size,
1162	    DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME);
1163	if (dlif_proto_zone == NULL) {
1164		panic_plain("%s: failed allocating %s", __func__,
1165		    DLIF_PROTO_ZONE_NAME);
1166		/* NOTREACHED */
1167	}
1168	zone_change(dlif_proto_zone, Z_EXPAND, TRUE);
1169	zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE);
1170
1171	dlif_tcpstat_size = sizeof (struct tcpstat_local);
1172	/* Enforce 64-bit alignment for tcpstat_local structure */
1173	dlif_tcpstat_bufsize =
1174	    dlif_tcpstat_size + sizeof (void *) + sizeof (u_int64_t);
1175	dlif_tcpstat_bufsize =
1176	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof (u_int64_t));
1177	dlif_tcpstat_zone = zinit(dlif_tcpstat_bufsize,
1178	    DLIF_TCPSTAT_ZONE_MAX * dlif_tcpstat_bufsize, 0,
1179	    DLIF_TCPSTAT_ZONE_NAME);
1180	if (dlif_tcpstat_zone == NULL) {
1181		panic_plain("%s: failed allocating %s", __func__,
1182		    DLIF_TCPSTAT_ZONE_NAME);
1183		/* NOTREACHED */
1184	}
1185	zone_change(dlif_tcpstat_zone, Z_EXPAND, TRUE);
1186	zone_change(dlif_tcpstat_zone, Z_CALLERACCT, FALSE);
1187
1188	dlif_udpstat_size = sizeof (struct udpstat_local);
1189	/* Enforce 64-bit alignment for udpstat_local structure */
1190	dlif_udpstat_bufsize =
1191	    dlif_udpstat_size + sizeof (void *) + sizeof (u_int64_t);
1192	dlif_udpstat_bufsize =
1193	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof (u_int64_t));
1194	dlif_udpstat_zone = zinit(dlif_udpstat_bufsize,
1195	    DLIF_TCPSTAT_ZONE_MAX * dlif_udpstat_bufsize, 0,
1196	    DLIF_UDPSTAT_ZONE_NAME);
1197	if (dlif_udpstat_zone == NULL) {
1198		panic_plain("%s: failed allocating %s", __func__,
1199		    DLIF_UDPSTAT_ZONE_NAME);
1200		/* NOTREACHED */
1201	}
1202	zone_change(dlif_udpstat_zone, Z_EXPAND, TRUE);
1203	zone_change(dlif_udpstat_zone, Z_CALLERACCT, FALSE);
1204
1205	ifnet_llreach_init();
1206
1207	TAILQ_INIT(&dlil_ifnet_head);
1208	TAILQ_INIT(&ifnet_head);
1209	TAILQ_INIT(&ifnet_detaching_head);
1210
1211	/* Setup the lock groups we will use */
1212	dlil_grp_attributes = lck_grp_attr_alloc_init();
1213
1214	dlil_lock_group = lck_grp_alloc_init("DLIL internal locks",
1215	    dlil_grp_attributes);
1216	ifnet_lock_group = lck_grp_alloc_init("ifnet locks",
1217	    dlil_grp_attributes);
1218	ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock",
1219	    dlil_grp_attributes);
1220	ifnet_rcv_lock_group = lck_grp_alloc_init("ifnet rcv locks",
1221	    dlil_grp_attributes);
1222	ifnet_snd_lock_group = lck_grp_alloc_init("ifnet snd locks",
1223	    dlil_grp_attributes);
1224
1225	/* Setup the lock attributes we will use */
1226	dlil_lck_attributes = lck_attr_alloc_init();
1227
1228	ifnet_lock_attr = lck_attr_alloc_init();
1229
1230	lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group,
1231	    dlil_lck_attributes);
1232	lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes);
1233
1234	ifnet_fc_init();
1235
1236	lck_attr_free(dlil_lck_attributes);
1237	dlil_lck_attributes = NULL;
1238
1239	ifa_init();
1240	/*
1241	 * Create and start up the main DLIL input thread and the interface
1242	 * detacher threads once everything is initialized.
1243	 */
1244	dlil_create_input_thread(NULL, dlil_main_input_thread);
1245
1246	if (kernel_thread_start(ifnet_detacher_thread_func,
1247	    NULL, &thread) != KERN_SUCCESS) {
1248		panic_plain("%s: couldn't create detacher thread", __func__);
1249		/* NOTREACHED */
1250	}
1251	thread_deallocate(thread);
1252
1253#if PF
1254	/* Initialize the packet filter */
1255	pfinit();
1256#endif /* PF */
1257
1258	/* Initialize queue algorithms */
1259	classq_init();
1260
1261	/* Initialize packet schedulers */
1262	pktsched_init();
1263}
1264
1265static void
1266if_flt_monitor_busy(struct ifnet *ifp)
1267{
1268	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1269
1270	++ifp->if_flt_busy;
1271	VERIFY(ifp->if_flt_busy != 0);
1272}
1273
1274static void
1275if_flt_monitor_unbusy(struct ifnet *ifp)
1276{
1277	if_flt_monitor_leave(ifp);
1278}
1279
1280static void
1281if_flt_monitor_enter(struct ifnet *ifp)
1282{
1283	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1284
1285	while (ifp->if_flt_busy) {
1286		++ifp->if_flt_waiters;
1287		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
1288		    (PZERO - 1), "if_flt_monitor", NULL);
1289	}
1290	if_flt_monitor_busy(ifp);
1291}
1292
1293static void
1294if_flt_monitor_leave(struct ifnet *ifp)
1295{
1296	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1297
1298	VERIFY(ifp->if_flt_busy != 0);
1299	--ifp->if_flt_busy;
1300
1301	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
1302		ifp->if_flt_waiters = 0;
1303		wakeup(&ifp->if_flt_head);
1304	}
1305}
1306
1307__private_extern__ int
1308dlil_attach_filter(struct ifnet	*ifp, const struct iff_filter *if_filter,
1309    interface_filter_t *filter_ref)
1310{
1311	int retval = 0;
1312	struct ifnet_filter *filter = NULL;
1313
1314	ifnet_head_lock_shared();
1315	/* Check that the interface is in the global list */
1316	if (!ifnet_lookup(ifp)) {
1317		retval = ENXIO;
1318		goto done;
1319	}
1320
1321	filter = zalloc(dlif_filt_zone);
1322	if (filter == NULL) {
1323		retval = ENOMEM;
1324		goto done;
1325	}
1326	bzero(filter, dlif_filt_size);
1327
1328	/* refcnt held above during lookup */
1329	filter->filt_ifp = ifp;
1330	filter->filt_cookie = if_filter->iff_cookie;
1331	filter->filt_name = if_filter->iff_name;
1332	filter->filt_protocol = if_filter->iff_protocol;
1333	filter->filt_input = if_filter->iff_input;
1334	filter->filt_output = if_filter->iff_output;
1335	filter->filt_event = if_filter->iff_event;
1336	filter->filt_ioctl = if_filter->iff_ioctl;
1337	filter->filt_detached = if_filter->iff_detached;
1338
1339	lck_mtx_lock(&ifp->if_flt_lock);
1340	if_flt_monitor_enter(ifp);
1341
1342	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
1343	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);
1344
1345	if_flt_monitor_leave(ifp);
1346	lck_mtx_unlock(&ifp->if_flt_lock);
1347
1348	*filter_ref = filter;
1349
1350	/*
1351	 * Bump filter count and route_generation ID to let TCP
1352	 * know it shouldn't do TSO on this connection
1353	 */
1354	OSAddAtomic(1, &dlil_filter_count);
1355	if (use_routegenid)
1356		routegenid_update();
1357
1358	if (dlil_verbose) {
1359		printf("%s%d: %s filter attached\n", ifp->if_name,
1360		    ifp->if_unit, if_filter->iff_name);
1361	}
1362done:
1363	ifnet_head_done();
1364	if (retval != 0 && ifp != NULL) {
1365		DLIL_PRINTF("%s%d: failed to attach %s (err=%d)\n",
1366		    ifp->if_name, ifp->if_unit, if_filter->iff_name, retval);
1367	}
1368	if (retval != 0 && filter != NULL)
1369		zfree(dlif_filt_zone, filter);
1370
1371	return (retval);
1372}
1373
1374static int
1375dlil_detach_filter_internal(interface_filter_t	filter, int detached)
1376{
1377	int retval = 0;
1378
1379	if (detached == 0) {
1380		ifnet_t ifp = NULL;
1381
1382		ifnet_head_lock_shared();
1383		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
1384			interface_filter_t entry = NULL;
1385
1386			lck_mtx_lock(&ifp->if_flt_lock);
1387			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
1388				if (entry != filter || entry->filt_skip)
1389					continue;
1390				/*
1391				 * We've found a match; since it's possible
1392				 * that the thread gets blocked in the monitor,
1393				 * we do the lock dance.  Interface should
1394				 * not be detached since we still have a use
1395				 * count held during filter attach.
1396				 */
1397				entry->filt_skip = 1;	/* skip input/output */
1398				lck_mtx_unlock(&ifp->if_flt_lock);
1399				ifnet_head_done();
1400
1401				lck_mtx_lock(&ifp->if_flt_lock);
1402				if_flt_monitor_enter(ifp);
1403				lck_mtx_assert(&ifp->if_flt_lock,
1404				    LCK_MTX_ASSERT_OWNED);
1405
1406				/* Remove the filter from the list */
1407				TAILQ_REMOVE(&ifp->if_flt_head, filter,
1408				    filt_next);
1409
1410				if_flt_monitor_leave(ifp);
1411				lck_mtx_unlock(&ifp->if_flt_lock);
1412				if (dlil_verbose) {
1413					printf("%s%d: %s filter detached\n",
1414					    ifp->if_name, ifp->if_unit,
1415					    filter->filt_name);
1416				}
1417				goto destroy;
1418			}
1419			lck_mtx_unlock(&ifp->if_flt_lock);
1420		}
1421		ifnet_head_done();
1422
1423		/* filter parameter is not a valid filter ref */
1424		retval = EINVAL;
1425		goto done;
1426	}
1427
1428	if (dlil_verbose)
1429		printf("%s filter detached\n", filter->filt_name);
1430
1431destroy:
1432
1433	/* Call the detached function if there is one */
1434	if (filter->filt_detached)
1435		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
1436
1437	/* Free the filter */
1438	zfree(dlif_filt_zone, filter);
1439
1440	/*
1441	 * Decrease filter count and route_generation ID to let TCP
1442	 * know it should reevalute doing TSO or not
1443	 */
1444	OSAddAtomic(-1, &dlil_filter_count);
1445	if (use_routegenid)
1446		routegenid_update();
1447
1448done:
1449	if (retval != 0) {
1450		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
1451		    filter->filt_name, retval);
1452	}
1453	return (retval);
1454}
1455
1456__private_extern__ void
1457dlil_detach_filter(interface_filter_t filter)
1458{
1459	if (filter == NULL)
1460		return;
1461	dlil_detach_filter_internal(filter, 0);
1462}
1463
1464/*
1465 * Main input thread:
1466 *
1467 *   a) handles all inbound packets for lo0
1468 *   b) handles all inbound packets for interfaces with no dedicated
1469 *	input thread (e.g. anything but Ethernet/PDP or those that support
1470 *	opportunistic polling.)
1471 *   c) protocol registrations
1472 *   d) packet injections
1473 */
1474static void
1475dlil_main_input_thread_func(void *v, wait_result_t w)
1476{
1477#pragma unused(w)
1478	struct dlil_main_threading_info *inpm = v;
1479	struct dlil_threading_info *inp = v;
1480
1481	VERIFY(inp == dlil_main_input_thread);
1482	VERIFY(inp->ifp == NULL);
1483	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1484
1485	while (1) {
1486		struct mbuf *m = NULL, *m_loop = NULL;
1487		u_int32_t m_cnt, m_cnt_loop;
1488		boolean_t proto_req;
1489
1490		lck_mtx_lock_spin(&inp->input_lck);
1491
1492		/* Wait until there is work to be done */
1493		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1494			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1495			(void) msleep(&inp->input_waiting, &inp->input_lck,
1496			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1497		}
1498
1499		inp->input_waiting |= DLIL_INPUT_RUNNING;
1500		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1501
1502		/* Main input thread cannot be terminated */
1503		VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE));
1504
1505		proto_req = (inp->input_waiting &
1506		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
1507
1508		/* Packets for non-dedicated interfaces other than lo0 */
1509		m_cnt = qlen(&inp->rcvq_pkts);
1510		m = _getq_all(&inp->rcvq_pkts);
1511
1512		/* Packets exclusive for lo0 */
1513		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
1514		m_loop = _getq_all(&inpm->lo_rcvq_pkts);
1515
1516		inp->wtot = 0;
1517
1518		lck_mtx_unlock(&inp->input_lck);
1519
1520		/*
1521		* NOTE warning %%% attention !!!!
1522		* We should think about putting some thread starvation
1523		* safeguards if we deal with long chains of packets.
1524		*/
1525		if (m_loop != NULL)
1526			dlil_input_packet_list_extended(lo_ifp, m_loop,
1527			    m_cnt_loop, inp->mode);
1528
1529		if (m != NULL)
1530			dlil_input_packet_list_extended(NULL, m,
1531			    m_cnt, inp->mode);
1532
1533		if (proto_req)
1534			proto_input_run();
1535	}
1536
1537	/* NOTREACHED */
1538	VERIFY(0);	/* we should never get here */
1539}
1540
1541/*
1542 * Input thread for interfaces with legacy input model.
1543 */
1544static void
1545dlil_input_thread_func(void *v, wait_result_t w)
1546{
1547#pragma unused(w)
1548	struct dlil_threading_info *inp = v;
1549	struct ifnet *ifp = inp->ifp;
1550
1551	VERIFY(inp != dlil_main_input_thread);
1552	VERIFY(ifp != NULL);
1553	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
1554	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
1555
1556	while (1) {
1557		struct mbuf *m = NULL;
1558		u_int32_t m_cnt;
1559
1560		lck_mtx_lock_spin(&inp->input_lck);
1561
1562		/* Wait until there is work to be done */
1563		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
1564			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1565			(void) msleep(&inp->input_waiting, &inp->input_lck,
1566			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1567		}
1568
1569		inp->input_waiting |= DLIL_INPUT_RUNNING;
1570		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1571
1572		/*
1573		 * Protocol registration and injection must always use
1574		 * the main input thread; in theory the latter can utilize
1575		 * the corresponding input thread where the packet arrived
1576		 * on, but that requires our knowing the interface in advance
1577		 * (and the benefits might not worth the trouble.)
1578		 */
1579		VERIFY(!(inp->input_waiting &
1580		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1581
1582		/* Packets for this interface */
1583		m_cnt = qlen(&inp->rcvq_pkts);
1584		m = _getq_all(&inp->rcvq_pkts);
1585
1586		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1587			lck_mtx_unlock(&inp->input_lck);
1588
1589			/* Free up pending packets */
1590			if (m != NULL)
1591				mbuf_freem_list(m);
1592
1593			dlil_terminate_input_thread(inp);
1594			/* NOTREACHED */
1595			return;
1596		}
1597
1598		inp->wtot = 0;
1599
1600		dlil_input_stats_sync(ifp, inp);
1601
1602		lck_mtx_unlock(&inp->input_lck);
1603
1604		/*
1605		* NOTE warning %%% attention !!!!
1606		* We should think about putting some thread starvation
1607		* safeguards if we deal with long chains of packets.
1608		*/
1609		if (m != NULL)
1610			dlil_input_packet_list_extended(NULL, m,
1611			    m_cnt, inp->mode);
1612	}
1613
1614	/* NOTREACHED */
1615	VERIFY(0);	/* we should never get here */
1616}
1617
1618/*
1619 * Input thread for interfaces with opportunistic polling input model.
1620 */
1621static void
1622dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
1623{
1624#pragma unused(w)
1625	struct dlil_threading_info *inp = v;
1626	struct ifnet *ifp = inp->ifp;
1627	struct timespec ts;
1628
1629	VERIFY(inp != dlil_main_input_thread);
1630	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
1631
1632	while (1) {
1633		struct mbuf *m = NULL;
1634		u_int32_t m_cnt, m_size, poll_req = 0;
1635		ifnet_model_t mode;
1636		struct timespec now, delta;
1637
1638		lck_mtx_lock_spin(&inp->input_lck);
1639
1640		/* Link parameters changed? */
1641		if (ifp->if_poll_update != 0) {
1642			ifp->if_poll_update = 0;
1643			dlil_rxpoll_calc_limits(inp);
1644		}
1645
1646		/* Current operating mode */
1647		mode = inp->mode;
1648
1649		/* Wait until there is work to be done */
1650		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING) &&
1651		    qempty(&inp->rcvq_pkts)) {
1652			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
1653			(void) msleep(&inp->input_waiting, &inp->input_lck,
1654			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
1655		}
1656
1657		inp->input_waiting |= DLIL_INPUT_RUNNING;
1658		inp->input_waiting &= ~DLIL_INPUT_WAITING;
1659
1660		/*
1661		 * Protocol registration and injection must always use
1662		 * the main input thread; in theory the latter can utilize
1663		 * the corresponding input thread where the packet arrived
1664		 * on, but that requires our knowing the interface in advance
1665		 * (and the benefits might not worth the trouble.)
1666		 */
1667		VERIFY(!(inp->input_waiting &
1668		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
1669
1670		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
1671			/* Free up pending packets */
1672			_flushq(&inp->rcvq_pkts);
1673			lck_mtx_unlock(&inp->input_lck);
1674
1675			dlil_terminate_input_thread(inp);
1676			/* NOTREACHED */
1677			return;
1678		}
1679
1680		/* Total count of all packets */
1681		m_cnt = qlen(&inp->rcvq_pkts);
1682
1683		/* Total bytes of all packets */
1684		m_size = qsize(&inp->rcvq_pkts);
1685
1686		/* Packets for this interface */
1687		m = _getq_all(&inp->rcvq_pkts);
1688		VERIFY(m != NULL || m_cnt == 0);
1689
1690		nanouptime(&now);
1691		if (!net_timerisset(&inp->sample_lasttime))
1692			*(&inp->sample_lasttime) = *(&now);
1693
1694		net_timersub(&now, &inp->sample_lasttime, &delta);
1695		if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
1696			u_int32_t ptot, btot;
1697
1698			/* Accumulate statistics for current sampling */
1699			PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
1700
1701			if (net_timercmp(&delta, &inp->sample_holdtime, <))
1702				goto skip;
1703
1704			*(&inp->sample_lasttime) = *(&now);
1705
1706			/* Calculate min/max of inbound bytes */
1707			btot = (u_int32_t)inp->sstats.bytes;
1708			if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot)
1709				inp->rxpoll_bmin = btot;
1710			if (btot > inp->rxpoll_bmax)
1711				inp->rxpoll_bmax = btot;
1712
1713			/* Calculate EWMA of inbound bytes */
1714			DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
1715
1716			/* Calculate min/max of inbound packets */
1717			ptot = (u_int32_t)inp->sstats.packets;
1718			if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot)
1719				inp->rxpoll_pmin = ptot;
1720			if (ptot > inp->rxpoll_pmax)
1721				inp->rxpoll_pmax = ptot;
1722
1723			/* Calculate EWMA of inbound packets */
1724			DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
1725
1726			/* Reset sampling statistics */
1727			PKTCNTR_CLEAR(&inp->sstats);
1728
1729			/* Calculate EWMA of wakeup requests */
1730			DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
1731			inp->wtot = 0;
1732
1733			if (dlil_verbose) {
1734				if (!net_timerisset(&inp->dbg_lasttime))
1735					*(&inp->dbg_lasttime) = *(&now);
1736				net_timersub(&now, &inp->dbg_lasttime, &delta);
1737				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
1738					*(&inp->dbg_lasttime) = *(&now);
1739					printf("%s%d: [%s] pkts avg %d max %d "
1740					    "limits [%d/%d], wreq avg %d "
1741					    "limits [%d/%d], bytes avg %d "
1742					    "limits [%d/%d]\n", ifp->if_name,
1743					    ifp->if_unit, (inp->mode ==
1744					    IFNET_MODEL_INPUT_POLL_ON) ?
1745					    "ON" : "OFF", inp->rxpoll_pavg,
1746					    inp->rxpoll_pmax,
1747					    inp->rxpoll_plowat,
1748					    inp->rxpoll_phiwat,
1749					    inp->rxpoll_wavg,
1750					    inp->rxpoll_wlowat,
1751					    inp->rxpoll_whiwat,
1752					    inp->rxpoll_bavg,
1753					    inp->rxpoll_blowat,
1754					    inp->rxpoll_bhiwat);
1755				}
1756			}
1757
1758			/* Perform mode transition, if necessary */
1759			if (!net_timerisset(&inp->mode_lasttime))
1760				*(&inp->mode_lasttime) = *(&now);
1761
1762			net_timersub(&now, &inp->mode_lasttime, &delta);
1763			if (net_timercmp(&delta, &inp->mode_holdtime, <))
1764				goto skip;
1765
1766			if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
1767			    inp->rxpoll_bavg <= inp->rxpoll_blowat &&
1768			    inp->rxpoll_wavg <= inp->rxpoll_wlowat &&
1769			    inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
1770				mode = IFNET_MODEL_INPUT_POLL_OFF;
1771			} else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
1772			    (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
1773			    inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
1774			    inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
1775				mode = IFNET_MODEL_INPUT_POLL_ON;
1776			}
1777
1778			if (mode != inp->mode) {
1779				inp->mode = mode;
1780				*(&inp->mode_lasttime) = *(&now);
1781				poll_req++;
1782			}
1783		}
1784skip:
1785		dlil_input_stats_sync(ifp, inp);
1786
1787		lck_mtx_unlock(&inp->input_lck);
1788
1789		/*
1790		 * If there's a mode change and interface is still attached,
1791		 * perform a downcall to the driver for the new mode.  Also
1792		 * hold an IO refcnt on the interface to prevent it from
1793		 * being detached (will be release below.)
1794		 */
1795		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
1796			struct ifnet_model_params p = { mode, { 0 } };
1797			errno_t err;
1798
1799			if (dlil_verbose) {
1800				printf("%s%d: polling is now %s, "
1801				    "pkts avg %d max %d limits [%d/%d], "
1802				    "wreq avg %d limits [%d/%d], "
1803				    "bytes avg %d limits [%d/%d]\n",
1804				    ifp->if_name, ifp->if_unit,
1805				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
1806				    "ON" : "OFF", inp->rxpoll_pavg,
1807				    inp->rxpoll_pmax, inp->rxpoll_plowat,
1808				    inp->rxpoll_phiwat, inp->rxpoll_wavg,
1809				    inp->rxpoll_wlowat, inp->rxpoll_whiwat,
1810				    inp->rxpoll_bavg, inp->rxpoll_blowat,
1811				    inp->rxpoll_bhiwat);
1812			}
1813
1814			if ((err = ((*ifp->if_input_ctl)(ifp,
1815			    IFNET_CTL_SET_INPUT_MODEL, sizeof (p), &p))) != 0) {
1816				printf("%s%d: error setting polling mode "
1817				    "to %s (%d)\n", ifp->if_name, ifp->if_unit,
1818				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
1819				    "ON" : "OFF", err);
1820			}
1821
1822			switch (mode) {
1823			case IFNET_MODEL_INPUT_POLL_OFF:
1824				ifnet_set_poll_cycle(ifp, NULL);
1825				inp->rxpoll_offreq++;
1826				if (err != 0)
1827					inp->rxpoll_offerr++;
1828				break;
1829
1830			case IFNET_MODEL_INPUT_POLL_ON:
1831				net_nsectimer(&if_rxpoll_interval_time, &ts);
1832				ifnet_set_poll_cycle(ifp, &ts);
1833				ifnet_poll(ifp);
1834				inp->rxpoll_onreq++;
1835				if (err != 0)
1836					inp->rxpoll_onerr++;
1837				break;
1838
1839			default:
1840				VERIFY(0);
1841				/* NOTREACHED */
1842			}
1843
1844			/* Release the IO refcnt */
1845			ifnet_decr_iorefcnt(ifp);
1846		}
1847
1848		/*
1849		* NOTE warning %%% attention !!!!
1850		* We should think about putting some thread starvation
1851		* safeguards if we deal with long chains of packets.
1852		*/
1853		if (m != NULL)
1854			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
1855	}
1856
1857	/* NOTREACHED */
1858	VERIFY(0);	/* we should never get here */
1859}
1860
1861static void
1862dlil_rxpoll_calc_limits(struct dlil_threading_info *inp)
1863{
1864	struct ifnet *ifp = inp->ifp;
1865	u_int64_t sample_holdtime, inbw;
1866
1867	VERIFY(inp != dlil_main_input_thread);
1868	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL));
1869
1870	if ((inbw = ifnet_input_linkrate(ifp)) == 0) {
1871		sample_holdtime = 0;	/* polling is disabled */
1872		inp->rxpoll_wlowat = inp->rxpoll_plowat =
1873		    inp->rxpoll_blowat = 0;
1874		inp->rxpoll_whiwat = inp->rxpoll_phiwat =
1875		    inp->rxpoll_bhiwat = (u_int32_t)-1;
1876	} else {
1877		unsigned int n, i;
1878
1879		n = 0;
1880		for (i = 0; rxpoll_tbl[i].speed != 0; i++) {
1881			if (inbw < rxpoll_tbl[i].speed)
1882				break;
1883			n = i;
1884		}
1885		sample_holdtime = if_rxpoll_sample_holdtime;
1886		inp->rxpoll_wlowat = if_rxpoll_wlowat;
1887		inp->rxpoll_whiwat = if_rxpoll_whiwat;
1888		inp->rxpoll_plowat = rxpoll_tbl[n].plowat;
1889		inp->rxpoll_phiwat = rxpoll_tbl[n].phiwat;
1890		inp->rxpoll_blowat = rxpoll_tbl[n].blowat;
1891		inp->rxpoll_bhiwat = rxpoll_tbl[n].bhiwat;
1892	}
1893
1894	net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime);
1895	net_nsectimer(&sample_holdtime, &inp->sample_holdtime);
1896
1897	if (dlil_verbose) {
1898		printf("%s%d: speed %llu bps, sample per %llu nsec, "
1899		    "pkt limits [%d/%d], wreq limits [%d/%d], "
1900		    "bytes limits [%d/%d]\n", ifp->if_name, ifp->if_unit,
1901		    inbw, sample_holdtime, inp->rxpoll_plowat,
1902		    inp->rxpoll_phiwat, inp->rxpoll_wlowat, inp->rxpoll_whiwat,
1903		    inp->rxpoll_blowat, inp->rxpoll_bhiwat);
1904	}
1905}
1906
1907errno_t
1908ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
1909    const struct ifnet_stat_increment_param *s)
1910{
1911	return (ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE));
1912}
1913
1914errno_t
1915ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
1916    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
1917{
1918	return (ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE));
1919}
1920
1921static errno_t
1922ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
1923    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
1924{
1925	struct thread *tp = current_thread();
1926	struct mbuf *last;
1927	struct dlil_threading_info *inp;
1928	u_int32_t m_cnt = 0, m_size = 0;
1929
1930	/*
1931	 * Drop the packet(s) if the parameters are invalid, or if the
1932	 * interface is no longer attached; else hold an IO refcnt to
1933	 * prevent it from being detached (will be released below.)
1934	 */
1935	if (ifp == NULL || m_head == NULL || (s == NULL && ext) ||
1936	    (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
1937		if (m_head != NULL)
1938			mbuf_freem_list(m_head);
1939		return (EINVAL);
1940	}
1941
1942	VERIFY(m_tail == NULL || ext);
1943	VERIFY(s != NULL || !ext);
1944
1945	if (m_tail == NULL) {
1946		last = m_head;
1947		while (1) {
1948#if IFNET_INPUT_SANITY_CHK
1949			if (dlil_input_sanity_check != 0)
1950				DLIL_INPUT_CHECK(last, ifp);
1951#endif /* IFNET_INPUT_SANITY_CHK */
1952			m_cnt++;
1953			m_size += m_length(last);
1954			if (mbuf_nextpkt(last) == NULL)
1955				break;
1956			last = mbuf_nextpkt(last);
1957		}
1958		m_tail = last;
1959	} else {
1960#if IFNET_INPUT_SANITY_CHK
1961		if (dlil_input_sanity_check != 0) {
1962			last = m_head;
1963			while (1) {
1964				DLIL_INPUT_CHECK(last, ifp);
1965				m_cnt++;
1966				m_size += m_length(last);
1967				if (mbuf_nextpkt(last) == NULL)
1968					break;
1969				last = mbuf_nextpkt(last);
1970			}
1971		} else {
1972			m_cnt = s->packets_in;
1973			m_size = s->bytes_in;
1974			last = m_tail;
1975		}
1976#else
1977		m_cnt = s->packets_in;
1978		m_size = s->bytes_in;
1979		last = m_tail;
1980#endif /* IFNET_INPUT_SANITY_CHK */
1981	}
1982
1983	if (last != m_tail) {
1984		panic_plain("%s: invalid input packet chain for %s%d, "
1985		    "tail mbuf %p instead of %p\n", __func__, ifp->if_name,
1986		    ifp->if_unit, m_tail, last);
1987	}
1988
1989	/*
1990	 * Assert packet count only for the extended variant, for backwards
1991	 * compatibility, since this came directly from the device driver.
1992	 * Relax this assertion for input bytes, as the driver may have
1993	 * included the link-layer headers in the computation; hence
1994	 * m_size is just an approximation.
1995	 */
1996	if (ext && s->packets_in != m_cnt) {
1997		panic_plain("%s: input packet count mismatch for %s%d, "
1998		    "%d instead of %d\n", __func__, ifp->if_name,
1999		    ifp->if_unit, s->packets_in, m_cnt);
2000	}
2001
2002	if ((inp = ifp->if_inp) == NULL)
2003		inp = dlil_main_input_thread;
2004
2005	/*
2006	 * If there is a matching DLIL input thread associated with an
2007	 * affinity set, associate this thread with the same set.  We
2008	 * will only do this once.
2009	 */
2010	lck_mtx_lock_spin(&inp->input_lck);
2011	if (inp != dlil_main_input_thread && inp->net_affinity &&
2012	    ((!poll && inp->wloop_thr == THREAD_NULL) ||
2013	    (poll && inp->poll_thr == THREAD_NULL))) {
2014		u_int32_t tag = inp->tag;
2015
2016		if (poll) {
2017			VERIFY(inp->poll_thr == THREAD_NULL);
2018			inp->poll_thr = tp;
2019		} else {
2020			VERIFY(inp->wloop_thr == THREAD_NULL);
2021			inp->wloop_thr = tp;
2022		}
2023		lck_mtx_unlock(&inp->input_lck);
2024
2025		/* Associate the current thread with the new affinity tag */
2026		(void) dlil_affinity_set(tp, tag);
2027
2028		/*
2029		 * Take a reference on the current thread; during detach,
2030		 * we will need to refer to it in order ot tear down its
2031		 * affinity.
2032		 */
2033		thread_reference(tp);
2034		lck_mtx_lock_spin(&inp->input_lck);
2035	}
2036
2037        /*
2038	 * Because of loopbacked multicast we cannot stuff the ifp in
2039	 * the rcvif of the packet header: loopback (lo0) packets use a
2040	 * dedicated list so that we can later associate them with lo_ifp
2041	 * on their way up the stack.  Packets for other interfaces without
2042	 * dedicated input threads go to the regular list.
2043	 */
2044	if (inp == dlil_main_input_thread && ifp == lo_ifp) {
2045		struct dlil_main_threading_info *inpm =
2046		    (struct dlil_main_threading_info *)inp;
2047		_addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail, m_cnt, m_size);
2048	} else {
2049		_addq_multi(&inp->rcvq_pkts, m_head, m_tail, m_cnt, m_size);
2050	}
2051
2052#if IFNET_INPUT_SANITY_CHK
2053	if (dlil_input_sanity_check != 0) {
2054		u_int32_t count;
2055		struct mbuf *m0;
2056
2057		for (m0 = m_head, count = 0; m0; m0 = mbuf_nextpkt(m0))
2058			count++;
2059
2060		if (count != m_cnt) {
2061			panic_plain("%s%d: invalid packet count %d "
2062			    "(expected %d)\n", ifp->if_name, ifp->if_unit,
2063			    count, m_cnt);
2064			/* NOTREACHED */
2065		}
2066
2067		inp->input_mbuf_cnt += m_cnt;
2068	}
2069#endif /* IFNET_INPUT_SANITY_CHK */
2070
2071	if (s != NULL) {
2072		dlil_input_stats_add(s, inp, poll);
2073		/*
2074		 * If we're using the main input thread, synchronize the
2075		 * stats now since we have the interface context.  All
2076		 * other cases involving dedicated input threads will
2077		 * have their stats synchronized there.
2078		 */
2079		if (inp == dlil_main_input_thread)
2080			dlil_input_stats_sync(ifp, inp);
2081	}
2082
2083	inp->input_waiting |= DLIL_INPUT_WAITING;
2084	if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
2085		inp->wtot++;
2086		wakeup_one((caddr_t)&inp->input_waiting);
2087	}
2088	lck_mtx_unlock(&inp->input_lck);
2089
2090	if (ifp != lo_ifp) {
2091		/* Release the IO refcnt */
2092		ifnet_decr_iorefcnt(ifp);
2093	}
2094
2095	return (0);
2096}
2097
2098void
2099ifnet_start(struct ifnet *ifp)
2100{
2101	/*
2102	 * If the starter thread is inactive, signal it to do work.
2103	 */
2104	lck_mtx_lock_spin(&ifp->if_start_lock);
2105	ifp->if_start_req++;
2106	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL) {
2107		wakeup_one((caddr_t)&ifp->if_start_thread);
2108	}
2109	lck_mtx_unlock(&ifp->if_start_lock);
2110}
2111
2112static void
2113ifnet_start_thread_fn(void *v, wait_result_t w)
2114{
2115#pragma unused(w)
2116	struct ifnet *ifp = v;
2117	char ifname[IFNAMSIZ + 1];
2118	struct timespec *ts = NULL;
2119	struct ifclassq *ifq = &ifp->if_snd;
2120
2121	/*
2122	 * Treat the dedicated starter thread for lo0 as equivalent to
2123	 * the driver workloop thread; if net_affinity is enabled for
2124	 * the main input thread, associate this starter thread to it
2125	 * by binding them with the same affinity tag.  This is done
2126	 * only once (as we only have one lo_ifp which never goes away.)
2127	 */
2128	if (ifp == lo_ifp) {
2129		struct dlil_threading_info *inp = dlil_main_input_thread;
2130		struct thread *tp = current_thread();
2131
2132		lck_mtx_lock(&inp->input_lck);
2133		if (inp->net_affinity) {
2134			u_int32_t tag = inp->tag;
2135
2136			VERIFY(inp->wloop_thr == THREAD_NULL);
2137			VERIFY(inp->poll_thr == THREAD_NULL);
2138			inp->wloop_thr = tp;
2139			lck_mtx_unlock(&inp->input_lck);
2140
2141			/* Associate this thread with the affinity tag */
2142			(void) dlil_affinity_set(tp, tag);
2143		} else {
2144			lck_mtx_unlock(&inp->input_lck);
2145		}
2146	}
2147
2148	snprintf(ifname, sizeof (ifname), "%s%d_starter",
2149	    ifp->if_name, ifp->if_unit);
2150
2151	lck_mtx_lock_spin(&ifp->if_start_lock);
2152
2153	for (;;) {
2154		(void) msleep(&ifp->if_start_thread, &ifp->if_start_lock,
2155		    (PZERO - 1) | PSPIN, ifname, ts);
2156
2157		/* interface is detached? */
2158		if (ifp->if_start_thread == THREAD_NULL) {
2159			ifnet_set_start_cycle(ifp, NULL);
2160			lck_mtx_unlock(&ifp->if_start_lock);
2161			ifnet_purge(ifp);
2162
2163			if (dlil_verbose) {
2164				printf("%s%d: starter thread terminated\n",
2165				    ifp->if_name, ifp->if_unit);
2166			}
2167
2168			/* for the extra refcnt from kernel_thread_start() */
2169			thread_deallocate(current_thread());
2170			/* this is the end */
2171			thread_terminate(current_thread());
2172			/* NOTREACHED */
2173			return;
2174		}
2175
2176		ifp->if_start_active = 1;
2177		for (;;) {
2178			u_int32_t req = ifp->if_start_req;
2179
2180			lck_mtx_unlock(&ifp->if_start_lock);
2181			/* invoke the driver's start routine */
2182			((*ifp->if_start)(ifp));
2183			lck_mtx_lock_spin(&ifp->if_start_lock);
2184
2185			/* if there's no pending request, we're done */
2186			if (req == ifp->if_start_req)
2187				break;
2188		}
2189		ifp->if_start_req = 0;
2190		ifp->if_start_active = 0;
2191		/*
2192		 * Wakeup N ns from now if rate-controlled by TBR, and if
2193		 * there are still packets in the send queue which haven't
2194		 * been dequeued so far; else sleep indefinitely (ts = NULL)
2195		 * until ifnet_start() is called again.
2196		 */
2197		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
2198		    &ifp->if_start_cycle : NULL);
2199
2200		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0)
2201			ts = NULL;
2202	}
2203
2204	/* NOTREACHED */
2205	lck_mtx_unlock(&ifp->if_start_lock);
2206	VERIFY(0);	/* we should never get here */
2207}
2208
2209void
2210ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
2211{
2212	if (ts == NULL)
2213		bzero(&ifp->if_start_cycle, sizeof (ifp->if_start_cycle));
2214	else
2215		*(&ifp->if_start_cycle) = *ts;
2216
2217	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2218		printf("%s%d: restart interval set to %lu nsec\n",
2219		    ifp->if_name, ifp->if_unit, ts->tv_nsec);
2220}
2221
2222static void
2223ifnet_poll(struct ifnet *ifp)
2224{
2225	/*
2226	 * If the poller thread is inactive, signal it to do work.
2227	 */
2228	lck_mtx_lock_spin(&ifp->if_poll_lock);
2229	ifp->if_poll_req++;
2230	if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) {
2231		wakeup_one((caddr_t)&ifp->if_poll_thread);
2232	}
2233	lck_mtx_unlock(&ifp->if_poll_lock);
2234}
2235
2236static void
2237ifnet_poll_thread_fn(void *v, wait_result_t w)
2238{
2239#pragma unused(w)
2240	struct dlil_threading_info *inp;
2241	struct ifnet *ifp = v;
2242	char ifname[IFNAMSIZ + 1];
2243	struct timespec *ts = NULL;
2244	struct ifnet_stat_increment_param s;
2245
2246	snprintf(ifname, sizeof (ifname), "%s%d_poller",
2247	    ifp->if_name, ifp->if_unit);
2248	bzero(&s, sizeof (s));
2249
2250	lck_mtx_lock_spin(&ifp->if_poll_lock);
2251
2252	inp = ifp->if_inp;
2253	VERIFY(inp != NULL);
2254
2255	for (;;) {
2256		if (ifp->if_poll_thread != THREAD_NULL) {
2257			(void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock,
2258			    (PZERO - 1) | PSPIN, ifname, ts);
2259		}
2260
2261		/* interface is detached (maybe while asleep)? */
2262		if (ifp->if_poll_thread == THREAD_NULL) {
2263			ifnet_set_poll_cycle(ifp, NULL);
2264			lck_mtx_unlock(&ifp->if_poll_lock);
2265
2266			if (dlil_verbose) {
2267				printf("%s%d: poller thread terminated\n",
2268				    ifp->if_name, ifp->if_unit);
2269			}
2270
2271			/* for the extra refcnt from kernel_thread_start() */
2272			thread_deallocate(current_thread());
2273			/* this is the end */
2274			thread_terminate(current_thread());
2275			/* NOTREACHED */
2276			return;
2277		}
2278
2279		ifp->if_poll_active = 1;
2280		for (;;) {
2281			struct mbuf *m_head, *m_tail;
2282			u_int32_t m_lim, m_cnt, m_totlen;
2283			u_int16_t req = ifp->if_poll_req;
2284
2285			lck_mtx_unlock(&ifp->if_poll_lock);
2286
2287			/*
2288			 * If no longer attached, there's nothing to do;
2289			 * else hold an IO refcnt to prevent the interface
2290			 * from being detached (will be released below.)
2291			 */
2292			if (!ifnet_is_attached(ifp, 1)) {
2293				lck_mtx_lock_spin(&ifp->if_poll_lock);
2294				break;
2295			}
2296
2297			m_lim = (if_rxpoll_max != 0) ? if_rxpoll_max :
2298			    MAX((qlimit(&inp->rcvq_pkts)),
2299			    (inp->rxpoll_phiwat << 2));
2300
2301			if (dlil_verbose > 1) {
2302				printf("%s%d: polling up to %d pkts, "
2303				    "pkts avg %d max %d, wreq avg %d, "
2304				    "bytes avg %d\n",
2305				    ifp->if_name, ifp->if_unit, m_lim,
2306				    inp->rxpoll_pavg, inp->rxpoll_pmax,
2307				    inp->rxpoll_wavg, inp->rxpoll_bavg);
2308			}
2309
2310			/* invoke the driver's input poll routine */
2311			((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
2312			    &m_cnt, &m_totlen));
2313
2314			if (m_head != NULL) {
2315				VERIFY(m_tail != NULL && m_cnt > 0);
2316
2317				if (dlil_verbose > 1) {
2318					printf("%s%d: polled %d pkts, "
2319					    "pkts avg %d max %d, wreq avg %d, "
2320					    "bytes avg %d\n",
2321					    ifp->if_name, ifp->if_unit, m_cnt,
2322					    inp->rxpoll_pavg, inp->rxpoll_pmax,
2323					    inp->rxpoll_wavg, inp->rxpoll_bavg);
2324				}
2325
2326				/* stats are required for extended variant */
2327				s.packets_in = m_cnt;
2328				s.bytes_in = m_totlen;
2329
2330				(void) ifnet_input_common(ifp, m_head, m_tail,
2331				    &s, TRUE, TRUE);
2332			} else if (dlil_verbose > 1) {
2333				printf("%s%d: no packets, pkts avg %d max %d, "
2334				    "wreq avg %d, bytes avg %d\n", ifp->if_name,
2335				    ifp->if_unit, inp->rxpoll_pavg,
2336				    inp->rxpoll_pmax, inp->rxpoll_wavg,
2337				    inp->rxpoll_bavg);
2338			}
2339
2340			/* Release the io ref count */
2341			ifnet_decr_iorefcnt(ifp);
2342
2343			lck_mtx_lock_spin(&ifp->if_poll_lock);
2344
2345			/* if there's no pending request, we're done */
2346			if (req == ifp->if_poll_req)
2347				break;
2348		}
2349		ifp->if_poll_req = 0;
2350		ifp->if_poll_active = 0;
2351
2352		/*
2353		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
2354		 * until ifnet_poll() is called again.
2355		 */
2356		ts = &ifp->if_poll_cycle;
2357		if (ts->tv_sec == 0 && ts->tv_nsec == 0)
2358			ts = NULL;
2359	}
2360
2361	/* NOTREACHED */
2362	lck_mtx_unlock(&ifp->if_poll_lock);
2363	VERIFY(0);	/* we should never get here */
2364}
2365
2366void
2367ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
2368{
2369	if (ts == NULL)
2370		bzero(&ifp->if_poll_cycle, sizeof (ifp->if_poll_cycle));
2371	else
2372		*(&ifp->if_poll_cycle) = *ts;
2373
2374	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose)
2375		printf("%s%d: poll interval set to %lu nsec\n",
2376		    ifp->if_name, ifp->if_unit, ts->tv_nsec);
2377}
2378
2379void
2380ifnet_purge(struct ifnet *ifp)
2381{
2382	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART))
2383		if_qflush(ifp, 0);
2384}
2385
2386void
2387ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
2388{
2389	IFCQ_LOCK_ASSERT_HELD(ifq);
2390
2391	if (!(IFCQ_IS_READY(ifq)))
2392		return;
2393
2394	if (IFCQ_TBR_IS_ENABLED(ifq)) {
2395		struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw,
2396		    ifq->ifcq_tbr.tbr_percent, 0 };
2397		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
2398	}
2399
2400	ifclassq_update(ifq, ev);
2401}
2402
2403void
2404ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
2405{
2406	switch (ev) {
2407	case CLASSQ_EV_LINK_SPEED:
2408		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL))
2409			ifp->if_poll_update++;
2410		break;
2411
2412	default:
2413		break;
2414	}
2415}
2416
2417errno_t
2418ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
2419{
2420	struct ifclassq *ifq;
2421	u_int32_t omodel;
2422	errno_t err;
2423
2424	if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED &&
2425	    model != IFNET_SCHED_MODEL_NORMAL))
2426		return (EINVAL);
2427	else if (!(ifp->if_eflags & IFEF_TXSTART))
2428		return (ENXIO);
2429
2430	ifq = &ifp->if_snd;
2431	IFCQ_LOCK(ifq);
2432	omodel = ifp->if_output_sched_model;
2433	ifp->if_output_sched_model = model;
2434	if ((err = ifclassq_pktsched_setup(ifq)) != 0)
2435		ifp->if_output_sched_model = omodel;
2436	IFCQ_UNLOCK(ifq);
2437
2438	return (err);
2439}
2440
2441errno_t
2442ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2443{
2444	if (ifp == NULL)
2445		return (EINVAL);
2446	else if (!(ifp->if_eflags & IFEF_TXSTART))
2447		return (ENXIO);
2448
2449	ifclassq_set_maxlen(&ifp->if_snd, maxqlen);
2450
2451	return (0);
2452}
2453
2454errno_t
2455ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2456{
2457	if (ifp == NULL || maxqlen == NULL)
2458		return (EINVAL);
2459	else if (!(ifp->if_eflags & IFEF_TXSTART))
2460		return (ENXIO);
2461
2462	*maxqlen = ifclassq_get_maxlen(&ifp->if_snd);
2463
2464	return (0);
2465}
2466
2467errno_t
2468ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *qlen)
2469{
2470	if (ifp == NULL || qlen == NULL)
2471		return (EINVAL);
2472	else if (!(ifp->if_eflags & IFEF_TXSTART))
2473		return (ENXIO);
2474
2475	*qlen = ifclassq_get_len(&ifp->if_snd);
2476
2477	return (0);
2478}
2479
2480errno_t
2481ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
2482{
2483	struct dlil_threading_info *inp;
2484
2485	if (ifp == NULL)
2486		return (EINVAL);
2487	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2488		return (ENXIO);
2489
2490	if (maxqlen == 0)
2491		maxqlen = if_rcvq_maxlen;
2492	else if (maxqlen < IF_RCVQ_MINLEN)
2493		maxqlen = IF_RCVQ_MINLEN;
2494
2495	inp = ifp->if_inp;
2496	lck_mtx_lock(&inp->input_lck);
2497	qlimit(&inp->rcvq_pkts) = maxqlen;
2498	lck_mtx_unlock(&inp->input_lck);
2499
2500	return (0);
2501}
2502
2503errno_t
2504ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
2505{
2506	struct dlil_threading_info *inp;
2507
2508	if (ifp == NULL || maxqlen == NULL)
2509		return (EINVAL);
2510	else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL)
2511		return (ENXIO);
2512
2513	inp = ifp->if_inp;
2514	lck_mtx_lock(&inp->input_lck);
2515	*maxqlen = qlimit(&inp->rcvq_pkts);
2516	lck_mtx_unlock(&inp->input_lck);
2517	return (0);
2518}
2519
2520errno_t
2521ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
2522{
2523	int error;
2524
2525	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
2526	    m->m_nextpkt != NULL) {
2527		if (m != NULL)
2528			m_freem_list(m);
2529		return (EINVAL);
2530	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2531	    !(ifp->if_refflags & IFRF_ATTACHED)) {
2532		/* flag tested without lock for performance */
2533		m_freem(m);
2534		return (ENXIO);
2535	} else if (!(ifp->if_flags & IFF_UP)) {
2536		m_freem(m);
2537		return (ENETDOWN);
2538
2539	}
2540
2541	/* enqueue the packet */
2542	error = ifclassq_enqueue(&ifp->if_snd, m);
2543
2544	/*
2545	 * Tell the driver to start dequeueing; do this even when the queue
2546	 * for the packet is suspended (EQSUSPENDED), as the driver could still
2547	 * be dequeueing from other unsuspended queues.
2548	 */
2549	if (error == 0 || error == EQFULL || error == EQSUSPENDED)
2550		ifnet_start(ifp);
2551
2552	return (error);
2553}
2554
2555errno_t
2556ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
2557{
2558	if (ifp == NULL || mp == NULL)
2559		return (EINVAL);
2560	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2561	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2562		return (ENXIO);
2563
2564	return (ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL));
2565}
2566
2567errno_t
2568ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
2569    struct mbuf **mp)
2570{
2571	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc))
2572		return (EINVAL);
2573	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2574	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
2575		return (ENXIO);
2576
2577	return (ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL));
2578}
2579
2580errno_t
2581ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head,
2582    struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
2583{
2584	if (ifp == NULL || head == NULL || limit < 1)
2585		return (EINVAL);
2586	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2587	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL))
2588		return (ENXIO);
2589
2590	return (ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len));
2591}
2592
2593errno_t
2594ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
2595    u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
2596    u_int32_t *len)
2597{
2598
2599	if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc))
2600		return (EINVAL);
2601	else if (!(ifp->if_eflags & IFEF_TXSTART) ||
2602	    (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED))
2603		return (ENXIO);
2604
2605	return (ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head,
2606	    tail, cnt, len));
2607}
2608
2609static int
2610dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
2611    char **frame_header_p, protocol_family_t protocol_family)
2612{
2613	struct ifnet_filter *filter;
2614
2615	/*
2616	 * Pass the inbound packet to the interface filters
2617	 */
2618	lck_mtx_lock_spin(&ifp->if_flt_lock);
2619	/* prevent filter list from changing in case we drop the lock */
2620	if_flt_monitor_busy(ifp);
2621	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
2622		int result;
2623
2624		if (!filter->filt_skip && filter->filt_input != NULL &&
2625		    (filter->filt_protocol == 0 ||
2626		    filter->filt_protocol == protocol_family)) {
2627			lck_mtx_unlock(&ifp->if_flt_lock);
2628
2629			result = (*filter->filt_input)(filter->filt_cookie,
2630			    ifp, protocol_family, m_p, frame_header_p);
2631
2632			lck_mtx_lock_spin(&ifp->if_flt_lock);
2633			if (result != 0) {
2634				/* we're done with the filter list */
2635				if_flt_monitor_unbusy(ifp);
2636				lck_mtx_unlock(&ifp->if_flt_lock);
2637				return (result);
2638			}
2639		}
2640	}
2641	/* we're done with the filter list */
2642	if_flt_monitor_unbusy(ifp);
2643	lck_mtx_unlock(&ifp->if_flt_lock);
2644
2645	/*
2646	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
2647	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
2648	 */
2649	if (*m_p != NULL)
2650		(*m_p)->m_flags &= ~M_PROTO1;
2651
2652	return (0);
2653}
2654
2655static int
2656dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
2657    protocol_family_t protocol_family)
2658{
2659	struct ifnet_filter *filter;
2660
2661	/*
2662	 * Pass the outbound packet to the interface filters
2663	 */
2664	lck_mtx_lock_spin(&ifp->if_flt_lock);
2665	/* prevent filter list from changing in case we drop the lock */
2666	if_flt_monitor_busy(ifp);
2667	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
2668		int result;
2669
2670		if (!filter->filt_skip && filter->filt_output != NULL &&
2671		    (filter->filt_protocol == 0 ||
2672		    filter->filt_protocol == protocol_family)) {
2673			lck_mtx_unlock(&ifp->if_flt_lock);
2674
2675			result = filter->filt_output(filter->filt_cookie, ifp,
2676			    protocol_family, m_p);
2677
2678			lck_mtx_lock_spin(&ifp->if_flt_lock);
2679			if (result != 0) {
2680				/* we're done with the filter list */
2681				if_flt_monitor_unbusy(ifp);
2682				lck_mtx_unlock(&ifp->if_flt_lock);
2683				return (result);
2684			}
2685		}
2686	}
2687	/* we're done with the filter list */
2688	if_flt_monitor_unbusy(ifp);
2689	lck_mtx_unlock(&ifp->if_flt_lock);
2690
2691	return (0);
2692}
2693
2694static void
2695dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
2696{
2697	int error;
2698
2699	if (ifproto->proto_kpi == kProtoKPI_v1) {
2700		/* Version 1 protocols get one packet at a time */
2701		while (m != NULL) {
2702			char *	frame_header;
2703			mbuf_t	next_packet;
2704
2705			next_packet = m->m_nextpkt;
2706			m->m_nextpkt = NULL;
2707			frame_header = m->m_pkthdr.header;
2708			m->m_pkthdr.header = NULL;
2709			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
2710			    ifproto->protocol_family, m, frame_header);
2711			if (error != 0 && error != EJUSTRETURN)
2712				m_freem(m);
2713			m = next_packet;
2714		}
2715	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
2716		/* Version 2 protocols support packet lists */
2717		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
2718		    ifproto->protocol_family, m);
2719		if (error != 0 && error != EJUSTRETURN)
2720			m_freem_list(m);
2721	}
2722	return;
2723}
2724
2725static void
2726dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
2727    struct dlil_threading_info *inp, boolean_t poll)
2728{
2729	struct ifnet_stat_increment_param *d = &inp->stats;
2730
2731	if (s->packets_in != 0)
2732		d->packets_in += s->packets_in;
2733	if (s->bytes_in != 0)
2734		d->bytes_in += s->bytes_in;
2735	if (s->errors_in != 0)
2736		d->errors_in += s->errors_in;
2737
2738	if (s->packets_out != 0)
2739		d->packets_out += s->packets_out;
2740	if (s->bytes_out != 0)
2741		d->bytes_out += s->bytes_out;
2742	if (s->errors_out != 0)
2743		d->errors_out += s->errors_out;
2744
2745	if (s->collisions != 0)
2746		d->collisions += s->collisions;
2747	if (s->dropped != 0)
2748		d->dropped += s->dropped;
2749
2750	if (poll)
2751		PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in);
2752}
2753
2754static void
2755dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
2756{
2757	struct ifnet_stat_increment_param *s = &inp->stats;
2758
2759	/*
2760	 * Use of atomic operations is unavoidable here because
2761	 * these stats may also be incremented elsewhere via KPIs.
2762	 */
2763	if (s->packets_in != 0) {
2764		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
2765		s->packets_in = 0;
2766	}
2767	if (s->bytes_in != 0) {
2768		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
2769		s->bytes_in = 0;
2770	}
2771	if (s->errors_in != 0) {
2772		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
2773		s->errors_in = 0;
2774	}
2775
2776	if (s->packets_out != 0) {
2777		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
2778		s->packets_out = 0;
2779	}
2780	if (s->bytes_out != 0) {
2781		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
2782		s->bytes_out = 0;
2783	}
2784	if (s->errors_out != 0) {
2785		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
2786		s->errors_out = 0;
2787	}
2788
2789	if (s->collisions != 0) {
2790		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
2791		s->collisions = 0;
2792	}
2793	if (s->dropped != 0) {
2794		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
2795		s->dropped = 0;
2796	}
2797
2798	/*
2799	 * No need for atomic operations as they are modified here
2800	 * only from within the DLIL input thread context.
2801	 */
2802	if (inp->tstats.packets != 0) {
2803		inp->pstats.ifi_poll_packets += inp->tstats.packets;
2804		inp->tstats.packets = 0;
2805	}
2806	if (inp->tstats.bytes != 0) {
2807		inp->pstats.ifi_poll_bytes += inp->tstats.bytes;
2808		inp->tstats.bytes = 0;
2809	}
2810}
2811
2812__private_extern__ void
2813dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
2814{
2815	return (dlil_input_packet_list_common(ifp, m, 0,
2816	    IFNET_MODEL_INPUT_POLL_OFF, FALSE));
2817}
2818
2819__private_extern__ void
2820dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
2821    u_int32_t cnt, ifnet_model_t mode)
2822{
2823	return (dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE));
2824}
2825
2826static void
2827dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
2828    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
2829{
2830	int				error = 0;
2831	protocol_family_t		protocol_family;
2832	mbuf_t				next_packet;
2833	ifnet_t				ifp = ifp_param;
2834	char *				frame_header;
2835	struct if_proto	*		last_ifproto = NULL;
2836	mbuf_t				pkt_first = NULL;
2837	mbuf_t *			pkt_next = NULL;
2838	u_int32_t			poll_thresh = 0, poll_ival = 0;
2839
2840	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0);
2841
2842	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
2843	    (poll_ival = if_rxpoll_interval_pkts) > 0)
2844		poll_thresh = cnt;
2845
2846	while (m != NULL) {
2847		struct if_proto *ifproto = NULL;
2848		int iorefcnt = 0;
2849
2850		if (ifp_param == NULL)
2851			ifp = m->m_pkthdr.rcvif;
2852
2853		if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 &&
2854		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0)
2855			ifnet_poll(ifp);
2856
2857		/* Check if this mbuf looks valid */
2858		MBUF_INPUT_CHECK(m, ifp);
2859
2860		next_packet = m->m_nextpkt;
2861		m->m_nextpkt = NULL;
2862		frame_header = m->m_pkthdr.header;
2863		m->m_pkthdr.header = NULL;
2864
2865		/*
2866		 * Get an IO reference count if the interface is not
2867		 * loopback (lo0) and it is attached; lo0 never goes
2868		 * away, so optimize for that.
2869		 */
2870		if (ifp != lo_ifp) {
2871			if (!ifnet_is_attached(ifp, 1)) {
2872				m_freem(m);
2873				goto next;
2874			}
2875			iorefcnt = 1;
2876		}
2877
2878		ifp_inc_traffic_class_in(ifp, m);
2879
2880		/* find which protocol family this packet is for */
2881		ifnet_lock_shared(ifp);
2882		error = (*ifp->if_demux)(ifp, m, frame_header,
2883		    &protocol_family);
2884		ifnet_lock_done(ifp);
2885		if (error != 0) {
2886			if (error == EJUSTRETURN)
2887				goto next;
2888			protocol_family = 0;
2889		}
2890
2891#if CONFIG_EMBEDDED
2892		iptap_ipf_input(ifp, protocol_family, m, frame_header);
2893#endif /* CONFIG_EMBEDDED */
2894
2895		if (m->m_flags & (M_BCAST|M_MCAST))
2896			atomic_add_64(&ifp->if_imcasts, 1);
2897
2898		/* run interface filters, exclude VLAN packets PR-3586856 */
2899		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
2900			error = dlil_interface_filters_input(ifp, &m,
2901			    &frame_header, protocol_family);
2902			if (error != 0) {
2903				if (error != EJUSTRETURN)
2904					m_freem(m);
2905				goto next;
2906			}
2907		}
2908		if (error != 0 || ((m->m_flags & M_PROMISC) != 0) ) {
2909			m_freem(m);
2910			goto next;
2911		}
2912
2913		/* Lookup the protocol attachment to this interface */
2914		if (protocol_family == 0) {
2915			ifproto = NULL;
2916		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
2917		    (last_ifproto->protocol_family == protocol_family)) {
2918			VERIFY(ifproto == NULL);
2919			ifproto = last_ifproto;
2920			if_proto_ref(last_ifproto);
2921		} else {
2922			VERIFY(ifproto == NULL);
2923			ifnet_lock_shared(ifp);
2924			/* callee holds a proto refcnt upon success */
2925			ifproto	= find_attached_proto(ifp, protocol_family);
2926			ifnet_lock_done(ifp);
2927		}
2928		if (ifproto == NULL) {
2929			/* no protocol for this packet, discard */
2930			m_freem(m);
2931			goto next;
2932		}
2933		if (ifproto != last_ifproto) {
2934			if (last_ifproto != NULL) {
2935				/* pass up the list for the previous protocol */
2936				dlil_ifproto_input(last_ifproto, pkt_first);
2937				pkt_first = NULL;
2938				if_proto_free(last_ifproto);
2939			}
2940			last_ifproto = ifproto;
2941			if_proto_ref(ifproto);
2942		}
2943		/* extend the list */
2944		m->m_pkthdr.header = frame_header;
2945		if (pkt_first == NULL) {
2946			pkt_first = m;
2947		} else {
2948			*pkt_next = m;
2949		}
2950		pkt_next = &m->m_nextpkt;
2951
2952next:
2953		if (next_packet == NULL && last_ifproto != NULL) {
2954			/* pass up the last list of packets */
2955			dlil_ifproto_input(last_ifproto, pkt_first);
2956			if_proto_free(last_ifproto);
2957			last_ifproto = NULL;
2958		}
2959		if (ifproto != NULL) {
2960			if_proto_free(ifproto);
2961			ifproto = NULL;
2962		}
2963
2964		m = next_packet;
2965
2966		/* update the driver's multicast filter, if needed */
2967		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
2968			ifp->if_updatemcasts = 0;
2969		if (iorefcnt == 1)
2970			ifnet_decr_iorefcnt(ifp);
2971	}
2972
2973	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0);
2974}
2975
2976errno_t
2977if_mcasts_update(struct ifnet *ifp)
2978{
2979	errno_t err;
2980
2981	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
2982	if (err == EAFNOSUPPORT)
2983		err = 0;
2984	printf("%s%d: %s %d suspended link-layer multicast membership(s) "
2985	    "(err=%d)\n", ifp->if_name, ifp->if_unit,
2986	    (err == 0 ? "successfully restored" : "failed to restore"),
2987	    ifp->if_updatemcasts, err);
2988
2989	/* just return success */
2990	return (0);
2991}
2992
2993static int
2994dlil_event_internal(struct ifnet *ifp, struct kev_msg *event)
2995{
2996	struct ifnet_filter *filter;
2997
2998	/* Get an io ref count if the interface is attached */
2999	if (!ifnet_is_attached(ifp, 1))
3000		goto done;
3001
3002	/*
3003	 * Pass the event to the interface filters
3004	 */
3005	lck_mtx_lock_spin(&ifp->if_flt_lock);
3006	/* prevent filter list from changing in case we drop the lock */
3007	if_flt_monitor_busy(ifp);
3008	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3009		if (filter->filt_event != NULL) {
3010			lck_mtx_unlock(&ifp->if_flt_lock);
3011
3012			filter->filt_event(filter->filt_cookie, ifp,
3013			    filter->filt_protocol, event);
3014
3015			lck_mtx_lock_spin(&ifp->if_flt_lock);
3016		}
3017	}
3018	/* we're done with the filter list */
3019	if_flt_monitor_unbusy(ifp);
3020	lck_mtx_unlock(&ifp->if_flt_lock);
3021
3022	ifnet_lock_shared(ifp);
3023	if (ifp->if_proto_hash != NULL) {
3024		int i;
3025
3026		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
3027			struct if_proto *proto;
3028
3029			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
3030			    next_hash) {
3031				proto_media_event eventp =
3032				    (proto->proto_kpi == kProtoKPI_v1 ?
3033				    proto->kpi.v1.event :
3034				    proto->kpi.v2.event);
3035
3036				if (eventp != NULL) {
3037					if_proto_ref(proto);
3038					ifnet_lock_done(ifp);
3039
3040					eventp(ifp, proto->protocol_family,
3041					    event);
3042
3043					ifnet_lock_shared(ifp);
3044					if_proto_free(proto);
3045				}
3046			}
3047		}
3048	}
3049	ifnet_lock_done(ifp);
3050
3051	/* Pass the event to the interface */
3052	if (ifp->if_event != NULL)
3053		ifp->if_event(ifp, event);
3054
3055	/* Release the io ref count */
3056	ifnet_decr_iorefcnt(ifp);
3057
3058done:
3059	return (kev_post_msg(event));
3060}
3061
3062errno_t
3063ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
3064{
3065	struct kev_msg               kev_msg;
3066	int result = 0;
3067
3068	if (ifp == NULL || event == NULL)
3069		return (EINVAL);
3070
3071	bzero(&kev_msg, sizeof (kev_msg));
3072	kev_msg.vendor_code    = event->vendor_code;
3073	kev_msg.kev_class      = event->kev_class;
3074	kev_msg.kev_subclass   = event->kev_subclass;
3075	kev_msg.event_code     = event->event_code;
3076	kev_msg.dv[0].data_ptr = &event->event_data[0];
3077	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
3078	kev_msg.dv[1].data_length = 0;
3079
3080	result = dlil_event_internal(ifp, &kev_msg);
3081
3082	return (result);
3083}
3084
3085#if CONFIG_MACF_NET
3086#include <netinet/ip6.h>
3087#include <netinet/ip.h>
3088static int
3089dlil_get_socket_type(struct mbuf **mp, int family, int raw)
3090{
3091	struct mbuf *m;
3092	struct ip *ip;
3093	struct ip6_hdr *ip6;
3094	int type = SOCK_RAW;
3095
3096	if (!raw) {
3097		switch (family) {
3098		case PF_INET:
3099			m = m_pullup(*mp, sizeof(struct ip));
3100			if (m == NULL)
3101				break;
3102			*mp = m;
3103			ip = mtod(m, struct ip *);
3104			if (ip->ip_p == IPPROTO_TCP)
3105				type = SOCK_STREAM;
3106			else if (ip->ip_p == IPPROTO_UDP)
3107				type = SOCK_DGRAM;
3108			break;
3109		case PF_INET6:
3110			m = m_pullup(*mp, sizeof(struct ip6_hdr));
3111			if (m == NULL)
3112				break;
3113			*mp = m;
3114			ip6 = mtod(m, struct ip6_hdr *);
3115			if (ip6->ip6_nxt == IPPROTO_TCP)
3116				type = SOCK_STREAM;
3117			else if (ip6->ip6_nxt == IPPROTO_UDP)
3118				type = SOCK_DGRAM;
3119			break;
3120		}
3121	}
3122
3123	return (type);
3124}
3125#endif
3126
3127/*
3128 * This is mostly called from the context of the DLIL input thread;
3129 * because of that there is no need for atomic operations.
3130 */
3131static __inline void
3132ifp_inc_traffic_class_in(struct ifnet *ifp, struct mbuf *m)
3133{
3134	if (!(m->m_flags & M_PKTHDR))
3135		return;
3136
3137	switch (m_get_traffic_class(m)) {
3138	case MBUF_TC_BE:
3139		ifp->if_tc.ifi_ibepackets++;
3140		ifp->if_tc.ifi_ibebytes += m->m_pkthdr.len;
3141		break;
3142	case MBUF_TC_BK:
3143		ifp->if_tc.ifi_ibkpackets++;
3144		ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len;
3145		break;
3146	case MBUF_TC_VI:
3147		ifp->if_tc.ifi_ivipackets++;
3148		ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len;
3149		break;
3150	case MBUF_TC_VO:
3151		ifp->if_tc.ifi_ivopackets++;
3152		ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len;
3153		break;
3154	default:
3155		break;
3156	}
3157
3158	if (mbuf_is_traffic_class_privileged(m)) {
3159		ifp->if_tc.ifi_ipvpackets++;
3160		ifp->if_tc.ifi_ipvbytes += m->m_pkthdr.len;
3161	}
3162}
3163
3164/*
3165 * This is called from DLIL output, hence multiple threads could end
3166 * up modifying the statistics.  We trade off acccuracy for performance
3167 * by not using atomic operations here.
3168 */
3169static __inline void
3170ifp_inc_traffic_class_out(struct ifnet *ifp, struct mbuf *m)
3171{
3172	if (!(m->m_flags & M_PKTHDR))
3173		return;
3174
3175	switch (m_get_traffic_class(m)) {
3176	case MBUF_TC_BE:
3177		ifp->if_tc.ifi_obepackets++;
3178		ifp->if_tc.ifi_obebytes += m->m_pkthdr.len;
3179		break;
3180	case MBUF_TC_BK:
3181		ifp->if_tc.ifi_obkpackets++;
3182		ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len;
3183		break;
3184	case MBUF_TC_VI:
3185		ifp->if_tc.ifi_ovipackets++;
3186		ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len;
3187		break;
3188	case MBUF_TC_VO:
3189		ifp->if_tc.ifi_ovopackets++;
3190		ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len;
3191		break;
3192	default:
3193		break;
3194	}
3195
3196	if (mbuf_is_traffic_class_privileged(m)) {
3197		ifp->if_tc.ifi_opvpackets++;
3198		ifp->if_tc.ifi_opvbytes += m->m_pkthdr.len;
3199	}
3200}
3201
3202/*
3203 * dlil_output
3204 *
3205 * Caller should have a lock on the protocol domain if the protocol
3206 * doesn't support finer grained locking. In most cases, the lock
3207 * will be held from the socket layer and won't be released until
3208 * we return back to the socket layer.
3209 *
3210 * This does mean that we must take a protocol lock before we take
3211 * an interface lock if we're going to take both. This makes sense
3212 * because a protocol is likely to interact with an ifp while it
3213 * is under the protocol lock.
3214 *
3215 * An advisory code will be returned if adv is not null. This
3216 * can be used to provide feedback about interface queues to the
3217 * application.
3218 */
3219errno_t
3220dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
3221    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
3222{
3223	char *frame_type = NULL;
3224	char *dst_linkaddr = NULL;
3225	int retval = 0;
3226	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
3227	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
3228	struct if_proto	*proto = NULL;
3229	mbuf_t	m;
3230	mbuf_t	send_head = NULL;
3231	mbuf_t	*send_tail = &send_head;
3232	int iorefcnt = 0;
3233#if CONFIG_EMBEDDED
3234	u_int32_t pre = 0, post = 0;
3235#endif /* CONFIG_EMBEDDED */
3236
3237	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0);
3238
3239	/* Get an io refcnt if the interface is attached to prevent ifnet_detach
3240	 * from happening while this operation is in progress */
3241	if (!ifnet_is_attached(ifp, 1)) {
3242		retval = ENXIO;
3243		goto cleanup;
3244	}
3245	iorefcnt = 1;
3246
3247	/* update the driver's multicast filter, if needed */
3248	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0)
3249		ifp->if_updatemcasts = 0;
3250
3251	frame_type = frame_type_buffer;
3252	dst_linkaddr = dst_linkaddr_buffer;
3253
3254	if (raw == 0) {
3255		ifnet_lock_shared(ifp);
3256		/* callee holds a proto refcnt upon success */
3257		proto = find_attached_proto(ifp, proto_family);
3258		if (proto == NULL) {
3259			ifnet_lock_done(ifp);
3260			retval = ENXIO;
3261			goto cleanup;
3262		}
3263		ifnet_lock_done(ifp);
3264	}
3265
3266preout_again:
3267	if (packetlist == NULL)
3268		goto cleanup;
3269
3270	m = packetlist;
3271	packetlist = packetlist->m_nextpkt;
3272	m->m_nextpkt = NULL;
3273
3274	if (raw == 0) {
3275		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
3276		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
3277		retval = 0;
3278		if (preoutp != NULL) {
3279			retval = preoutp(ifp, proto_family, &m, dest, route,
3280			    frame_type, dst_linkaddr);
3281
3282			if (retval != 0) {
3283				if (retval == EJUSTRETURN)
3284					goto preout_again;
3285				m_freem(m);
3286				goto cleanup;
3287			}
3288		}
3289	}
3290
3291#if CONFIG_MACF_NET
3292	retval = mac_ifnet_check_transmit(ifp, m, proto_family,
3293	    dlil_get_socket_type(&m, proto_family, raw));
3294	if (retval) {
3295		m_freem(m);
3296		goto cleanup;
3297	}
3298#endif
3299
3300	do {
3301#if CONFIG_DTRACE
3302		if (!raw && proto_family == PF_INET) {
3303			struct ip *ip = mtod(m, struct ip*);
3304	                DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
3305				struct ip *, ip, struct ifnet *, ifp,
3306				struct ip *, ip, struct ip6_hdr *, NULL);
3307
3308		} else if (!raw && proto_family == PF_INET6) {
3309			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*);
3310			DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL,
3311				struct ip6_hdr *, ip6, struct ifnet*, ifp,
3312				struct ip*, NULL, struct ip6_hdr *, ip6);
3313		}
3314#endif /* CONFIG_DTRACE */
3315
3316		if (raw == 0 && ifp->if_framer) {
3317			int rcvif_set = 0;
3318
3319			/*
3320			 * If this is a broadcast packet that needs to be
3321			 * looped back into the system, set the inbound ifp
3322			 * to that of the outbound ifp.  This will allow
3323			 * us to determine that it is a legitimate packet
3324			 * for the system.  Only set the ifp if it's not
3325			 * already set, just to be safe.
3326			 */
3327			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
3328			    m->m_pkthdr.rcvif == NULL) {
3329				m->m_pkthdr.rcvif = ifp;
3330				rcvif_set = 1;
3331			}
3332
3333			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
3334			    frame_type
3335#if CONFIG_EMBEDDED
3336			    ,
3337			    &pre, &post
3338#endif /* CONFIG_EMBEDDED */
3339			    );
3340			if (retval) {
3341				if (retval != EJUSTRETURN)
3342					m_freem(m);
3343				goto next;
3344			}
3345
3346			/*
3347			 * Clear the ifp if it was set above, and to be
3348			 * safe, only if it is still the same as the
3349			 * outbound ifp we have in context.  If it was
3350			 * looped back, then a copy of it was sent to the
3351			 * loopback interface with the rcvif set, and we
3352			 * are clearing the one that will go down to the
3353			 * layer below.
3354			 */
3355			if (rcvif_set && m->m_pkthdr.rcvif == ifp)
3356				m->m_pkthdr.rcvif = NULL;
3357		}
3358
3359		/*
3360		 * Let interface filters (if any) do their thing ...
3361		 */
3362		/* Do not pass VLAN tagged packets to filters PR-3586856 */
3363		if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) {
3364			retval = dlil_interface_filters_output(ifp,
3365			    &m, proto_family);
3366			if (retval != 0) {
3367				if (retval != EJUSTRETURN)
3368					m_freem(m);
3369				goto next;
3370			}
3371		}
3372		/*
3373		 * Strip away M_PROTO1 bit prior to sending packet to the driver
3374		 * as this field may be used by the driver
3375		 */
3376		m->m_flags &= ~M_PROTO1;
3377
3378		/*
3379		 * If the underlying interface is not capable of handling a
3380		 * packet whose data portion spans across physically disjoint
3381		 * pages, we need to "normalize" the packet so that we pass
3382		 * down a chain of mbufs where each mbuf points to a span that
3383		 * resides in the system page boundary.  If the packet does
3384		 * not cross page(s), the following is a no-op.
3385		 */
3386		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
3387			if ((m = m_normalize(m)) == NULL)
3388				goto next;
3389		}
3390
3391		/*
3392		 * If this is a TSO packet, make sure the interface still
3393		 * advertise TSO capability.
3394		 */
3395
3396		if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
3397		    !(ifp->if_hwassist & IFNET_TSO_IPV4)) {
3398			retval = EMSGSIZE;
3399			m_freem(m);
3400			goto cleanup;
3401		}
3402
3403		if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) &&
3404		    !(ifp->if_hwassist & IFNET_TSO_IPV6)) {
3405			retval = EMSGSIZE;
3406			m_freem(m);
3407			goto cleanup;
3408		}
3409
3410		/*
3411		 * Finally, call the driver.
3412		 */
3413		if ((ifp->if_eflags & IFEF_SENDLIST) != 0) {
3414			*send_tail = m;
3415			send_tail = &m->m_nextpkt;
3416		} else {
3417#if CONFIG_EMBEDDED
3418			iptap_ipf_output(ifp, proto_family, (struct mbuf *)m,
3419			    pre, post);
3420#endif /* CONFIG_EMBEDDED */
3421			ifp_inc_traffic_class_out(ifp, m);
3422			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
3423			    0,0,0,0,0);
3424			retval = (*ifp->if_output)(ifp, m);
3425			if (retval == EQFULL || retval == EQSUSPENDED) {
3426				if (adv != NULL && adv->code == FADV_SUCCESS) {
3427					adv->code = (retval == EQFULL ?
3428					    FADV_FLOW_CONTROLLED :
3429					    FADV_SUSPENDED);
3430				}
3431				retval = 0;
3432			}
3433			if (retval && dlil_verbose) {
3434				printf("%s: output error on %s%d retval = %d\n",
3435				    __func__, ifp->if_name, ifp->if_unit,
3436				    retval);
3437			}
3438			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
3439			    0,0,0,0,0);
3440		}
3441		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0);
3442
3443next:
3444		m = packetlist;
3445		if (m) {
3446			packetlist = packetlist->m_nextpkt;
3447			m->m_nextpkt = NULL;
3448		}
3449	} while (m);
3450
3451	if (send_head) {
3452#if CONFIG_EMBEDDED
3453		iptap_ipf_output(ifp, proto_family, (struct mbuf *)send_head,
3454		    pre, post);
3455#endif /* CONFIG_EMBEDDED */
3456		ifp_inc_traffic_class_out(ifp, send_head);
3457
3458		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0);
3459		retval = (*ifp->if_output)(ifp, send_head);
3460		if (retval == EQFULL || retval == EQSUSPENDED) {
3461			if (adv != NULL) {
3462				adv->code = (retval == EQFULL ?
3463				    FADV_FLOW_CONTROLLED : FADV_SUSPENDED);
3464			}
3465			retval = 0;
3466		}
3467		if (retval && dlil_verbose) {
3468			printf("%s: output error on %s%d retval = %d\n",
3469			    __func__, ifp->if_name, ifp->if_unit, retval);
3470		}
3471		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0);
3472	}
3473
3474	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
3475
3476cleanup:
3477	if (proto != NULL)
3478		if_proto_free(proto);
3479	if (packetlist) /* if any packets are left, clean up */
3480		mbuf_freem_list(packetlist);
3481	if (retval == EJUSTRETURN)
3482		retval = 0;
3483	if (iorefcnt == 1)
3484		ifnet_decr_iorefcnt(ifp);
3485
3486	return (retval);
3487}
3488
3489errno_t
3490ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
3491    void *ioctl_arg)
3492{
3493	struct ifnet_filter *filter;
3494	int retval = EOPNOTSUPP;
3495	int result = 0;
3496
3497	if (ifp == NULL || ioctl_code == 0)
3498		return (EINVAL);
3499
3500	/* Get an io ref count if the interface is attached */
3501	if (!ifnet_is_attached(ifp, 1))
3502		return (EOPNOTSUPP);
3503
3504	/* Run the interface filters first.
3505	 * We want to run all filters before calling the protocol,
3506	 * interface family, or interface.
3507	 */
3508	lck_mtx_lock_spin(&ifp->if_flt_lock);
3509	/* prevent filter list from changing in case we drop the lock */
3510	if_flt_monitor_busy(ifp);
3511	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
3512		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
3513		    filter->filt_protocol == proto_fam)) {
3514			lck_mtx_unlock(&ifp->if_flt_lock);
3515
3516			result = filter->filt_ioctl(filter->filt_cookie, ifp,
3517			    proto_fam, ioctl_code, ioctl_arg);
3518
3519			lck_mtx_lock_spin(&ifp->if_flt_lock);
3520
3521			/* Only update retval if no one has handled the ioctl */
3522			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
3523				if (result == ENOTSUP)
3524					result = EOPNOTSUPP;
3525				retval = result;
3526				if (retval != 0 && retval != EOPNOTSUPP) {
3527					/* we're done with the filter list */
3528					if_flt_monitor_unbusy(ifp);
3529					lck_mtx_unlock(&ifp->if_flt_lock);
3530					goto cleanup;
3531				}
3532			}
3533		}
3534	}
3535	/* we're done with the filter list */
3536	if_flt_monitor_unbusy(ifp);
3537	lck_mtx_unlock(&ifp->if_flt_lock);
3538
3539	/* Allow the protocol to handle the ioctl */
3540	if (proto_fam != 0) {
3541		struct if_proto	*proto;
3542
3543		/* callee holds a proto refcnt upon success */
3544		ifnet_lock_shared(ifp);
3545		proto = find_attached_proto(ifp, proto_fam);
3546		ifnet_lock_done(ifp);
3547		if (proto != NULL) {
3548			proto_media_ioctl ioctlp =
3549			    (proto->proto_kpi == kProtoKPI_v1 ?
3550			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
3551			result = EOPNOTSUPP;
3552			if (ioctlp != NULL)
3553				result = ioctlp(ifp, proto_fam, ioctl_code,
3554				    ioctl_arg);
3555			if_proto_free(proto);
3556
3557			/* Only update retval if no one has handled the ioctl */
3558			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
3559				if (result == ENOTSUP)
3560					result = EOPNOTSUPP;
3561				retval = result;
3562				if (retval && retval != EOPNOTSUPP)
3563					goto cleanup;
3564			}
3565		}
3566	}
3567
3568	/* retval is either 0 or EOPNOTSUPP */
3569
3570	/*
3571	 * Let the interface handle this ioctl.
3572	 * If it returns EOPNOTSUPP, ignore that, we may have
3573	 * already handled this in the protocol or family.
3574	 */
3575	if (ifp->if_ioctl)
3576		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
3577
3578	/* Only update retval if no one has handled the ioctl */
3579	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
3580		if (result == ENOTSUP)
3581			result = EOPNOTSUPP;
3582		retval = result;
3583		if (retval && retval != EOPNOTSUPP) {
3584			goto cleanup;
3585		}
3586	}
3587
3588cleanup:
3589	if (retval == EJUSTRETURN)
3590		retval = 0;
3591
3592	ifnet_decr_iorefcnt(ifp);
3593
3594	return (retval);
3595}
3596
3597__private_extern__ errno_t
3598dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
3599{
3600	errno_t	error = 0;
3601
3602
3603	if (ifp->if_set_bpf_tap) {
3604		/* Get an io reference on the interface if it is attached */
3605		if (!ifnet_is_attached(ifp, 1))
3606			return ENXIO;
3607		error = ifp->if_set_bpf_tap(ifp, mode, callback);
3608		ifnet_decr_iorefcnt(ifp);
3609	}
3610	return (error);
3611}
3612
3613errno_t
3614dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
3615    struct sockaddr *ll_addr, size_t ll_len)
3616{
3617	errno_t	result = EOPNOTSUPP;
3618	struct if_proto *proto;
3619	const struct sockaddr *verify;
3620	proto_media_resolve_multi resolvep;
3621
3622	if (!ifnet_is_attached(ifp, 1))
3623		return result;
3624
3625	bzero(ll_addr, ll_len);
3626
3627	/* Call the protocol first; callee holds a proto refcnt upon success */
3628	ifnet_lock_shared(ifp);
3629	proto = find_attached_proto(ifp, proto_addr->sa_family);
3630	ifnet_lock_done(ifp);
3631	if (proto != NULL) {
3632		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
3633		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
3634		if (resolvep != NULL)
3635			result = resolvep(ifp, proto_addr,
3636			    (struct sockaddr_dl*)(void *)ll_addr, ll_len);
3637		if_proto_free(proto);
3638	}
3639
3640	/* Let the interface verify the multicast address */
3641	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
3642		if (result == 0)
3643			verify = ll_addr;
3644		else
3645			verify = proto_addr;
3646		result = ifp->if_check_multi(ifp, verify);
3647	}
3648
3649	ifnet_decr_iorefcnt(ifp);
3650	return (result);
3651}
3652
3653__private_extern__ errno_t
3654dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
3655    const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto,
3656    const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto)
3657{
3658	struct if_proto *proto;
3659	errno_t	result = 0;
3660
3661	/* callee holds a proto refcnt upon success */
3662	ifnet_lock_shared(ifp);
3663	proto = find_attached_proto(ifp, target_proto->sa_family);
3664	ifnet_lock_done(ifp);
3665	if (proto == NULL) {
3666		result = ENOTSUP;
3667	} else {
3668		proto_media_send_arp	arpp;
3669		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
3670		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
3671		if (arpp == NULL)
3672			result = ENOTSUP;
3673		else
3674			result = arpp(ifp, arpop, sender_hw, sender_proto,
3675			    target_hw, target_proto);
3676		if_proto_free(proto);
3677	}
3678
3679	return (result);
3680}
3681
3682__private_extern__ errno_t
3683net_thread_check_lock(u_int32_t flag)
3684{
3685	struct uthread *uth = get_bsdthread_info(current_thread());
3686	return ((uth->uu_network_lock_held & flag) == flag);
3687}
3688
3689__private_extern__ void
3690net_thread_set_lock(u_int32_t flag)
3691{
3692	struct uthread *uth = get_bsdthread_info(current_thread());
3693
3694	VERIFY((uth->uu_network_lock_held & flag) != flag);
3695	uth->uu_network_lock_held |= flag;
3696}
3697
3698__private_extern__ void
3699net_thread_unset_lock(u_int32_t flag)
3700{
3701	struct uthread *uth = get_bsdthread_info(current_thread());
3702
3703	VERIFY((uth->uu_network_lock_held & flag) == flag);
3704	uth->uu_network_lock_held &= (~flag);
3705}
3706
3707static __inline__ int
3708_is_announcement(const struct sockaddr_in * sender_sin,
3709    const struct sockaddr_in * target_sin)
3710{
3711	if (sender_sin == NULL) {
3712		return (FALSE);
3713	}
3714	return (sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr);
3715}
3716
3717__private_extern__ errno_t
3718dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw,
3719    const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw,
3720    const struct sockaddr* target_proto0, u_int32_t rtflags)
3721{
3722	errno_t	result = 0;
3723	const struct sockaddr_in * sender_sin;
3724	const struct sockaddr_in * target_sin;
3725	struct sockaddr_inarp target_proto_sinarp;
3726	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;
3727
3728	if (target_proto == NULL || (sender_proto != NULL &&
3729	    sender_proto->sa_family != target_proto->sa_family))
3730		return (EINVAL);
3731
3732	/*
3733	 * If the target is a (default) router, provide that
3734	 * information to the send_arp callback routine.
3735	 */
3736	if (rtflags & RTF_ROUTER) {
3737		bcopy(target_proto, &target_proto_sinarp,
3738		    sizeof (struct sockaddr_in));
3739		target_proto_sinarp.sin_other |= SIN_ROUTER;
3740		target_proto = (struct sockaddr *)&target_proto_sinarp;
3741	}
3742
3743	/*
3744	 * If this is an ARP request and the target IP is IPv4LL,
3745	 * send the request on all interfaces.  The exception is
3746	 * an announcement, which must only appear on the specific
3747	 * interface.
3748	 */
3749	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
3750	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
3751	if (target_proto->sa_family == AF_INET &&
3752	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
3753	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
3754	    !_is_announcement(target_sin, sender_sin)) {
3755		ifnet_t		*ifp_list;
3756		u_int32_t	count;
3757		u_int32_t	ifp_on;
3758
3759		result = ENOTSUP;
3760
3761		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
3762			for (ifp_on = 0; ifp_on < count; ifp_on++) {
3763				errno_t new_result;
3764				ifaddr_t source_hw = NULL;
3765				ifaddr_t source_ip = NULL;
3766				struct sockaddr_in source_ip_copy;
3767				struct ifnet *cur_ifp = ifp_list[ifp_on];
3768
3769				/*
3770				 * Only arp on interfaces marked for IPv4LL
3771				 * ARPing.  This may mean that we don't ARP on
3772				 * the interface the subnet route points to.
3773				 */
3774				if (!(cur_ifp->if_eflags & IFEF_ARPLL))
3775					continue;
3776
3777				/* Find the source IP address */
3778				ifnet_lock_shared(cur_ifp);
3779				source_hw = cur_ifp->if_lladdr;
3780				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
3781				    ifa_link) {
3782					IFA_LOCK(source_ip);
3783					if (source_ip->ifa_addr != NULL &&
3784					    source_ip->ifa_addr->sa_family ==
3785					    AF_INET) {
3786						/* Copy the source IP address */
3787						source_ip_copy =
3788						    *(struct sockaddr_in *)
3789						    (void *)source_ip->ifa_addr;
3790						IFA_UNLOCK(source_ip);
3791						break;
3792					}
3793					IFA_UNLOCK(source_ip);
3794				}
3795
3796				/* No IP Source, don't arp */
3797				if (source_ip == NULL) {
3798					ifnet_lock_done(cur_ifp);
3799					continue;
3800				}
3801
3802				IFA_ADDREF(source_hw);
3803				ifnet_lock_done(cur_ifp);
3804
3805				/* Send the ARP */
3806				new_result = dlil_send_arp_internal(cur_ifp,
3807				    arpop, (struct sockaddr_dl *)(void *)
3808				    source_hw->ifa_addr,
3809				    (struct sockaddr *)&source_ip_copy, NULL,
3810				    target_proto);
3811
3812				IFA_REMREF(source_hw);
3813				if (result == ENOTSUP) {
3814					result = new_result;
3815				}
3816			}
3817			ifnet_list_free(ifp_list);
3818		}
3819	} else {
3820		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
3821		    sender_proto, target_hw, target_proto);
3822	}
3823
3824	return (result);
3825}
3826
3827/*
3828 * Caller must hold ifnet head lock.
3829 */
3830static int
3831ifnet_lookup(struct ifnet *ifp)
3832{
3833	struct ifnet *_ifp;
3834
3835	lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
3836	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
3837		if (_ifp == ifp)
3838			break;
3839	}
3840	return (_ifp != NULL);
3841}
3842/*
3843 * Caller has to pass a non-zero refio argument to get a
3844 * IO reference count. This will prevent ifnet_detach from
3845 * being called when there are outstanding io reference counts.
3846 */
3847int
3848ifnet_is_attached(struct ifnet *ifp, int refio)
3849{
3850	int ret;
3851
3852	lck_mtx_lock_spin(&ifp->if_ref_lock);
3853	if ((ret = ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) ==
3854	    IFRF_ATTACHED))) {
3855		if (refio > 0)
3856			ifp->if_refio++;
3857	}
3858	lck_mtx_unlock(&ifp->if_ref_lock);
3859
3860	return (ret);
3861}
3862
3863void
3864ifnet_decr_iorefcnt(struct ifnet *ifp)
3865{
3866	lck_mtx_lock_spin(&ifp->if_ref_lock);
3867	VERIFY(ifp->if_refio > 0);
3868	VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0);
3869	ifp->if_refio--;
3870
3871	/* if there are no more outstanding io references, wakeup the
3872	 * ifnet_detach thread if detaching flag is set.
3873	 */
3874	if (ifp->if_refio == 0 &&
3875		(ifp->if_refflags & IFRF_DETACHING) != 0) {
3876		wakeup(&(ifp->if_refio));
3877	}
3878	lck_mtx_unlock(&ifp->if_ref_lock);
3879}
3880
3881static void
3882dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
3883{
3884	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
3885	ctrace_t *tr;
3886	u_int32_t idx;
3887	u_int16_t *cnt;
3888
3889	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
3890		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
3891		/* NOTREACHED */
3892	}
3893
3894	if (refhold) {
3895		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
3896		tr = dl_if_dbg->dldbg_if_refhold;
3897	} else {
3898		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
3899		tr = dl_if_dbg->dldbg_if_refrele;
3900	}
3901
3902	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
3903	ctrace_record(&tr[idx]);
3904}
3905
3906errno_t
3907dlil_if_ref(struct ifnet *ifp)
3908{
3909	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
3910
3911	if (dl_if == NULL)
3912		return (EINVAL);
3913
3914	lck_mtx_lock_spin(&dl_if->dl_if_lock);
3915	++dl_if->dl_if_refcnt;
3916	if (dl_if->dl_if_refcnt == 0) {
3917		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
3918		/* NOTREACHED */
3919	}
3920	if (dl_if->dl_if_trace != NULL)
3921		(*dl_if->dl_if_trace)(dl_if, TRUE);
3922	lck_mtx_unlock(&dl_if->dl_if_lock);
3923
3924	return (0);
3925}
3926
3927errno_t
3928dlil_if_free(struct ifnet *ifp)
3929{
3930	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
3931
3932	if (dl_if == NULL)
3933		return (EINVAL);
3934
3935	lck_mtx_lock_spin(&dl_if->dl_if_lock);
3936	if (dl_if->dl_if_refcnt == 0) {
3937		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
3938		/* NOTREACHED */
3939	}
3940	--dl_if->dl_if_refcnt;
3941	if (dl_if->dl_if_trace != NULL)
3942		(*dl_if->dl_if_trace)(dl_if, FALSE);
3943	lck_mtx_unlock(&dl_if->dl_if_lock);
3944
3945	return (0);
3946}
3947
3948static errno_t
3949dlil_attach_protocol_internal(struct if_proto *proto,
3950    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count)
3951{
3952	struct kev_dl_proto_data ev_pr_data;
3953	struct ifnet *ifp = proto->ifp;
3954	int retval = 0;
3955	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
3956	struct if_proto *prev_proto;
3957	struct if_proto *_proto;
3958
3959	/* callee holds a proto refcnt upon success */
3960	ifnet_lock_exclusive(ifp);
3961	_proto = find_attached_proto(ifp, proto->protocol_family);
3962	if (_proto != NULL) {
3963		ifnet_lock_done(ifp);
3964		if_proto_free(_proto);
3965		return (EEXIST);
3966	}
3967
3968	/*
3969	 * Call family module add_proto routine so it can refine the
3970	 * demux descriptors as it wishes.
3971	 */
3972	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
3973	    demux_count);
3974	if (retval) {
3975		ifnet_lock_done(ifp);
3976		return (retval);
3977	}
3978
3979	/*
3980	 * Insert the protocol in the hash
3981	 */
3982	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
3983	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL)
3984		prev_proto = SLIST_NEXT(prev_proto, next_hash);
3985	if (prev_proto)
3986		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
3987	else
3988		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
3989		    proto, next_hash);
3990
3991	/* hold a proto refcnt for attach */
3992	if_proto_ref(proto);
3993
3994	/*
3995	 * The reserved field carries the number of protocol still attached
3996	 * (subject to change)
3997	 */
3998	ev_pr_data.proto_family = proto->protocol_family;
3999	ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp);
4000	ifnet_lock_done(ifp);
4001
4002	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
4003	    (struct net_event_data *)&ev_pr_data,
4004	    sizeof (struct kev_dl_proto_data));
4005	return (retval);
4006}
4007
4008errno_t
4009ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
4010    const struct ifnet_attach_proto_param *proto_details)
4011{
4012	int retval = 0;
4013	struct if_proto  *ifproto = NULL;
4014
4015	ifnet_head_lock_shared();
4016	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4017		retval = EINVAL;
4018		goto end;
4019	}
4020	/* Check that the interface is in the global list */
4021	if (!ifnet_lookup(ifp)) {
4022		retval = ENXIO;
4023		goto end;
4024	}
4025
4026	ifproto = zalloc(dlif_proto_zone);
4027	if (ifproto == NULL) {
4028		retval = ENOMEM;
4029		goto end;
4030	}
4031	bzero(ifproto, dlif_proto_size);
4032
4033	/* refcnt held above during lookup */
4034	ifproto->ifp = ifp;
4035	ifproto->protocol_family = protocol;
4036	ifproto->proto_kpi = kProtoKPI_v1;
4037	ifproto->kpi.v1.input = proto_details->input;
4038	ifproto->kpi.v1.pre_output = proto_details->pre_output;
4039	ifproto->kpi.v1.event = proto_details->event;
4040	ifproto->kpi.v1.ioctl = proto_details->ioctl;
4041	ifproto->kpi.v1.detached = proto_details->detached;
4042	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
4043	ifproto->kpi.v1.send_arp = proto_details->send_arp;
4044
4045	retval = dlil_attach_protocol_internal(ifproto,
4046	    proto_details->demux_list, proto_details->demux_count);
4047
4048	if (dlil_verbose) {
4049		printf("%s%d: attached v1 protocol %d\n", ifp->if_name,
4050		    ifp->if_unit, protocol);
4051	}
4052
4053end:
4054	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4055		DLIL_PRINTF("%s%d: failed to attach v1 protocol %d (err=%d)\n",
4056		    ifp->if_name, ifp->if_unit, protocol, retval);
4057	}
4058	ifnet_head_done();
4059	if (retval != 0  && ifproto != NULL)
4060		zfree(dlif_proto_zone, ifproto);
4061	return (retval);
4062}
4063
4064errno_t
4065ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
4066    const struct ifnet_attach_proto_param_v2 *proto_details)
4067{
4068	int retval = 0;
4069	struct if_proto  *ifproto = NULL;
4070
4071	ifnet_head_lock_shared();
4072	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
4073		retval = EINVAL;
4074		goto end;
4075	}
4076	/* Check that the interface is in the global list */
4077	if (!ifnet_lookup(ifp)) {
4078		retval = ENXIO;
4079		goto end;
4080	}
4081
4082	ifproto = zalloc(dlif_proto_zone);
4083	if (ifproto == NULL) {
4084		retval = ENOMEM;
4085		goto end;
4086	}
4087	bzero(ifproto, sizeof(*ifproto));
4088
4089	/* refcnt held above during lookup */
4090	ifproto->ifp = ifp;
4091	ifproto->protocol_family = protocol;
4092	ifproto->proto_kpi = kProtoKPI_v2;
4093	ifproto->kpi.v2.input = proto_details->input;
4094	ifproto->kpi.v2.pre_output = proto_details->pre_output;
4095	ifproto->kpi.v2.event = proto_details->event;
4096	ifproto->kpi.v2.ioctl = proto_details->ioctl;
4097	ifproto->kpi.v2.detached = proto_details->detached;
4098	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
4099	ifproto->kpi.v2.send_arp = proto_details->send_arp;
4100
4101	retval = dlil_attach_protocol_internal(ifproto,
4102	    proto_details->demux_list, proto_details->demux_count);
4103
4104	if (dlil_verbose) {
4105		printf("%s%d: attached v2 protocol %d\n", ifp->if_name,
4106		    ifp->if_unit, protocol);
4107	}
4108
4109end:
4110	if (retval != 0 && retval != EEXIST && ifp != NULL) {
4111		DLIL_PRINTF("%s%d: failed to attach v2 protocol %d (err=%d)\n",
4112		    ifp->if_name, ifp->if_unit, protocol, retval);
4113	}
4114	ifnet_head_done();
4115	if (retval != 0 && ifproto != NULL)
4116		zfree(dlif_proto_zone, ifproto);
4117	return (retval);
4118}
4119
4120errno_t
4121ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
4122{
4123	struct if_proto *proto = NULL;
4124	int	retval = 0;
4125
4126	if (ifp == NULL || proto_family == 0) {
4127		retval = EINVAL;
4128		goto end;
4129	}
4130
4131	ifnet_lock_exclusive(ifp);
4132	/* callee holds a proto refcnt upon success */
4133	proto = find_attached_proto(ifp, proto_family);
4134	if (proto == NULL) {
4135		retval = ENXIO;
4136		ifnet_lock_done(ifp);
4137		goto end;
4138	}
4139
4140	/* call family module del_proto */
4141	if (ifp->if_del_proto)
4142		ifp->if_del_proto(ifp, proto->protocol_family);
4143
4144	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
4145	    proto, if_proto, next_hash);
4146
4147	if (proto->proto_kpi == kProtoKPI_v1) {
4148		proto->kpi.v1.input = ifproto_media_input_v1;
4149		proto->kpi.v1.pre_output= ifproto_media_preout;
4150		proto->kpi.v1.event = ifproto_media_event;
4151		proto->kpi.v1.ioctl = ifproto_media_ioctl;
4152		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
4153		proto->kpi.v1.send_arp = ifproto_media_send_arp;
4154	} else {
4155		proto->kpi.v2.input = ifproto_media_input_v2;
4156		proto->kpi.v2.pre_output = ifproto_media_preout;
4157		proto->kpi.v2.event = ifproto_media_event;
4158		proto->kpi.v2.ioctl = ifproto_media_ioctl;
4159		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
4160		proto->kpi.v2.send_arp = ifproto_media_send_arp;
4161	}
4162	proto->detached = 1;
4163	ifnet_lock_done(ifp);
4164
4165	if (dlil_verbose) {
4166		printf("%s%d: detached %s protocol %d\n", ifp->if_name,
4167		    ifp->if_unit, (proto->proto_kpi == kProtoKPI_v1) ?
4168		    "v1" : "v2", proto_family);
4169	}
4170
4171	/* release proto refcnt held during protocol attach */
4172	if_proto_free(proto);
4173
4174	/*
4175	 * Release proto refcnt held during lookup; the rest of
4176	 * protocol detach steps will happen when the last proto
4177	 * reference is released.
4178	 */
4179	if_proto_free(proto);
4180
4181end:
4182	return (retval);
4183}
4184
4185
4186static errno_t
4187ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
4188    struct mbuf *packet, char *header)
4189{
4190#pragma unused(ifp, protocol, packet, header)
4191	return (ENXIO);
4192}
4193
4194static errno_t
4195ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
4196    struct mbuf *packet)
4197{
4198#pragma unused(ifp, protocol, packet)
4199	return (ENXIO);
4200
4201}
4202
4203static errno_t
4204ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
4205    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
4206    char *link_layer_dest)
4207{
4208#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
4209	return (ENXIO);
4210
4211}
4212
4213static void
4214ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
4215    const struct kev_msg *event)
4216{
4217#pragma unused(ifp, protocol, event)
4218}
4219
4220static errno_t
4221ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
4222    unsigned long command, void *argument)
4223{
4224#pragma unused(ifp, protocol, command, argument)
4225	return (ENXIO);
4226}
4227
4228static errno_t
4229ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
4230    struct sockaddr_dl *out_ll, size_t ll_len)
4231{
4232#pragma unused(ifp, proto_addr, out_ll, ll_len)
4233	return (ENXIO);
4234}
4235
4236static errno_t
4237ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
4238    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
4239    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
4240{
4241#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
4242	return (ENXIO);
4243}
4244
4245extern int if_next_index(void);
4246
4247errno_t
4248ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
4249{
4250	struct ifnet *tmp_if;
4251	struct ifaddr *ifa;
4252	struct if_data_internal if_data_saved;
4253	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4254	struct dlil_threading_info *dl_inp;
4255	u_int32_t sflags = 0;
4256	int err;
4257
4258	if (ifp == NULL)
4259		return (EINVAL);
4260
4261	/*
4262	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
4263	 * prevent the interface from being configured while it is
4264	 * embryonic, as ifnet_head_lock is dropped and reacquired
4265	 * below prior to marking the ifnet with IFRF_ATTACHED.
4266	 */
4267	dlil_if_lock();
4268	ifnet_head_lock_exclusive();
4269	/* Verify we aren't already on the list */
4270	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
4271		if (tmp_if == ifp) {
4272			ifnet_head_done();
4273			dlil_if_unlock();
4274			return (EEXIST);
4275		}
4276	}
4277
4278	lck_mtx_lock_spin(&ifp->if_ref_lock);
4279	if (ifp->if_refflags & IFRF_ATTACHED) {
4280		panic_plain("%s: flags mismatch (attached set) ifp=%p",
4281		    __func__, ifp);
4282		/* NOTREACHED */
4283	}
4284	lck_mtx_unlock(&ifp->if_ref_lock);
4285
4286	ifnet_lock_exclusive(ifp);
4287
4288	/* Sanity check */
4289	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
4290	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
4291
4292	if (ll_addr != NULL) {
4293		if (ifp->if_addrlen == 0) {
4294			ifp->if_addrlen = ll_addr->sdl_alen;
4295		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
4296			ifnet_lock_done(ifp);
4297			ifnet_head_done();
4298			dlil_if_unlock();
4299			return (EINVAL);
4300		}
4301	}
4302
4303	/*
4304	 * Allow interfaces without protocol families to attach
4305	 * only if they have the necessary fields filled out.
4306	 */
4307	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
4308		DLIL_PRINTF("%s: Attempt to attach interface without "
4309		    "family module - %d\n", __func__, ifp->if_family);
4310		ifnet_lock_done(ifp);
4311		ifnet_head_done();
4312		dlil_if_unlock();
4313		return (ENODEV);
4314	}
4315
4316	/* Allocate protocol hash table */
4317	VERIFY(ifp->if_proto_hash == NULL);
4318	ifp->if_proto_hash = zalloc(dlif_phash_zone);
4319	if (ifp->if_proto_hash == NULL) {
4320		ifnet_lock_done(ifp);
4321		ifnet_head_done();
4322		dlil_if_unlock();
4323		return (ENOBUFS);
4324	}
4325	bzero(ifp->if_proto_hash, dlif_phash_size);
4326
4327	lck_mtx_lock_spin(&ifp->if_flt_lock);
4328	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
4329	TAILQ_INIT(&ifp->if_flt_head);
4330	VERIFY(ifp->if_flt_busy == 0);
4331	VERIFY(ifp->if_flt_waiters == 0);
4332	lck_mtx_unlock(&ifp->if_flt_lock);
4333
4334	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
4335	TAILQ_INIT(&ifp->if_prefixhead);
4336
4337	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
4338		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
4339		LIST_INIT(&ifp->if_multiaddrs);
4340	}
4341
4342	VERIFY(ifp->if_allhostsinm == NULL);
4343	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
4344	TAILQ_INIT(&ifp->if_addrhead);
4345
4346	if (ifp->if_index == 0) {
4347		int idx = if_next_index();
4348
4349		if (idx == -1) {
4350			ifp->if_index = 0;
4351			ifnet_lock_done(ifp);
4352			ifnet_head_done();
4353			dlil_if_unlock();
4354			return (ENOBUFS);
4355		}
4356		ifp->if_index = idx;
4357	}
4358	/* There should not be anything occupying this slot */
4359	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
4360
4361	/* allocate (if needed) and initialize a link address */
4362	VERIFY(!(dl_if->dl_if_flags & DLIF_REUSE) || ifp->if_lladdr != NULL);
4363	ifa = dlil_alloc_lladdr(ifp, ll_addr);
4364	if (ifa == NULL) {
4365		ifnet_lock_done(ifp);
4366		ifnet_head_done();
4367		dlil_if_unlock();
4368		return (ENOBUFS);
4369	}
4370
4371	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
4372	ifnet_addrs[ifp->if_index - 1] = ifa;
4373
4374	/* make this address the first on the list */
4375	IFA_LOCK(ifa);
4376	/* hold a reference for ifnet_addrs[] */
4377	IFA_ADDREF_LOCKED(ifa);
4378	/* if_attach_link_ifa() holds a reference for ifa_link */
4379	if_attach_link_ifa(ifp, ifa);
4380	IFA_UNLOCK(ifa);
4381
4382#if CONFIG_MACF_NET
4383	mac_ifnet_label_associate(ifp);
4384#endif
4385
4386	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
4387	ifindex2ifnet[ifp->if_index] = ifp;
4388
4389	/* Hold a reference to the underlying dlil_ifnet */
4390	ifnet_reference(ifp);
4391
4392	/* Clear stats (save and restore other fields that we care) */
4393	if_data_saved = ifp->if_data;
4394	bzero(&ifp->if_data, sizeof (ifp->if_data));
4395	ifp->if_data.ifi_type = if_data_saved.ifi_type;
4396	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
4397	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
4398	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
4399	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
4400	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
4401	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
4402	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
4403	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
4404	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
4405	ifnet_touch_lastchange(ifp);
4406
4407	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
4408	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED);
4409
4410	/* By default, use SFB and enable flow advisory */
4411	sflags = PKTSCHEDF_QALG_SFB;
4412	if (if_flowadv)
4413		sflags |= PKTSCHEDF_QALG_FLOWCTL;
4414
4415	/* Initialize transmit queue(s) */
4416	err = ifclassq_setup(ifp, sflags, (dl_if->dl_if_flags & DLIF_REUSE));
4417	if (err != 0) {
4418		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
4419		    "err=%d", __func__, ifp, err);
4420		/* NOTREACHED */
4421	}
4422
4423	/* Sanity checks on the input thread storage */
4424	dl_inp = &dl_if->dl_if_inpstorage;
4425	bzero(&dl_inp->stats, sizeof (dl_inp->stats));
4426	VERIFY(dl_inp->input_waiting == 0);
4427	VERIFY(dl_inp->wtot == 0);
4428	VERIFY(dl_inp->ifp == NULL);
4429	VERIFY(qhead(&dl_inp->rcvq_pkts) == NULL && qempty(&dl_inp->rcvq_pkts));
4430	VERIFY(qlimit(&dl_inp->rcvq_pkts) == 0);
4431	VERIFY(!dl_inp->net_affinity);
4432	VERIFY(ifp->if_inp == NULL);
4433	VERIFY(dl_inp->input_thr == THREAD_NULL);
4434	VERIFY(dl_inp->wloop_thr == THREAD_NULL);
4435	VERIFY(dl_inp->poll_thr == THREAD_NULL);
4436	VERIFY(dl_inp->tag == 0);
4437	VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
4438	bzero(&dl_inp->tstats, sizeof (dl_inp->tstats));
4439	bzero(&dl_inp->pstats, sizeof (dl_inp->pstats));
4440	bzero(&dl_inp->sstats, sizeof (dl_inp->sstats));
4441#if IFNET_INPUT_SANITY_CHK
4442	VERIFY(dl_inp->input_mbuf_cnt == 0);
4443#endif /* IFNET_INPUT_SANITY_CHK */
4444
4445	/*
4446	 * A specific DLIL input thread is created per Ethernet/cellular
4447	 * interface or for an interface which supports opportunistic
4448	 * input polling.  Pseudo interfaces or other types of interfaces
4449	 * use the main input thread instead.
4450	 */
4451	if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) ||
4452	    ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) {
4453		ifp->if_inp = dl_inp;
4454		err = dlil_create_input_thread(ifp, ifp->if_inp);
4455		if (err != 0) {
4456			panic_plain("%s: ifp=%p couldn't get an input thread; "
4457			    "err=%d", __func__, ifp, err);
4458			/* NOTREACHED */
4459		}
4460	}
4461
4462	/*
4463	 * If the driver supports the new transmit model, create a workloop
4464	 * starter thread to invoke the if_start callback where the packets
4465	 * may be dequeued and transmitted.
4466	 */
4467	if (ifp->if_eflags & IFEF_TXSTART) {
4468		VERIFY(ifp->if_start != NULL);
4469		VERIFY(ifp->if_start_thread == THREAD_NULL);
4470
4471		ifnet_set_start_cycle(ifp, NULL);
4472		ifp->if_start_active = 0;
4473		ifp->if_start_req = 0;
4474		if ((err = kernel_thread_start(ifnet_start_thread_fn, ifp,
4475		    &ifp->if_start_thread)) != KERN_SUCCESS) {
4476			panic_plain("%s: ifp=%p couldn't get a start thread; "
4477			    "err=%d", __func__, ifp, err);
4478			/* NOTREACHED */
4479		}
4480		ml_thread_policy(ifp->if_start_thread, MACHINE_GROUP,
4481		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
4482	}
4483
4484	/*
4485	 * If the driver supports the new receive model, create a poller
4486	 * thread to invoke if_input_poll callback where the packets may
4487	 * be dequeued from the driver and processed for reception.
4488	 */
4489	if (ifp->if_eflags & IFEF_RXPOLL) {
4490		VERIFY(ifp->if_input_poll != NULL);
4491		VERIFY(ifp->if_input_ctl != NULL);
4492		VERIFY(ifp->if_poll_thread == THREAD_NULL);
4493
4494		ifnet_set_poll_cycle(ifp, NULL);
4495		ifp->if_poll_update = 0;
4496		ifp->if_poll_active = 0;
4497		ifp->if_poll_req = 0;
4498		if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp,
4499		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
4500			panic_plain("%s: ifp=%p couldn't get a poll thread; "
4501			    "err=%d", __func__, ifp, err);
4502			/* NOTREACHED */
4503		}
4504		ml_thread_policy(ifp->if_poll_thread, MACHINE_GROUP,
4505		    (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP));
4506	}
4507
4508	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
4509	VERIFY(ifp->if_desc.ifd_len == 0);
4510	VERIFY(ifp->if_desc.ifd_desc != NULL);
4511
4512	/* Record attach PC stacktrace */
4513	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
4514
4515	ifp->if_updatemcasts = 0;
4516	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
4517		struct ifmultiaddr *ifma;
4518		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
4519			IFMA_LOCK(ifma);
4520			if (ifma->ifma_addr->sa_family == AF_LINK ||
4521			    ifma->ifma_addr->sa_family == AF_UNSPEC)
4522				ifp->if_updatemcasts++;
4523			IFMA_UNLOCK(ifma);
4524		}
4525
4526		printf("%s%d: attached with %d suspended link-layer multicast "
4527		    "membership(s)\n", ifp->if_name, ifp->if_unit,
4528		    ifp->if_updatemcasts);
4529	}
4530
4531	ifnet_lock_done(ifp);
4532	ifnet_head_done();
4533
4534	lck_mtx_lock(&ifp->if_cached_route_lock);
4535	/* Enable forwarding cached route */
4536	ifp->if_fwd_cacheok = 1;
4537	/* Clean up any existing cached routes */
4538	if (ifp->if_fwd_route.ro_rt != NULL)
4539		rtfree(ifp->if_fwd_route.ro_rt);
4540	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
4541	if (ifp->if_src_route.ro_rt != NULL)
4542		rtfree(ifp->if_src_route.ro_rt);
4543	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
4544	if (ifp->if_src_route6.ro_rt != NULL)
4545		rtfree(ifp->if_src_route6.ro_rt);
4546	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
4547	lck_mtx_unlock(&ifp->if_cached_route_lock);
4548
4549	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
4550
4551	/*
4552	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
4553	 * and trees; do this before the ifnet is marked as attached.
4554	 * The ifnet keeps the reference to the info structures even after
4555	 * the ifnet is detached, since the network-layer records still
4556	 * refer to the info structures even after that.  This also
4557	 * makes it possible for them to still function after the ifnet
4558	 * is recycled or reattached.
4559	 */
4560#if INET
4561	if (IGMP_IFINFO(ifp) == NULL) {
4562		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, M_WAITOK);
4563		VERIFY(IGMP_IFINFO(ifp) != NULL);
4564	} else {
4565		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
4566		igmp_domifreattach(IGMP_IFINFO(ifp));
4567	}
4568#endif /* INET */
4569#if INET6
4570	if (MLD_IFINFO(ifp) == NULL) {
4571		MLD_IFINFO(ifp) = mld_domifattach(ifp, M_WAITOK);
4572		VERIFY(MLD_IFINFO(ifp) != NULL);
4573	} else {
4574		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
4575		mld_domifreattach(MLD_IFINFO(ifp));
4576	}
4577#endif /* INET6 */
4578
4579	/*
4580	 * Finally, mark this ifnet as attached.
4581	 */
4582	lck_mtx_lock(rnh_lock);
4583	ifnet_lock_exclusive(ifp);
4584	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
4585	ifp->if_lqm = (ifp == lo_ifp) ? IFNET_LQM_THRESH_GOOD :
4586	    IFNET_LQM_THRESH_UNKNOWN;
4587	lck_mtx_lock_spin(&ifp->if_ref_lock);
4588	ifp->if_refflags = IFRF_ATTACHED;
4589	lck_mtx_unlock(&ifp->if_ref_lock);
4590	if (net_rtref) {
4591		/* boot-args override; enable idle notification */
4592		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
4593		    IFRF_IDLE_NOTIFY);
4594	} else {
4595		/* apply previous request(s) to set the idle flags, if any */
4596		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
4597		    ifp->if_idle_new_flags_mask);
4598
4599	}
4600	ifnet_lock_done(ifp);
4601	lck_mtx_unlock(rnh_lock);
4602	dlil_if_unlock();
4603
4604#if PF
4605	/*
4606	 * Attach packet filter to this interface, if enabled.
4607	 */
4608	pf_ifnet_hook(ifp, 1);
4609#endif /* PF */
4610
4611	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);
4612
4613	if (dlil_verbose) {
4614		printf("%s%d: attached%s\n", ifp->if_name, ifp->if_unit,
4615		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
4616	}
4617
4618	return (0);
4619}
4620
4621/*
4622 * Prepare the storage for the first/permanent link address, which must
4623 * must have the same lifetime as the ifnet itself.  Although the link
4624 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
4625 * its location in memory must never change as it may still be referred
4626 * to by some parts of the system afterwards (unfortunate implementation
4627 * artifacts inherited from BSD.)
4628 *
4629 * Caller must hold ifnet lock as writer.
4630 */
4631static struct ifaddr *
4632dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
4633{
4634	struct ifaddr *ifa, *oifa;
4635	struct sockaddr_dl *asdl, *msdl;
4636	char workbuf[IFNAMSIZ*2];
4637	int namelen, masklen, socksize;
4638	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
4639
4640	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
4641	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);
4642
4643	namelen = snprintf(workbuf, sizeof (workbuf), "%s%d",
4644	    ifp->if_name, ifp->if_unit);
4645	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
4646	socksize = masklen + ifp->if_addrlen;
4647#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
4648	if ((u_int32_t)socksize < sizeof (struct sockaddr_dl))
4649		socksize = sizeof(struct sockaddr_dl);
4650	socksize = ROUNDUP(socksize);
4651#undef ROUNDUP
4652
4653	ifa = ifp->if_lladdr;
4654	if (socksize > DLIL_SDLMAXLEN ||
4655	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
4656		/*
4657		 * Rare, but in the event that the link address requires
4658		 * more storage space than DLIL_SDLMAXLEN, allocate the
4659		 * largest possible storages for address and mask, such
4660		 * that we can reuse the same space when if_addrlen grows.
4661		 * This same space will be used when if_addrlen shrinks.
4662		 */
4663		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
4664			int ifasize = sizeof (*ifa) + 2 * SOCK_MAXADDRLEN;
4665			ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
4666			if (ifa == NULL)
4667				return (NULL);
4668			ifa_lock_init(ifa);
4669			/* Don't set IFD_ALLOC, as this is permanent */
4670			ifa->ifa_debug = IFD_LINK;
4671		}
4672		IFA_LOCK(ifa);
4673		/* address and mask sockaddr_dl locations */
4674		asdl = (struct sockaddr_dl *)(ifa + 1);
4675		bzero(asdl, SOCK_MAXADDRLEN);
4676		msdl = (struct sockaddr_dl *)(void *)
4677		    ((char *)asdl + SOCK_MAXADDRLEN);
4678		bzero(msdl, SOCK_MAXADDRLEN);
4679	} else {
4680		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
4681		/*
4682		 * Use the storage areas for address and mask within the
4683		 * dlil_ifnet structure.  This is the most common case.
4684		 */
4685		if (ifa == NULL) {
4686			ifa = &dl_if->dl_if_lladdr.ifa;
4687			ifa_lock_init(ifa);
4688			/* Don't set IFD_ALLOC, as this is permanent */
4689			ifa->ifa_debug = IFD_LINK;
4690		}
4691		IFA_LOCK(ifa);
4692		/* address and mask sockaddr_dl locations */
4693		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
4694		bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl));
4695		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
4696		bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl));
4697	}
4698
4699	/* hold a permanent reference for the ifnet itself */
4700	IFA_ADDREF_LOCKED(ifa);
4701	oifa = ifp->if_lladdr;
4702	ifp->if_lladdr = ifa;
4703
4704	VERIFY(ifa->ifa_debug == IFD_LINK);
4705	ifa->ifa_ifp = ifp;
4706	ifa->ifa_rtrequest = link_rtrequest;
4707	ifa->ifa_addr = (struct sockaddr *)asdl;
4708	asdl->sdl_len = socksize;
4709	asdl->sdl_family = AF_LINK;
4710	bcopy(workbuf, asdl->sdl_data, namelen);
4711	asdl->sdl_nlen = namelen;
4712	asdl->sdl_index = ifp->if_index;
4713	asdl->sdl_type = ifp->if_type;
4714	if (ll_addr != NULL) {
4715		asdl->sdl_alen = ll_addr->sdl_alen;
4716		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
4717	} else {
4718		asdl->sdl_alen = 0;
4719	}
4720	ifa->ifa_netmask = (struct sockaddr*)msdl;
4721	msdl->sdl_len = masklen;
4722	while (namelen != 0)
4723		msdl->sdl_data[--namelen] = 0xff;
4724	IFA_UNLOCK(ifa);
4725
4726	if (oifa != NULL)
4727		IFA_REMREF(oifa);
4728
4729	return (ifa);
4730}
4731
4732static void
4733if_purgeaddrs(struct ifnet *ifp)
4734{
4735#if INET
4736	in_purgeaddrs(ifp);
4737#endif /* INET */
4738#if INET6
4739	in6_purgeaddrs(ifp);
4740#endif /* INET6 */
4741#if NETAT
4742	at_purgeaddrs(ifp);
4743#endif
4744}
4745
4746errno_t
4747ifnet_detach(ifnet_t ifp)
4748{
4749	if (ifp == NULL)
4750		return (EINVAL);
4751
4752	lck_mtx_lock(rnh_lock);
4753	ifnet_head_lock_exclusive();
4754	ifnet_lock_exclusive(ifp);
4755
4756	/*
4757	 * Check to see if this interface has previously triggered
4758	 * aggressive protocol draining; if so, decrement the global
4759	 * refcnt and clear PR_AGGDRAIN on the route domain if
4760	 * there are no more of such an interface around.
4761	 */
4762	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
4763
4764	lck_mtx_lock_spin(&ifp->if_ref_lock);
4765	 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
4766		lck_mtx_unlock(&ifp->if_ref_lock);
4767		ifnet_lock_done(ifp);
4768		ifnet_head_done();
4769		lck_mtx_unlock(rnh_lock);
4770		return (EINVAL);
4771	} else if (ifp->if_refflags & IFRF_DETACHING) {
4772		/* Interface has already been detached */
4773		lck_mtx_unlock(&ifp->if_ref_lock);
4774		ifnet_lock_done(ifp);
4775		ifnet_head_done();
4776		lck_mtx_unlock(rnh_lock);
4777		return (ENXIO);
4778	}
4779	/* Indicate this interface is being detached */
4780	ifp->if_refflags &= ~IFRF_ATTACHED;
4781	ifp->if_refflags |= IFRF_DETACHING;
4782	lck_mtx_unlock(&ifp->if_ref_lock);
4783
4784	if (dlil_verbose)
4785		printf("%s%d: detaching\n", ifp->if_name, ifp->if_unit);
4786
4787	/*
4788	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
4789	 * no longer be visible during lookups from this point.
4790	 */
4791	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
4792	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
4793	ifp->if_link.tqe_next = NULL;
4794	ifp->if_link.tqe_prev = NULL;
4795	ifindex2ifnet[ifp->if_index] = NULL;
4796
4797	/* Record detach PC stacktrace */
4798	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
4799
4800	ifnet_lock_done(ifp);
4801	ifnet_head_done();
4802	lck_mtx_unlock(rnh_lock);
4803
4804	/* Reset Link Quality Metric (unless loopback [lo0]) */
4805	if (ifp != lo_ifp)
4806		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF);
4807
4808	/* Reset TCP local statistics */
4809	if (ifp->if_tcp_stat != NULL)
4810		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
4811
4812	/* Reset UDP local statistics */
4813	if (ifp->if_udp_stat != NULL)
4814		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
4815
4816	/* Let BPF know we're detaching */
4817	bpfdetach(ifp);
4818
4819	/* Mark the interface as DOWN */
4820	if_down(ifp);
4821
4822	/* Drain send queue */
4823	ifclassq_teardown(ifp);
4824
4825	/* Disable forwarding cached route */
4826	lck_mtx_lock(&ifp->if_cached_route_lock);
4827	ifp->if_fwd_cacheok = 0;
4828	lck_mtx_unlock(&ifp->if_cached_route_lock);
4829
4830	/*
4831	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
4832	 * references to the info structures and leave them attached to
4833	 * this ifnet.
4834	 */
4835#if INET
4836	igmp_domifdetach(ifp);
4837#endif /* INET */
4838#if INET6
4839	mld_domifdetach(ifp);
4840#endif /* INET6 */
4841
4842	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0);
4843
4844	/* Let worker thread take care of the rest, to avoid reentrancy */
4845	dlil_if_lock();
4846	ifnet_detaching_enqueue(ifp);
4847	dlil_if_unlock();
4848
4849	return (0);
4850}
4851
4852static void
4853ifnet_detaching_enqueue(struct ifnet *ifp)
4854{
4855	dlil_if_lock_assert();
4856
4857	++ifnet_detaching_cnt;
4858	VERIFY(ifnet_detaching_cnt != 0);
4859	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
4860	wakeup((caddr_t)&ifnet_delayed_run);
4861}
4862
4863static struct ifnet *
4864ifnet_detaching_dequeue(void)
4865{
4866	struct ifnet *ifp;
4867
4868	dlil_if_lock_assert();
4869
4870	ifp = TAILQ_FIRST(&ifnet_detaching_head);
4871	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
4872	if (ifp != NULL) {
4873		VERIFY(ifnet_detaching_cnt != 0);
4874		--ifnet_detaching_cnt;
4875		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
4876		ifp->if_detaching_link.tqe_next = NULL;
4877		ifp->if_detaching_link.tqe_prev = NULL;
4878	}
4879	return (ifp);
4880}
4881
4882static int
4883ifnet_detacher_thread_cont(int err)
4884{
4885#pragma unused(err)
4886	struct ifnet *ifp;
4887
4888	for (;;) {
4889		dlil_if_lock_assert();
4890		while (ifnet_detaching_cnt == 0) {
4891			(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
4892			    (PZERO - 1), "ifnet_detacher_cont", 0,
4893			    ifnet_detacher_thread_cont);
4894			/* NOTREACHED */
4895		}
4896
4897		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
4898
4899		/* Take care of detaching ifnet */
4900		ifp = ifnet_detaching_dequeue();
4901		if (ifp != NULL) {
4902			dlil_if_unlock();
4903			ifnet_detach_final(ifp);
4904			dlil_if_lock();
4905		}
4906	}
4907	/* NOTREACHED */
4908	return (0);
4909}
4910
4911static void
4912ifnet_detacher_thread_func(void *v, wait_result_t w)
4913{
4914#pragma unused(v, w)
4915	dlil_if_lock();
4916	(void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock,
4917	    (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont);
4918	/*
4919	 * msleep0() shouldn't have returned as PCATCH was not set;
4920	 * therefore assert in this case.
4921	 */
4922	dlil_if_unlock();
4923	VERIFY(0);
4924}
4925
4926static void
4927ifnet_detach_final(struct ifnet *ifp)
4928{
4929	struct ifnet_filter *filter, *filter_next;
4930	struct ifnet_filter_head fhead;
4931	struct dlil_threading_info *inp;
4932	struct ifaddr *ifa;
4933	ifnet_detached_func if_free;
4934	int i;
4935
4936	lck_mtx_lock(&ifp->if_ref_lock);
4937	if (!(ifp->if_refflags & IFRF_DETACHING)) {
4938		panic("%s: flags mismatch (detaching not set) ifp=%p",
4939		    __func__, ifp);
4940		/* NOTREACHED */
4941	}
4942
4943	/*
4944	 * Wait until the existing IO references get released
4945	 * before we proceed with ifnet_detach.  This is not a
4946	 * common case, so block without using a continuation.
4947	 */
4948	while (ifp->if_refio > 0) {
4949		printf("%s: Waiting for IO references on %s%d interface "
4950		    "to be released\n", __func__, ifp->if_name, ifp->if_unit);
4951		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
4952			(PZERO - 1), "ifnet_ioref_wait", NULL);
4953	}
4954	lck_mtx_unlock(&ifp->if_ref_lock);
4955
4956	/* Detach interface filters */
4957	lck_mtx_lock(&ifp->if_flt_lock);
4958	if_flt_monitor_enter(ifp);
4959
4960	lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
4961	fhead = ifp->if_flt_head;
4962	TAILQ_INIT(&ifp->if_flt_head);
4963
4964	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
4965		filter_next = TAILQ_NEXT(filter, filt_next);
4966		lck_mtx_unlock(&ifp->if_flt_lock);
4967
4968		dlil_detach_filter_internal(filter, 1);
4969		lck_mtx_lock(&ifp->if_flt_lock);
4970	}
4971	if_flt_monitor_leave(ifp);
4972	lck_mtx_unlock(&ifp->if_flt_lock);
4973
4974	/* Tell upper layers to drop their network addresses */
4975	if_purgeaddrs(ifp);
4976
4977	ifnet_lock_exclusive(ifp);
4978
4979	/* Uplumb all protocols */
4980	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
4981		struct if_proto *proto;
4982
4983		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
4984		while (proto != NULL) {
4985			protocol_family_t family = proto->protocol_family;
4986			ifnet_lock_done(ifp);
4987			proto_unplumb(family, ifp);
4988			ifnet_lock_exclusive(ifp);
4989			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
4990		}
4991		/* There should not be any protocols left */
4992		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
4993	}
4994	zfree(dlif_phash_zone, ifp->if_proto_hash);
4995	ifp->if_proto_hash = NULL;
4996
4997	/* Detach (permanent) link address from if_addrhead */
4998	ifa = TAILQ_FIRST(&ifp->if_addrhead);
4999	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
5000	IFA_LOCK(ifa);
5001	if_detach_link_ifa(ifp, ifa);
5002	IFA_UNLOCK(ifa);
5003
5004	/* Remove (permanent) link address from ifnet_addrs[] */
5005	IFA_REMREF(ifa);
5006	ifnet_addrs[ifp->if_index - 1] = NULL;
5007
5008	/* This interface should not be on {ifnet_head,detaching} */
5009	VERIFY(ifp->if_link.tqe_next == NULL);
5010	VERIFY(ifp->if_link.tqe_prev == NULL);
5011	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
5012	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
5013
5014	/* Prefix list should be empty by now */
5015	VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead));
5016
5017	/* The slot should have been emptied */
5018	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
5019
5020	/* There should not be any addresses left */
5021	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
5022
5023	/*
5024	 * Signal the starter thread to terminate itself.
5025	 */
5026	if (ifp->if_start_thread != THREAD_NULL) {
5027		lck_mtx_lock_spin(&ifp->if_start_lock);
5028		ifp->if_start_thread = THREAD_NULL;
5029		wakeup_one((caddr_t)&ifp->if_start_thread);
5030		lck_mtx_unlock(&ifp->if_start_lock);
5031	}
5032
5033	/*
5034	 * Signal the poller thread to terminate itself.
5035	 */
5036	if (ifp->if_poll_thread != THREAD_NULL) {
5037		lck_mtx_lock_spin(&ifp->if_poll_lock);
5038		ifp->if_poll_thread = THREAD_NULL;
5039		wakeup_one((caddr_t)&ifp->if_poll_thread);
5040		lck_mtx_unlock(&ifp->if_poll_lock);
5041	}
5042
5043	/*
5044	 * If thread affinity was set for the workloop thread, we will need
5045	 * to tear down the affinity and release the extra reference count
5046	 * taken at attach time.  Does not apply to lo0 or other interfaces
5047	 * without dedicated input threads.
5048	 */
5049	if ((inp = ifp->if_inp) != NULL) {
5050		VERIFY(inp != dlil_main_input_thread);
5051
5052		if (inp->net_affinity) {
5053			struct thread *tp, *wtp, *ptp;
5054
5055			lck_mtx_lock_spin(&inp->input_lck);
5056			wtp = inp->wloop_thr;
5057			inp->wloop_thr = THREAD_NULL;
5058			ptp = inp->poll_thr;
5059			inp->poll_thr = THREAD_NULL;
5060			tp = inp->input_thr;	/* don't nullify now */
5061			inp->tag = 0;
5062			inp->net_affinity = FALSE;
5063			lck_mtx_unlock(&inp->input_lck);
5064
5065			/* Tear down poll thread affinity */
5066			if (ptp != NULL) {
5067				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
5068				(void) dlil_affinity_set(ptp,
5069				    THREAD_AFFINITY_TAG_NULL);
5070				thread_deallocate(ptp);
5071			}
5072
5073			/* Tear down workloop thread affinity */
5074			if (wtp != NULL) {
5075				(void) dlil_affinity_set(wtp,
5076				    THREAD_AFFINITY_TAG_NULL);
5077				thread_deallocate(wtp);
5078			}
5079
5080			/* Tear down DLIL input thread affinity */
5081			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
5082			thread_deallocate(tp);
5083		}
5084
5085		/* disassociate ifp DLIL input thread */
5086		ifp->if_inp = NULL;
5087
5088		lck_mtx_lock_spin(&inp->input_lck);
5089		inp->input_waiting |= DLIL_INPUT_TERMINATE;
5090		if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) {
5091			wakeup_one((caddr_t)&inp->input_waiting);
5092		}
5093		lck_mtx_unlock(&inp->input_lck);
5094	}
5095
5096	/* The driver might unload, so point these to ourselves */
5097	if_free = ifp->if_free;
5098	ifp->if_output = ifp_if_output;
5099	ifp->if_pre_enqueue = ifp_if_output;
5100	ifp->if_start = ifp_if_start;
5101	ifp->if_output_ctl = ifp_if_ctl;
5102	ifp->if_input_poll = ifp_if_input_poll;
5103	ifp->if_input_ctl = ifp_if_ctl;
5104	ifp->if_ioctl = ifp_if_ioctl;
5105	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
5106	ifp->if_free = ifp_if_free;
5107	ifp->if_demux = ifp_if_demux;
5108	ifp->if_event = ifp_if_event;
5109	ifp->if_framer = ifp_if_framer;
5110	ifp->if_add_proto = ifp_if_add_proto;
5111	ifp->if_del_proto = ifp_if_del_proto;
5112	ifp->if_check_multi = ifp_if_check_multi;
5113
5114	/* wipe out interface description */
5115	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
5116	ifp->if_desc.ifd_len = 0;
5117	VERIFY(ifp->if_desc.ifd_desc != NULL);
5118	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
5119
5120	ifnet_lock_done(ifp);
5121
5122#if PF
5123	/*
5124	 * Detach this interface from packet filter, if enabled.
5125	 */
5126	pf_ifnet_hook(ifp, 0);
5127#endif /* PF */
5128
5129	/* Filter list should be empty */
5130	lck_mtx_lock_spin(&ifp->if_flt_lock);
5131	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
5132	VERIFY(ifp->if_flt_busy == 0);
5133	VERIFY(ifp->if_flt_waiters == 0);
5134	lck_mtx_unlock(&ifp->if_flt_lock);
5135
5136	/* Last chance to drain send queue */
5137	if_qflush(ifp, 0);
5138
5139	/* Last chance to cleanup any cached route */
5140	lck_mtx_lock(&ifp->if_cached_route_lock);
5141	VERIFY(!ifp->if_fwd_cacheok);
5142	if (ifp->if_fwd_route.ro_rt != NULL)
5143		rtfree(ifp->if_fwd_route.ro_rt);
5144	bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route));
5145	if (ifp->if_src_route.ro_rt != NULL)
5146		rtfree(ifp->if_src_route.ro_rt);
5147	bzero(&ifp->if_src_route, sizeof (ifp->if_src_route));
5148	if (ifp->if_src_route6.ro_rt != NULL)
5149		rtfree(ifp->if_src_route6.ro_rt);
5150	bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6));
5151	lck_mtx_unlock(&ifp->if_cached_route_lock);
5152
5153	ifnet_llreach_ifdetach(ifp);
5154
5155	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0);
5156
5157	if (if_free != NULL)
5158		if_free(ifp);
5159
5160	/*
5161	 * Finally, mark this ifnet as detached.
5162	 */
5163	lck_mtx_lock_spin(&ifp->if_ref_lock);
5164	if (!(ifp->if_refflags & IFRF_DETACHING)) {
5165		panic("%s: flags mismatch (detaching not set) ifp=%p",
5166		    __func__, ifp);
5167		/* NOTREACHED */
5168	}
5169	ifp->if_refflags &= ~IFRF_DETACHING;
5170	lck_mtx_unlock(&ifp->if_ref_lock);
5171
5172	if (dlil_verbose)
5173		printf("%s%d: detached\n", ifp->if_name, ifp->if_unit);
5174
5175	/* Release reference held during ifnet attach */
5176	ifnet_release(ifp);
5177}
5178
5179static errno_t
5180ifp_if_output(struct ifnet *ifp, struct mbuf *m)
5181{
5182#pragma unused(ifp)
5183	m_freem(m);
5184	return (0);
5185}
5186
5187static void
5188ifp_if_start(struct ifnet *ifp)
5189{
5190	ifnet_purge(ifp);
5191}
5192
5193static void
5194ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
5195    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
5196{
5197#pragma unused(ifp, flags, max_cnt)
5198	if (m_head != NULL)
5199		*m_head = NULL;
5200	if (m_tail != NULL)
5201		*m_tail = NULL;
5202	if (cnt != NULL)
5203		*cnt = 0;
5204	if (len != NULL)
5205		*len = 0;
5206}
5207
5208static errno_t
5209ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
5210{
5211#pragma unused(ifp, cmd, arglen, arg)
5212	return (EOPNOTSUPP);
5213}
5214
5215static errno_t
5216ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
5217{
5218#pragma unused(ifp, fh, pf)
5219	m_freem(m);
5220	return (EJUSTRETURN);
5221}
5222
5223static errno_t
5224ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
5225    const struct ifnet_demux_desc *da, u_int32_t dc)
5226{
5227#pragma unused(ifp, pf, da, dc)
5228	return (EINVAL);
5229}
5230
5231static errno_t
5232ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
5233{
5234#pragma unused(ifp, pf)
5235	return (EINVAL);
5236}
5237
5238static errno_t
5239ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
5240{
5241#pragma unused(ifp, sa)
5242	return (EOPNOTSUPP);
5243}
5244
5245static errno_t ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
5246const struct sockaddr *sa, const char *ll, const char *t
5247#if CONFIG_EMBEDDED
5248		,
5249		u_int32_t *pre, u_int32_t *post
5250#endif /* CONFIG_EMBEDDED */
5251							 )
5252{
5253#pragma unused(ifp, m, sa, ll, t)
5254	m_freem(*m);
5255	*m = NULL;
5256#if CONFIG_EMBEDDED
5257	*pre = 0;
5258	*post = 0;
5259#endif /* CONFIG_EMBEDDED */
5260	return (EJUSTRETURN);
5261}
5262
5263errno_t
5264ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
5265{
5266#pragma unused(ifp, cmd, arg)
5267	return (EOPNOTSUPP);
5268}
5269
5270static errno_t
5271ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
5272{
5273#pragma unused(ifp, tm, f)
5274	/* XXX not sure what to do here */
5275	return (0);
5276}
5277
5278static void
5279ifp_if_free(struct ifnet *ifp)
5280{
5281#pragma unused(ifp)
5282}
5283
5284static void
5285ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
5286{
5287#pragma unused(ifp, e)
5288}
5289
5290__private_extern__
5291int dlil_if_acquire(u_int32_t family, const void *uniqueid,
5292    size_t uniqueid_len, struct ifnet **ifp)
5293{
5294	struct ifnet *ifp1 = NULL;
5295	struct dlil_ifnet *dlifp1 = NULL;
5296	void *buf, *base, **pbuf;
5297	int ret = 0;
5298
5299	dlil_if_lock();
5300	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
5301		ifp1 = (struct ifnet *)dlifp1;
5302
5303		if (ifp1->if_family != family)
5304			continue;
5305
5306		lck_mtx_lock(&dlifp1->dl_if_lock);
5307		/* same uniqueid and same len or no unique id specified */
5308		if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) &&
5309		    !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) {
5310			/* check for matching interface in use */
5311			if (dlifp1->dl_if_flags & DLIF_INUSE) {
5312				if (uniqueid_len) {
5313					ret = EBUSY;
5314					lck_mtx_unlock(&dlifp1->dl_if_lock);
5315					goto end;
5316				}
5317			} else {
5318				dlifp1->dl_if_flags |= (DLIF_INUSE|DLIF_REUSE);
5319				lck_mtx_unlock(&dlifp1->dl_if_lock);
5320				*ifp = ifp1;
5321				goto end;
5322			}
5323		}
5324		lck_mtx_unlock(&dlifp1->dl_if_lock);
5325	}
5326
5327	/* no interface found, allocate a new one */
5328	buf = zalloc(dlif_zone);
5329	if (buf == NULL) {
5330		ret = ENOMEM;
5331		goto end;
5332	}
5333	bzero(buf, dlif_bufsize);
5334
5335	/* Get the 64-bit aligned base address for this object */
5336	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
5337	    sizeof (u_int64_t));
5338	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
5339
5340	/*
5341	 * Wind back a pointer size from the aligned base and
5342	 * save the original address so we can free it later.
5343	 */
5344	pbuf = (void **)((intptr_t)base - sizeof (void *));
5345	*pbuf = buf;
5346	dlifp1 = base;
5347
5348	if (uniqueid_len) {
5349		MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len,
5350		    M_NKE, M_WAITOK);
5351		if (dlifp1->dl_if_uniqueid == NULL) {
5352			zfree(dlif_zone, dlifp1);
5353			ret = ENOMEM;
5354			goto end;
5355		}
5356		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
5357		dlifp1->dl_if_uniqueid_len = uniqueid_len;
5358	}
5359
5360	ifp1 = (struct ifnet *)dlifp1;
5361	dlifp1->dl_if_flags = DLIF_INUSE;
5362	if (ifnet_debug) {
5363		dlifp1->dl_if_flags |= DLIF_DEBUG;
5364		dlifp1->dl_if_trace = dlil_if_trace;
5365	}
5366	ifp1->if_name = dlifp1->dl_if_namestorage;
5367
5368	/* initialize interface description */
5369	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
5370	ifp1->if_desc.ifd_len = 0;
5371	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
5372
5373#if CONFIG_MACF_NET
5374	mac_ifnet_label_init(ifp1);
5375#endif
5376
5377	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
5378		DLIL_PRINTF("%s: failed to allocate if local stats, "
5379		    "error: %d\n", __func__, ret);
5380		/* This probably shouldn't be fatal */
5381		ret = 0;
5382	}
5383
5384	lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr);
5385	lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr);
5386	lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr);
5387	lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr);
5388	lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group,
5389	    ifnet_lock_attr);
5390	lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr);
5391
5392	/* for send data paths */
5393	lck_mtx_init(&ifp1->if_start_lock, ifnet_snd_lock_group,
5394	    ifnet_lock_attr);
5395	lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_snd_lock_group,
5396	    ifnet_lock_attr);
5397	lck_mtx_init(&ifp1->if_snd.ifcq_lock, ifnet_snd_lock_group,
5398	    ifnet_lock_attr);
5399
5400	/* for receive data paths */
5401	lck_mtx_init(&ifp1->if_poll_lock, ifnet_rcv_lock_group,
5402	    ifnet_lock_attr);
5403
5404	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
5405
5406	*ifp = ifp1;
5407
5408end:
5409	dlil_if_unlock();
5410
5411	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof (u_int64_t)) &&
5412	    IS_P2ALIGNED(&ifp1->if_data, sizeof (u_int64_t))));
5413
5414	return (ret);
5415}
5416
5417__private_extern__ void
5418dlil_if_release(ifnet_t	ifp)
5419{
5420	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;
5421
5422	ifnet_lock_exclusive(ifp);
5423	lck_mtx_lock(&dlifp->dl_if_lock);
5424	dlifp->dl_if_flags &= ~DLIF_INUSE;
5425	strncpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
5426	ifp->if_name = dlifp->dl_if_namestorage;
5427	lck_mtx_unlock(&dlifp->dl_if_lock);
5428#if CONFIG_MACF_NET
5429	/*
5430	* We can either recycle the MAC label here or in dlil_if_acquire().
5431	* It seems logical to do it here but this means that anything that
5432	* still has a handle on ifp will now see it as unlabeled.
5433	* Since the interface is "dead" that may be OK.  Revisit later.
5434	*/
5435	mac_ifnet_label_recycle(ifp);
5436#endif
5437	ifnet_lock_done(ifp);
5438}
5439
5440__private_extern__ void
5441dlil_if_lock(void)
5442{
5443	lck_mtx_lock(&dlil_ifnet_lock);
5444}
5445
5446__private_extern__ void
5447dlil_if_unlock(void)
5448{
5449	lck_mtx_unlock(&dlil_ifnet_lock);
5450}
5451
5452__private_extern__ void
5453dlil_if_lock_assert(void)
5454{
5455	lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
5456}
5457
5458__private_extern__ void
5459dlil_proto_unplumb_all(struct ifnet *ifp)
5460{
5461	/*
5462	 * if_proto_hash[0-3] are for PF_INET, PF_INET6, PF_APPLETALK
5463	 * and PF_VLAN, where each bucket contains exactly one entry;
5464	 * PF_VLAN does not need an explicit unplumb.
5465	 *
5466	 * if_proto_hash[4] is for other protocols; we expect anything
5467	 * in this bucket to respond to the DETACHING event (which would
5468	 * have happened by now) and do the unplumb then.
5469	 */
5470	(void) proto_unplumb(PF_INET, ifp);
5471#if INET6
5472	(void) proto_unplumb(PF_INET6, ifp);
5473#endif /* INET6 */
5474#if NETAT
5475	(void) proto_unplumb(PF_APPLETALK, ifp);
5476#endif /* NETAT */
5477}
5478
5479static void
5480ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
5481{
5482	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
5483	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
5484
5485	route_copyout(dst, &ifp->if_src_route, sizeof (*dst));
5486
5487	lck_mtx_unlock(&ifp->if_cached_route_lock);
5488}
5489
5490static void
5491ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
5492{
5493	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
5494	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
5495
5496	if (ifp->if_fwd_cacheok) {
5497		route_copyin(src, &ifp->if_src_route, sizeof (*src));
5498	} else {
5499		rtfree(src->ro_rt);
5500		src->ro_rt = NULL;
5501	}
5502	lck_mtx_unlock(&ifp->if_cached_route_lock);
5503}
5504
5505#if INET6
5506static void
5507ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
5508{
5509	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
5510	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
5511
5512	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
5513	    sizeof (*dst));
5514
5515	lck_mtx_unlock(&ifp->if_cached_route_lock);
5516}
5517
5518static void
5519ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
5520{
5521	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
5522	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
5523
5524	if (ifp->if_fwd_cacheok) {
5525		route_copyin((struct route *)src,
5526		    (struct route *)&ifp->if_src_route6, sizeof (*src));
5527	} else {
5528		rtfree(src->ro_rt);
5529		src->ro_rt = NULL;
5530	}
5531	lck_mtx_unlock(&ifp->if_cached_route_lock);
5532}
5533#endif /* INET6 */
5534
5535struct rtentry *
5536ifnet_cached_rtlookup_inet(struct ifnet	*ifp, struct in_addr src_ip)
5537{
5538	struct route		src_rt;
5539	struct sockaddr_in	*dst;
5540
5541	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);
5542
5543	ifp_src_route_copyout(ifp, &src_rt);
5544
5545	if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) ||
5546	    src_ip.s_addr != dst->sin_addr.s_addr ||
5547	    src_rt.ro_rt->generation_id != route_generation) {
5548		if (src_rt.ro_rt != NULL) {
5549			rtfree(src_rt.ro_rt);
5550			src_rt.ro_rt = NULL;
5551		} else if (dst->sin_family != AF_INET) {
5552			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
5553			dst->sin_len = sizeof (src_rt.ro_dst);
5554			dst->sin_family = AF_INET;
5555		}
5556		dst->sin_addr = src_ip;
5557
5558		if (src_rt.ro_rt == NULL) {
5559			src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
5560			    0, 0, ifp->if_index);
5561
5562			if (src_rt.ro_rt != NULL) {
5563				/* retain a ref, copyin consumes one */
5564				struct rtentry	*rte = src_rt.ro_rt;
5565				RT_ADDREF(rte);
5566				ifp_src_route_copyin(ifp, &src_rt);
5567				src_rt.ro_rt = rte;
5568			}
5569		}
5570	}
5571
5572	return (src_rt.ro_rt);
5573}
5574
5575#if INET6
5576struct rtentry*
5577ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
5578{
5579	struct route_in6 src_rt;
5580
5581	ifp_src_route6_copyout(ifp, &src_rt);
5582
5583	if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) ||
5584	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr) ||
5585	    src_rt.ro_rt->generation_id != route_generation) {
5586		if (src_rt.ro_rt != NULL) {
5587			rtfree(src_rt.ro_rt);
5588			src_rt.ro_rt = NULL;
5589		} else if (src_rt.ro_dst.sin6_family != AF_INET6) {
5590			bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst));
5591			src_rt.ro_dst.sin6_len = sizeof (src_rt.ro_dst);
5592			src_rt.ro_dst.sin6_family = AF_INET6;
5593		}
5594		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
5595		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
5596		    sizeof (src_rt.ro_dst.sin6_addr));
5597
5598		if (src_rt.ro_rt == NULL) {
5599			src_rt.ro_rt = rtalloc1_scoped(
5600			    (struct sockaddr *)&src_rt.ro_dst, 0, 0,
5601			    ifp->if_index);
5602
5603			if (src_rt.ro_rt != NULL) {
5604				/* retain a ref, copyin consumes one */
5605				struct rtentry	*rte = src_rt.ro_rt;
5606				RT_ADDREF(rte);
5607				ifp_src_route6_copyin(ifp, &src_rt);
5608				src_rt.ro_rt = rte;
5609			}
5610		}
5611	}
5612
5613	return (src_rt.ro_rt);
5614}
5615#endif /* INET6 */
5616
5617void
5618if_lqm_update(struct ifnet *ifp, int lqm)
5619{
5620	struct kev_dl_link_quality_metric_data ev_lqm_data;
5621
5622	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
5623
5624	/* Normalize to edge */
5625	if (lqm > IFNET_LQM_THRESH_UNKNOWN && lqm <= IFNET_LQM_THRESH_POOR)
5626		lqm = IFNET_LQM_THRESH_POOR;
5627	else if (lqm > IFNET_LQM_THRESH_POOR && lqm <= IFNET_LQM_THRESH_GOOD)
5628		lqm = IFNET_LQM_THRESH_GOOD;
5629
5630	ifnet_lock_exclusive(ifp);
5631	if (lqm == ifp->if_lqm) {
5632		ifnet_lock_done(ifp);
5633		return;		/* nothing to update */
5634	}
5635	ifp->if_lqm = lqm;
5636	ifnet_lock_done(ifp);
5637
5638	bzero(&ev_lqm_data, sizeof (ev_lqm_data));
5639	ev_lqm_data.link_quality_metric = lqm;
5640
5641	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
5642	    (struct net_event_data *)&ev_lqm_data, sizeof (ev_lqm_data));
5643}
5644
5645/* for uuid.c */
5646int
5647uuid_get_ethernet(u_int8_t *node)
5648{
5649	struct ifnet *ifp;
5650	struct sockaddr_dl *sdl;
5651
5652	ifnet_head_lock_shared();
5653	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
5654		ifnet_lock_shared(ifp);
5655		IFA_LOCK_SPIN(ifp->if_lladdr);
5656		sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr;
5657		if (sdl->sdl_type == IFT_ETHER) {
5658			memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN);
5659			IFA_UNLOCK(ifp->if_lladdr);
5660			ifnet_lock_done(ifp);
5661			ifnet_head_done();
5662			return (0);
5663		}
5664		IFA_UNLOCK(ifp->if_lladdr);
5665		ifnet_lock_done(ifp);
5666	}
5667	ifnet_head_done();
5668
5669	return (-1);
5670}
5671
5672static int
5673sysctl_rxpoll SYSCTL_HANDLER_ARGS
5674{
5675#pragma unused(arg1, arg2)
5676	int i, err;
5677
5678	i = if_rxpoll;
5679
5680	err = sysctl_handle_int(oidp, &i, 0, req);
5681	if (err != 0 || req->newptr == USER_ADDR_NULL)
5682		return (err);
5683
5684	if (net_rxpoll == 0)
5685		return (ENXIO);
5686
5687	if_rxpoll = i;
5688	return (err);
5689}
5690
5691static int
5692sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
5693{
5694#pragma unused(arg1, arg2)
5695	int i, err;
5696
5697	i = if_sndq_maxlen;
5698
5699	err = sysctl_handle_int(oidp, &i, 0, req);
5700	if (err != 0 || req->newptr == USER_ADDR_NULL)
5701		return (err);
5702
5703	if (i < IF_SNDQ_MINLEN)
5704		i = IF_SNDQ_MINLEN;
5705
5706	if_sndq_maxlen = i;
5707	return (err);
5708}
5709
5710static int
5711sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
5712{
5713#pragma unused(arg1, arg2)
5714	int i, err;
5715
5716	i = if_rcvq_maxlen;
5717
5718	err = sysctl_handle_int(oidp, &i, 0, req);
5719	if (err != 0 || req->newptr == USER_ADDR_NULL)
5720		return (err);
5721
5722	if (i < IF_RCVQ_MINLEN)
5723		i = IF_RCVQ_MINLEN;
5724
5725	if_rcvq_maxlen = i;
5726	return (err);
5727}
5728
5729void
5730ifnet_fclist_append(struct sfb *sp, struct sfb_fc_list *fcl)
5731{
5732	struct sfb_bin_fcentry *fce, *tfce;
5733
5734	lck_mtx_lock_spin(&ifnet_fclist_lock);
5735
5736	SLIST_FOREACH_SAFE(fce, fcl, fce_link, tfce) {
5737		SLIST_REMOVE(fcl, fce, sfb_bin_fcentry, fce_link);
5738		SLIST_INSERT_HEAD(&ifnet_fclist, fce, fce_link);
5739		sp->sfb_stats.flow_feedback++;
5740	}
5741	VERIFY(SLIST_EMPTY(fcl) && !SLIST_EMPTY(&ifnet_fclist));
5742
5743	wakeup(&ifnet_fclist);
5744
5745	lck_mtx_unlock(&ifnet_fclist_lock);
5746}
5747
5748struct sfb_bin_fcentry *
5749ifnet_fce_alloc(int how)
5750{
5751	struct sfb_bin_fcentry *fce;
5752
5753	fce = (how == M_WAITOK) ? zalloc(ifnet_fcezone) :
5754	    zalloc_noblock(ifnet_fcezone);
5755	if (fce != NULL)
5756		bzero(fce, ifnet_fcezone_size);
5757
5758	return (fce);
5759}
5760
5761void
5762ifnet_fce_free(struct sfb_bin_fcentry *fce)
5763{
5764	zfree(ifnet_fcezone, fce);
5765}
5766
5767static void
5768ifnet_fc_init(void)
5769{
5770	thread_t thread = THREAD_NULL;
5771
5772	SLIST_INIT(&ifnet_fclist);
5773	lck_mtx_init(&ifnet_fclist_lock, ifnet_snd_lock_group, NULL);
5774
5775	ifnet_fcezone_size = P2ROUNDUP(sizeof (struct sfb_bin_fcentry),
5776	    sizeof (u_int64_t));
5777	ifnet_fcezone = zinit(ifnet_fcezone_size,
5778	    IFNET_FCEZONE_MAX * ifnet_fcezone_size, 0, IFNET_FCEZONE_NAME);
5779	if (ifnet_fcezone == NULL) {
5780		panic("%s: failed allocating %s", __func__, IFNET_FCEZONE_NAME);
5781		/* NOTREACHED */
5782	}
5783	zone_change(ifnet_fcezone, Z_EXPAND, TRUE);
5784	zone_change(ifnet_fcezone, Z_CALLERACCT, FALSE);
5785
5786	if (kernel_thread_start(ifnet_fc_thread_func,
5787	    NULL, &thread) != KERN_SUCCESS) {
5788		panic("%s: couldn't create flow event advisory thread",
5789		    __func__);
5790		/* NOTREACHED */
5791	}
5792	thread_deallocate(thread);
5793}
5794
5795static int
5796ifnet_fc_thread_cont(int err)
5797{
5798#pragma unused(err)
5799	struct sfb_bin_fcentry *fce;
5800	struct inp_fc_entry *infc;
5801
5802	for (;;) {
5803		lck_mtx_assert(&ifnet_fclist_lock, LCK_MTX_ASSERT_OWNED);
5804		while (SLIST_EMPTY(&ifnet_fclist)) {
5805			(void) msleep0(&ifnet_fclist, &ifnet_fclist_lock,
5806			    (PSOCK | PSPIN), "ifnet_fc_cont", 0,
5807			    ifnet_fc_thread_cont);
5808			/* NOTREACHED */
5809		}
5810
5811		fce = SLIST_FIRST(&ifnet_fclist);
5812		SLIST_REMOVE(&ifnet_fclist, fce, sfb_bin_fcentry, fce_link);
5813		SLIST_NEXT(fce, fce_link) = NULL;
5814		lck_mtx_unlock(&ifnet_fclist_lock);
5815
5816		infc = inp_fc_getinp(fce->fce_flowhash);
5817		if (infc == NULL) {
5818			ifnet_fce_free(fce);
5819			lck_mtx_lock_spin(&ifnet_fclist_lock);
5820			continue;
5821		}
5822		VERIFY(infc->infc_inp != NULL);
5823
5824		inp_fc_feedback(infc->infc_inp);
5825
5826		inp_fc_entry_free(infc);
5827		ifnet_fce_free(fce);
5828		lck_mtx_lock_spin(&ifnet_fclist_lock);
5829	}
5830}
5831
5832static void
5833ifnet_fc_thread_func(void *v, wait_result_t w)
5834{
5835#pragma unused(v, w)
5836	lck_mtx_lock(&ifnet_fclist_lock);
5837	(void) msleep0(&ifnet_fclist, &ifnet_fclist_lock,
5838	    (PSOCK | PSPIN), "ifnet_fc", 0, ifnet_fc_thread_cont);
5839	/*
5840	 * msleep0() shouldn't have returned as PCATCH was not set;
5841	 * therefore assert in this case.
5842	 */
5843	lck_mtx_unlock(&ifnet_fclist_lock);
5844	VERIFY(0);
5845}
5846
5847void
5848dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
5849    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
5850{
5851	struct kev_dl_node_presence kev;
5852	struct sockaddr_dl *sdl;
5853	struct sockaddr_in6 *sin6;
5854
5855	VERIFY(ifp);
5856	VERIFY(sa);
5857	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
5858
5859	bzero(&kev, sizeof (kev));
5860	sin6 = &kev.sin6_node_address;
5861	sdl = &kev.sdl_node_address;
5862	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
5863	kev.rssi = rssi;
5864	kev.link_quality_metric = lqm;
5865	kev.node_proximity_metric = npm;
5866	bcopy(srvinfo, kev.node_service_info, sizeof (kev.node_service_info));
5867
5868	nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
5869	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
5870	    &kev.link_data, sizeof (kev));
5871}
5872
5873void
5874dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
5875{
5876	struct kev_dl_node_absence kev;
5877	struct sockaddr_in6 *sin6;
5878	struct sockaddr_dl *sdl;
5879
5880	VERIFY(ifp);
5881	VERIFY(sa);
5882	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
5883
5884	bzero(&kev, sizeof (kev));
5885	sin6 = &kev.sin6_node_address;
5886	sdl = &kev.sdl_node_address;
5887	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
5888
5889	nd6_alt_node_absent(ifp, sin6);
5890	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
5891	    &kev.link_data, sizeof (kev));
5892}
5893
5894errno_t
5895ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
5896    struct proc *p)
5897{
5898	u_int32_t level = IFNET_THROTTLE_OFF;
5899	errno_t result = 0;
5900
5901	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
5902
5903	if (cmd == SIOCSIFOPPORTUNISTIC) {
5904		/*
5905		 * XXX: Use priv_check_cred() instead of root check?
5906		 */
5907		if ((result = proc_suser(p)) != 0)
5908			return (result);
5909
5910		if (ifr->ifr_opportunistic.ifo_flags ==
5911		    IFRIFOF_BLOCK_OPPORTUNISTIC)
5912			level = IFNET_THROTTLE_OPPORTUNISTIC;
5913		else if (ifr->ifr_opportunistic.ifo_flags == 0)
5914			level = IFNET_THROTTLE_OFF;
5915		else
5916			result = EINVAL;
5917
5918		if (result == 0)
5919			result = ifnet_set_throttle(ifp, level);
5920	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
5921		ifr->ifr_opportunistic.ifo_flags = 0;
5922		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
5923			ifr->ifr_opportunistic.ifo_flags |=
5924			    IFRIFOF_BLOCK_OPPORTUNISTIC;
5925		}
5926	}
5927
5928	/*
5929	 * Return the count of current opportunistic connections
5930	 * over the interface.
5931	 */
5932	if (result == 0) {
5933		uint32_t flags = 0;
5934		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
5935			INPCB_OPPORTUNISTIC_SETCMD : 0;
5936		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
5937			INPCB_OPPORTUNISTIC_THROTTLEON : 0;
5938		ifr->ifr_opportunistic.ifo_inuse =
5939		    udp_count_opportunistic(ifp->if_index, flags) +
5940		    tcp_count_opportunistic(ifp->if_index, flags);
5941	}
5942
5943	if (result == EALREADY)
5944		result = 0;
5945
5946	return (result);
5947}
5948
5949int
5950ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
5951{
5952	struct ifclassq *ifq;
5953	int err = 0;
5954
5955	if (!(ifp->if_eflags & IFEF_TXSTART))
5956		return (ENXIO);
5957
5958	*level = IFNET_THROTTLE_OFF;
5959
5960	ifq = &ifp->if_snd;
5961	IFCQ_LOCK(ifq);
5962	/* Throttling works only for IFCQ, not ALTQ instances */
5963	if (IFCQ_IS_ENABLED(ifq))
5964		IFCQ_GET_THROTTLE(ifq, *level, err);
5965	IFCQ_UNLOCK(ifq);
5966
5967	return (err);
5968}
5969
5970int
5971ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
5972{
5973	struct ifclassq *ifq;
5974	int err = 0;
5975
5976	if (!(ifp->if_eflags & IFEF_TXSTART))
5977		return (ENXIO);
5978
5979	switch (level) {
5980	case IFNET_THROTTLE_OFF:
5981	case IFNET_THROTTLE_OPPORTUNISTIC:
5982#if PF_ALTQ
5983		/* Throttling works only for IFCQ, not ALTQ instances */
5984		if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq)))
5985			return (ENXIO);
5986#endif /* PF_ALTQ */
5987		break;
5988	default:
5989		return (EINVAL);
5990	}
5991
5992	ifq = &ifp->if_snd;
5993	IFCQ_LOCK(ifq);
5994	if (IFCQ_IS_ENABLED(ifq))
5995		IFCQ_SET_THROTTLE(ifq, level, err);
5996	IFCQ_UNLOCK(ifq);
5997
5998	if (err == 0) {
5999		printf("%s%d: throttling level set to %d\n", ifp->if_name,
6000		    ifp->if_unit, level);
6001		if (level == IFNET_THROTTLE_OFF)
6002			ifnet_start(ifp);
6003	}
6004
6005	return (err);
6006}
6007